TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 * This is the specialization of the reduction operation for SYCL. A two-phase
 * reduction is used because the GPU has no global synchronization of global
 * memory across different work-groups/thread-blocks. Two kernels are therefore
 * launched: in the first phase each work-group/thread-block reduces its part
 * of the data locally and writes its partial result to global memory. In the
 * second phase (global reduction) a single work-group/thread-block reduces
 * those intermediate results into one element.
 * An NVIDIA presentation explaining the optimized two-phase reduction
 * algorithm on GPUs:
 * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/

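// Illustrative sketch (not part of Eigen): a host-side, serial analogue of the
// two-phase reduction described above, using hypothetical helper names. Phase 1
// gives every simulated work-group one chunk and stores its partial result in an
// intermediate buffer; phase 2 has a single group fold those partials into one value.
#if 0  // sketch only, never compiled with this header
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Phase 1 helper: one "work-group" reduces its chunk to a single partial sum.
static float reduce_chunk(const std::vector<float> &in, std::size_t begin, std::size_t end) {
  float acc = 0.f;
  for (std::size_t i = begin; i < end; ++i) acc += in[i];
  return acc;
}

static float two_phase_sum(const std::vector<float> &in, std::size_t num_groups) {
  const std::size_t chunk = (in.size() + num_groups - 1) / num_groups;
  std::vector<float> partials(num_groups, 0.f);
  // Phase 1: each group writes its local result to the intermediate buffer.
  for (std::size_t g = 0; g < num_groups; ++g)
    partials[g] = reduce_chunk(in, std::min(in.size(), g * chunk), std::min(in.size(), (g + 1) * chunk));
  // Phase 2: one group reduces the intermediate results to a single element.
  return std::accumulate(partials.begin(), partials.end(), 0.f);
}
#endif
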
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {

template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};
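
// The specializations above rewrite a mean reduction as a sum reduction whose
// accumulator is divided by the element count exactly once, in finalise_op.
// A minimal host-side analogue of that idea, with hypothetical names:
#if 0  // sketch only, never compiled with this header
#include <vector>
static float mean_as_sum_then_divide(const std::vector<float> &v) {
  float acc = 0.f;                            // SumReducer-style accumulation
  for (float x : v) acc += x;
  return acc / static_cast<float>(v.size());  // finalise_op(accumulator, scale)
}
#endif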

template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    // Our empirical research shows that the best performance is achieved when
    // each thread has exactly one element to reduce in this second step; the
    // second-step reduction time is then almost negligible. Hence the input
    // size of the second step is fixed to the local size, so each thread reads
    // exactly one element. The algorithm must be changed if the number of
    // elements reduced per thread in the second step ever exceeds 1; otherwise
    // the result will be wrong.
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI.get_pointer() + localid;
    auto aOutPtr = outAcc.get_pointer();
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};
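
// A serial sketch of the offset-halving tree reduction performed by the kernel
// above, for a power-of-two "local range". In the kernel every iteration is
// separated by a work-group barrier; names below are hypothetical.
#if 0  // sketch only, never compiled with this header
#include <cstddef>
#include <vector>
static float tree_reduce(std::vector<float> scratch) {  // scratch.size() must be a power of two
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2)
    for (std::size_t localid = 0; localid < offset; ++localid)
      scratch[localid] += scratch[localid + offset];
  return scratch[0];
}
#endif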

// Full reduction, first phase. In this version vectorization is enabled and the
// reduction accepts any generic reducer Op, e.g. max, min, sum, mean, iamax, iamin, etc.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef
      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
                                              PacketReturnType, CoeffReturnType>::type OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable parts
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // non vectorizable parts
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // Reduction step: the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    // vectorizable parts
    CoeffReturnType accumulator = op.initialize();
    // non vectorizable parts
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // Reduction step: the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};
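
// Sketch of the access pattern used by the vectorized compute_reduction above:
// a grid-stride main loop over whole packets followed by a scalar tail for the
// remainder. Plain serial C++ with an assumed packet size of 4; all names are
// hypothetical.
#if 0  // sketch only, never compiled with this header
#include <cstddef>
#include <vector>
static float strided_packet_sum(const std::vector<float> &in, std::size_t global_id, std::size_t global_range) {
  constexpr std::size_t kPacketSize = 4;
  const std::size_t vectorized_range = (in.size() / kPacketSize) * kPacketSize;
  float acc = 0.f;
  // Vectorizable part: each "thread" starts at its own packet and strides by
  // kPacketSize * global_range, mirroring the kernel's start/step computation.
  for (std::size_t i = kPacketSize * global_id; i < vectorized_range; i += kPacketSize * global_range)
    for (std::size_t j = 0; j < kPacketSize; ++j) acc += in[i + j];
  // Non-vectorizable tail: the remaining coefficients, one at a time.
  for (std::size_t i = vectorized_range + global_id; i < in.size(); i += global_range) acc += in[i];
  return acc;
}
#endif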

template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    auto output_accessor_ptr = output_accessor.get_pointer();
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

enum class reduction_dim { inner_most, outer_most };
// default is preserver
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    auto scratchPtr = scratch.get_pointer().get();
    auto outPtr =
        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    /* Apply the reduction operation between the current local
     * id and the one on the other half of the vector. */
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // LocalThreadSizeR is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // For the mean reducer the result has already been divided in the
        // previous reduction step, so no further division is needed.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no co-ordination between
       * work-groups, only work-items). */
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};
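
// Sketch of the flat-index mapping used by element_wise_reduce above, reducing
// the R dimension of a P x R (preserve x reduce) problem serially for both
// reduction_dim cases. Hypothetical host-only code.
#if 0  // sketch only, never compiled with this header
#include <cstddef>
#include <vector>
// outer_most: element (p, r) lives at p + r * P (the reduced dim has the larger stride).
// inner_most: element (p, r) lives at r + p * R (the reduced dim is contiguous).
static std::vector<float> partial_sum(const std::vector<float> &in, std::size_t P, std::size_t R, bool outer_most) {
  std::vector<float> out(P, 0.f);
  for (std::size_t p = 0; p < P; ++p)
    for (std::size_t r = 0; r < R; ++r) out[p] += in[outer_most ? p + r * P : r + p * R];
  return out;
}
#endif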

template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor.get_pointer() + globalId;

    OutScalar accumulator = op.initialize();
    // num_coeffs_to_reduce is no bigger than 256
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
  }
};

template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};

template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    // getPowerOfTwo makes sure the local range is a power of 2 and
    // <= maxSyclThreadPerBlock; this helps us avoid an extra check in
    // the kernel.
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // Here we force the code to be at most a two-step reduction: our empirical
    // research shows that if each thread reduces at least 64 elements
    // individually we get better performance, although this can vary across
    // platforms. It also shows that, for the inner_most dim reducer, it is
    // better to have 8 groups in the reduce dimension for sizes > 1024 to
    // achieve the best performance.
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);

      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;

      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
          temp_accessor, output,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
          reducer, num_coeffs_to_preserve, rNumGroups);

      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);
    }
    return false;
  }
};
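
// Worked sketch of the launch-configuration arithmetic in run() above, assuming
// LocalThreadSizeP = LocalThreadSizeR = 16 purely as an example (the real values
// come from EIGEN_SYCL_LOCAL_THREAD_DIM0/1) and an already power-of-two compute
// unit count. roundUp is re-implemented locally; all numbers are illustrative.
#if 0  // sketch only, never compiled with this header
#include <algorithm>
#include <cstddef>
static std::size_t round_up(std::size_t x, std::size_t m) { return ((x + m - 1) / m) * m; }
static void launch_config_example() {
  const std::size_t LocalThreadSizeP = 16, LocalThreadSizeR = 16;
  const std::size_t localRange = LocalThreadSizeP * LocalThreadSizeR;  // 256
  const std::size_t num_coeffs_to_preserve = 1000, num_coeffs_to_reduce = 1 << 20;
  const std::size_t compute_units = 64;  // assumed result of getPowerOfTwo(getNumSyclMultiProcessors(), true)
  const std::size_t reductionPerThread = 64;
  const std::size_t pNumGroups = round_up(num_coeffs_to_preserve, LocalThreadSizeP) / LocalThreadSizeP;  // 63
  const std::size_t rGroups = (compute_units + pNumGroups - 1) / pNumGroups;                             // 2
  // A second (partial) reduction step is launched only when each thread would
  // otherwise reduce more than reductionPerThread coefficients.
  const std::size_t rNumGroups =
      num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;  // 2
  const std::size_t globalRange = pNumGroups * rNumGroups * localRange;                            // 32256
  (void)globalRange;
}
#endif
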
} // namespace internal
} // namespace TensorSycl

namespace internal {

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // In this step we force the code to be at most a two-step reduction: our
    // empirical research shows that if each thread reduces at least 512
    // elements individually, we get better performance.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer);

      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
          tmp_global_accessor, data,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
          reducer);

      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer);
    }
  }
};
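
// Illustrative usage sketch (an assumption, not part of this header): a full
// reduction that is dispatched to the FullReducer specialization above when the
// expression is evaluated on a SyclDevice. Queue/selector construction differs
// between SYCL implementations, so treat the setup below as an approximation
// modelled on the Eigen SYCL tests rather than as the canonical API.
#if 0  // sketch only, never compiled with this header
#include <unsupported/Eigen/CXX11/Tensor>
static void full_sum_on_sycl_device() {
  cl::sycl::default_selector selector;             // implementation-dependent
  Eigen::QueueInterface queue_interface(selector);
  Eigen::SyclDevice sycl_device(&queue_interface);

  Eigen::Tensor<float, 2> host_in(64, 64);
  host_in.setRandom();
  float *gpu_in = static_cast<float *>(sycl_device.allocate(host_in.size() * sizeof(float)));
  float *gpu_out = static_cast<float *>(sycl_device.allocate(sizeof(float)));
  sycl_device.memcpyHostToDevice(gpu_in, host_in.data(), host_in.size() * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 2>> in(gpu_in, 64, 64);
  Eigen::TensorMap<Eigen::Tensor<float, 0>> out(gpu_out);
  out.device(sycl_device) = in.sum();              // dispatches to FullReducer<..., SyclDevice>

  float result = 0.f;
  sycl_device.memcpyDeviceToHost(&result, gpu_out, sizeof(float));
  sycl_device.deallocate(gpu_in);
  sycl_device.deallocate(gpu_out);
}
#endif
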
// Vectorizable inner_most dim preserver
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// generic partial reduction
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
    return false;
  }
};

} // namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP