TensorDeviceSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>

//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>

namespace Eigen {

namespace TensorSycl {
namespace internal {
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device()
                              .template get_info<
                                  cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device()
                          .get_platform()
                          .template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<cl::sycl::info::device::name>()),
        device_vendor(
            queue.get_device()
                .template get_info<cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

} // end namespace internal
} // end namespace TensorSycl

typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
// All devices (even an AMD CPU with the Intel OpenCL runtime) that support
// OpenCL and can consume SPIR or SPIR-V can use the Eigen SYCL backend and,
// consequently, TensorFlow via the Eigen SYCL backend.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
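
// Example (an illustrative sketch, not part of this header): listing the
// devices this backend would accept. Assumes a SYCL runtime with at least
// one non-host, non-experimental device installed.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   for (const auto &d : devices) {
//     std::cout << d.get_info<cl::sycl::info::device::name>() << "\n";
//   }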

class QueueInterface {
 public:
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
        m_thread_pool(num_threads),
        m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    m_prog.build_with_kernel_type<DeviceOrSelector>();
    auto f = [&](cl::sycl::handler &cgh) {
      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
                                        [=]() {});
    };
    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
  }

  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(dev_or_sel,
                       [this](cl::sycl::exception_list l) {
                         this->exception_caught_ = this->sycl_async_handler(l);
                       },
                       num_threads) {}
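
  // Example (an illustrative sketch, not part of this header): constructing a
  // queue interface from a device selector; queue exceptions are routed to
  // the default async handler installed by the constructor above.
  //
  //   Eigen::QueueInterface queue_interface{cl::sycl::default_selector()};
  //   Eigen::SyclDevice device(&queue_interface);  // SyclDevice is defined below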

#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif

  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return static_cast<void *>(pMapper.add_pointer(buf));
  }

  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree<false>(p, pMapper);
  }

  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    if (scratch_buffers.empty()) {
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    } else {
      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
        auto buff = pMapper.get_buffer(*it);
        if (buff.get_size() >= num_bytes) {
          auto ptr = *it;
          scratch_buffers.erase(it);
          return ptr;
        } else {
          ++it;
        }
      }
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    }
#else
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return static_cast<data_t *>(data.get_virtual_pointer());
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.insert(p);
#else
    TensorSycl::internal::SYCLfree(p, pMapper);
#endif
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
    deallocate_temp(p.get_virtual_pointer());
  }

  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree(p, pMapper);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
        write_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
      auto non_deleter = [](buffer_scalar_t const *) {};
      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(s_ptr, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
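
  // Example (an illustrative sketch; queue_interface, host_data and
  // kByteCount are hypothetical names): uploading a host array into a device
  // allocation obtained from allocate(). Passing an empty callback makes the
  // call block until the copy has finished.
  //
  //   void *gpu_ptr = queue_interface.allocate(kByteCount);
  //   queue_interface.memcpyHostToDevice(gpu_ptr, host_data, kByteCount, {});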

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
        read_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
      auto non_deleter = [](buffer_scalar_t *) {};
      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(src_acc, s_ptr);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }

  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      cgh.copy(src_acc, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }

  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
      // The cast to uint8_t is here to match the behaviour of the standard
      // memset. The cast to buffer_scalar_t is needed to match the type of
      // the accessor (in case buffer_scalar_t is not uint8_t).
      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }

  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;

    std::lock_guard<std::mutex> lock(pmapper_mutex_);

    auto original_buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    const ptrdiff_t typed_offset = offset / sizeof(T);
    eigen_assert(typed_offset >= 0);
    const auto typed_size = original_buffer.get_size() / sizeof(T);
    auto buffer = original_buffer.template reinterpret<
        typename Eigen::internal::remove_const<T>::type>(
        cl::sycl::range<1>(typed_size));
    const ptrdiff_t size = buffer.get_count() - typed_offset;
    eigen_assert(size >= 0);
    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
                               1, AcMd, global_access, is_place_holder>
        placeholder_accessor_t;
    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
                                           cl::sycl::id<1>(typed_offset)),
                    static_cast<size_t>(typed_offset),
                    reinterpret_cast<std::intptr_t>(start_ptr));
  }
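
  // For instance (a worked example, assuming T = float and a byte offset of
  // 64 inside a 1024-byte buffer): the buffer is reinterpreted as 256 floats,
  // typed_offset is 64 / sizeof(float) = 16, and the returned accessor spans
  // the remaining 240 elements starting at element 16.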

  template <cl::sycl::access::mode AcMd, typename Index>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
                     const Index n_bytes) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    eigen_assert(n_bytes >= 0);
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    auto buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    eigen_assert(offset >= 0);
    eigen_assert(offset + n_bytes <= buffer.get_size());
    return buffer.template get_access<AcMd, global_access>(
        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
  }

  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr)
        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
            cgh);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_offset(ptr);
  }

  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
                                                  const Rhs &rhs, OutPtr outptr,
                                                  Range thread_range,
                                                  Index scratchSize,
                                                  T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // binding the placeholder accessors to a command group handler
      lhs.bind(cgh);
      rhs.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
                                                 OutPtr &outptr,
                                                 Range thread_range,
                                                 Index scratchSize,
                                                 T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // binding the placeholder accessors to a command group handler
      inptr.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
                                                   Range thread_range,
                                                   Index scratchSize,
                                                   T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // binding the placeholder accessors to a command group handler
      inptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
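
  // For instance (a hedged sketch; MyKernel, in_acc, out_acc, nd_rng and
  // scratch_size are all hypothetical): a functor type that binds its
  // placeholder accessors via bind(cgh) and is constructible as
  // MyKernel(scratch, in_acc, out_acc, extra...) would be launched as
  //
  //   queue_interface.unary_kernel_launcher<float, MyKernel>(
  //       in_acc, out_acc, nd_rng, scratch_size, extra...);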

  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
    set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
    synchronize();
#endif
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
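
  // Worked example (assuming the default 16x16 EIGEN_SYCL_LOCAL_THREAD_DIM0/1
  // and a device work-group size of at least 256): for n = 1000 this yields
  // tileSize = 256, rng = 1000, and GRange = 1024, the next multiple of the
  // tile size.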

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
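
  // Worked example (again assuming a 256-thread work-group budget): for
  // input_dim = {100, 100}, local_range becomes {16, 16} and each
  // global_range component is rounded up from 100 to 112, the next multiple
  // of 16.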

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }

  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }

  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept
    return 2;
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return m_device_info.local_mem_size;
  }

  // This function returns the nearest power-of-two work-group size that is
  // <= the maximum device work-group size.
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return m_device_info.platform_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return m_device_info.device_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return m_device_info.device_vendor;
  }

  // This function returns the nearest power of two:
  // if roundUp is true it returns result >= wGSize,
  // otherwise it returns result <= wGSize.
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
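
  // Worked example: the shifts smear the highest set bit into all lower
  // bits, so for wGSize = 48 (0b110000) the smeared value is 63 (0b111111);
  // rounding down yields 63 - 31 = 32, while getPowerOfTwo(48, true) first
  // decrements to 47, smears to 63, and increments to 64.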

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }

  // This function checks if the runtime recorded an error for the
  // underlying stream device.
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    return latest_events_[std::this_thread::get_id()];
#else
    eigen_assert(false);
    return cl::sycl::event();
#endif
  }

  // destructor
  ~QueueInterface() {
    pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

 protected:
  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    latest_events_[std::this_thread::get_id()] = e;
#else
    EIGEN_UNUSED_VARIABLE(e);
#endif
  }

  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    set_latest_event(e);
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }

  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }

  bool exception_caught_ = false;

  mutable std::mutex pmapper_mutex_;

#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
  mutable std::mutex event_mutex_;
  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif

  mutable TensorSycl::internal::PointerMapper pMapper;
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
  mutable std::unordered_set<void *> scratch_buffers;
#endif
  mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  mutable cl::sycl::program m_prog;
#endif

  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};

struct SyclDeviceBase {
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};

// Here is a SYCL device struct which accepts the SYCL queue interface
// as an input.
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}

  // this is the accessor used to construct the evaluator
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
  }

  // get sycl accessor
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    return queue_stream()->get_sycl_buffer(ptr);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  EIGEN_STRONG_INLINE void deallocate_all() const {
    queue_stream()->deallocate_all();
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return queue_stream()->get(data);
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return queue_stream()->get(data);
  }

  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    return queue_stream()->attach_buffer(buf);
  }
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    queue_stream()->detach_buffer(p);
  }
  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    return queue_stream()->get_offset(ptr);
  }

  // some runtime conditions that can be applied here
  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
    return queue_stream()->program();
  }
#endif

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on SYCL devices.
    return firstLevelCacheSize();
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }

  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }

  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }
  EIGEN_STRONG_INLINE void async_synchronize(
      cl::sycl::event e = cl::sycl::event()) const {
    queue_stream()->async_synchronize(e);
  }
  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
    return queue_stream()->get_latest_event();
  }

  // This function checks if the runtime recorded an error for the
  // underlying stream device.
  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }
  EIGEN_STRONG_INLINE long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
};
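
// Example (an illustrative, hedged sketch of typical usage, not part of this
// header): allocate device memory, upload, run work on the device, and
// download. The sizes and data are hypothetical.
//
//   Eigen::QueueInterface queue_interface{cl::sycl::default_selector()};
//   Eigen::SyclDevice device(&queue_interface);
//
//   float *gpu_in = static_cast<float *>(device.allocate(64 * sizeof(float)));
//   float host_in[64] = {/* ... */}, host_out[64];
//   device.memcpyHostToDevice(gpu_in, host_in, 64 * sizeof(float));
//   // ... evaluate a tensor expression on `device` ...
//   device.memcpyDeviceToHost(host_out, gpu_in, 64 * sizeof(float));
//   device.deallocate(gpu_in);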
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H