#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H

#include <unordered_set>
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device()
                              .template get_info<
                                  cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device()
                          .get_platform()
                          .template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<cl::sycl::info::device::name>()),
        device_vendor(
            queue.get_device()
                .template get_info<cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
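
// Illustrative usage sketch (assumes the usual Eigen SYCL workflow; the
// variable names are examples only): a QueueInterface owns the SYCL queue,
// and a SyclDevice is the lightweight handle passed to tensor expressions.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   Eigen::QueueInterface queue_interface(devices[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);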
class QueueInterface {
 public:
  /// Create the device from a cl::sycl::selector or a cl::sycl::device.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
        m_thread_pool(num_threads),
        m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    m_prog.build_with_kernel_type<DeviceOrSelector>();
    auto f = [&](cl::sycl::handler &cgh) {
      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
                                        [=]() {});
    };
    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
  }
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(dev_or_sel,
                       [this](cl::sycl::exception_list l) {
                         this->exception_caught_ = this->sycl_async_handler(l);
                       },
                       num_threads) {}
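
  // Construction sketch (illustrative): the delegating constructor above
  // installs an async handler that records asynchronous SYCL errors in
  // exception_caught_; a custom handler can still be supplied explicitly.
  //
  //   Eigen::QueueInterface q1(cl::sycl::gpu_selector());
  //   Eigen::QueueInterface q2(cl::sycl::gpu_selector(),
  //                            [](cl::sycl::exception_list) { /* custom */ });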
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif
  /// Attach an existing SYCL buffer to the pointer mapper and return the
  /// corresponding virtual pointer.
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return static_cast<void *>(pMapper.add_pointer(buf));
  }

  /// Detach a previously attached buffer without freeing it.
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree<false>(p, pMapper);
  }
  /// Allocate num_bytes of device memory, rounded up to the Eigen alignment.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
  }
  /// Allocate a temporary; reuse a cached scratch buffer when one is big
  /// enough (unless EIGEN_SYCL_NO_REUSE_BUFFERS is defined).
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) {
      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
    }
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    if (scratch_buffers.empty()) {
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    } else {
      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
        auto buff = pMapper.get_buffer(*it);
        if (buff.get_size() >= num_bytes) {
          void *ptr = *it;
          scratch_buffers.erase(it);
          return ptr;
        } else {
          ++it;
        }
      }
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    }
#else
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
  }
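
  // Pairing sketch (illustrative): temporaries obtained from allocate_temp()
  // are returned through deallocate_temp(); with buffer reuse enabled the
  // block is parked in scratch_buffers and recycled by the next fitting
  // allocate_temp() call instead of being freed.
  //
  //   void *tmp = allocate_temp(1024);
  //   // ... use tmp as kernel scratch space ...
  //   deallocate_temp(tmp);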
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(
        data);
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return static_cast<data_t *>(data.get_virtual_pointer());
  }
  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.insert(p);
#else
    TensorSycl::internal::SYCLfree(p, pMapper);
#endif
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
    deallocate_temp(p.get_virtual_pointer());
  }

  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree(p, pMapper);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }
  /// Copy n bytes from host memory to the device pointer dst. If a callback
  /// is given it is scheduled on the thread pool once the copy event
  /// completes; otherwise the call blocks.
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
        write_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
      auto non_deleter = [](buffer_scalar_t const *) {};
      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(s_ptr, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
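
  // Usage sketch (illustrative host-side names): copy n floats to the device
  // and block until the copy completes by passing an empty callback.
  //
  //   std::vector<float> host(n);
  //   auto *dst = static_cast<float *>(allocate(n * sizeof(float)));
  //   memcpyHostToDevice(dst, host.data(), n * sizeof(float), {});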
  /// Copy n bytes from the device pointer src back to host memory dst,
  /// following the same callback/blocking convention as memcpyHostToDevice.
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
        read_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
      auto non_deleter = [](buffer_scalar_t *) {};
      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(src_acc, s_ptr);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
  /// Device-to-device copy of n bytes between two virtual pointers.
  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      cgh.copy(src_acc, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
  /// Fill n bytes of device memory with the byte value c (same semantics as
  /// std::memset).
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
      // The cast to uint8_t matches std::memset; the cast to buffer_scalar_t
      // matches the accessor's element type.
      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
  /// Return a placeholder RangeAccess covering the memory from the virtual
  /// pointer ptr to the end of its underlying buffer.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;

    std::lock_guard<std::mutex> lock(pmapper_mutex_);

    auto original_buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    const ptrdiff_t typed_offset = offset / sizeof(T);
    eigen_assert(typed_offset >= 0);
    const auto typed_size = original_buffer.get_size() / sizeof(T);
    auto buffer = original_buffer.template reinterpret<
        typename Eigen::internal::remove_const<T>::type>(
        cl::sycl::range<1>(typed_size));
    const ptrdiff_t size = buffer.get_count() - typed_offset;
    eigen_assert(size >= 0);
    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
                               1, AcMd, global_access, is_place_holder>
        placeholder_accessor_t;
    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
                                           cl::sycl::id<1>(typed_offset)),
                    static_cast<size_t>(typed_offset),
                    reinterpret_cast<std::intptr_t>(start_ptr));
  }
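
  // Note (assumption about the surrounding TensorSycl machinery): the
  // accessor built above is a placeholder, so kernels that consume the
  // returned RangeAccess are expected to bind it to the command-group
  // handler before use (as the kernel launchers below do), e.g.
  //
  //   auto acc = get_range_accessor<cl::sycl::access::mode::read, T>(ptr);
  //   m_queue.submit([&](cl::sycl::handler &cgh) { acc.bind(cgh); /* ... */ });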
  /// Return a bound accessor for n_bytes of device memory starting at the
  /// virtual pointer ptr.
  template <cl::sycl::access::mode AcMd, typename Index>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
                     const Index n_bytes) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    eigen_assert(n_bytes >= 0);
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    auto buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    eigen_assert(offset >= 0);
    eigen_assert(offset + n_bytes <= buffer.get_size());
    return buffer.template get_access<AcMd, global_access>(
        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
  }
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr)
        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
            cgh);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_offset(ptr);
  }
  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
                                                  const Rhs &rhs, OutPtr outptr,
                                                  Range thread_range,
                                                  Index scratchSize,
                                                  T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to the command-group handler.
      lhs.bind(cgh);
      rhs.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
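
  // The unary and nullary launchers below follow the same pattern: bind the
  // placeholder accessors, reserve scratchSize elements of local memory, and
  // run sycl_kernel over thread_range; they differ only in how many pointers
  // are bound and forwarded to the kernel.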
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
                                                 OutPtr &outptr,
                                                 Range thread_range,
                                                 Index scratchSize,
                                                 T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to the command-group handler.
      inptr.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
                                                   Range thread_range,
                                                   Index scratchSize,
                                                   T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessor to the command-group handler.
      inptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }
  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
    set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
    synchronize();
#endif
  }
  /// Compute the 1-D work-group size (tileSize) and global range (GRange)
  /// for n elements.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
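
  // Worked example (assuming a device whose nearest power-of-two work-group
  // size is 256 and the default 16x16 EIGEN_SYCL_LOCAL_THREAD_DIM0/1): for
  // n = 1000 this gives tileSize = 256, rng = 1000, and GRange rounded up to
  // the next multiple of the tile, i.e. GRange = 1024.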
  /// Compute 2-D global and local ranges from the input dimensions.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
  /// Compute 3-D global and local ranges from the input dimensions.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }
  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }

  /// There is no device-version concept in SYCL/OpenCL; behave like the CPU
  /// device.
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not expose this concept.
    return 2;
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return m_device_info.local_mem_size;
  }

  /// Largest power-of-two work-group size supported by the device.
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return m_device_info.platform_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return m_device_info.device_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return m_device_info.device_vendor;
  }
  /// Return the power of two nearest to wGSize: the next one up if roundUp is
  /// true, otherwise the largest power of two not exceeding wGSize.
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
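
  // Worked example: the shifts above smear the highest set bit downwards, so
  // getPowerOfTwo(48, false) smears 48 to 63 and returns 63 - 31 = 32, while
  // getPowerOfTwo(48, true) first decrements to 47, smears to 63, and returns
  // 64 after the increment.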
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
  /// Check whether the runtime recorded an asynchronous error on this queue.
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    return latest_events_[std::this_thread::get_id()];
#else
    eigen_assert(false);
    return cl::sycl::event();
#endif
  }

  ~QueueInterface() {
    pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }
 protected:
  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    latest_events_[std::this_thread::get_id()] = e;
#else
    EIGEN_UNUSED_VARIABLE(e);
#endif
  }
  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    set_latest_event(e);
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }
  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }
  /// Class members:
  bool exception_caught_ = false;

  mutable std::mutex pmapper_mutex_;

#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
  mutable std::mutex event_mutex_;
  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif

  /// Maps virtual pointers to the SYCL buffers that back them.
  mutable TensorSycl::internal::PointerMapper pMapper;
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
  mutable std::unordered_set<void *> scratch_buffers;
#endif
  /// The SYCL queue all work is submitted to.
  mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  mutable cl::sycl::program m_prog;
#endif

  /// Thread pool used to wait on events and run callbacks asynchronously.
  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};
struct SyclDeviceBase {
  /// The QueueInterface is not owned: the caller is responsible for keeping
  /// it alive and destroying it.
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};
// SyclDevice is a thin pass-through handle: every operation is delegated to
// the underlying QueueInterface.
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}
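
  // Usage sketch (illustrative; the tensor names are examples only): a
  // SyclDevice is what tensor expressions are evaluated on.
  //
  //   Eigen::QueueInterface queue_interface(cl::sycl::default_selector());
  //   Eigen::SyclDevice sycl_device(&queue_interface);
  //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in(d_in, n);
  //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out(d_out, n);
  //   gpu_out.device(sycl_device) = gpu_in * 2.0f;
  //   sycl_device.synchronize();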
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
  }

  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    return queue_stream()->get_sycl_buffer(ptr);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }

  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    queue_stream()->deallocate_all();
  }
  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return queue_stream()->get(data);
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return queue_stream()->get(data);
  }

  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    return queue_stream()->attach_buffer(buf);
  }

  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    queue_stream()->detach_buffer(p);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    return queue_stream()->get_offset(ptr);
  }
  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }

  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
    return queue_stream()->program();
  }
#endif

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // No separate last-level cache size is exposed; reuse the L1 estimate.
    return firstLevelCacheSize();
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }

  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }

  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }

  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }

  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }

  EIGEN_STRONG_INLINE void async_synchronize(
      cl::sycl::event e = cl::sycl::event()) const {
    queue_stream()->async_synchronize(e);
  }

  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
    return queue_stream()->get_latest_event();
  }

  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }

  EIGEN_STRONG_INLINE long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }

  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
};

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H