#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

// Maps the gpu* names used below to their hip* or cuda* equivalents;
// a matching header included at the end of this file undefines them all.
#include "TensorGpuHipCudaDefines.h"

namespace Eigen {

static const int kGpuScratchSize = 1024;
// Interface used by GpuDevice to run on top of HIP / CUDA streams.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const gpuStream_t& stream() const = 0;
  virtual const gpuDeviceProp_t& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of size kGpuScratchSize.
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initially initialized to 0, and
  // each kernel using it is responsible for resetting it to 0 upon
  // completion, so that the semaphore is always 0 at kernel start.
  virtual unsigned int* semaphore() const = 0;
};
class GpuDeviceProperties {
 public:
  GpuDeviceProperties() :
      initialized_(false), first_(true), device_properties_(nullptr) {}

  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }

  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
    return device_properties_[device];
  }

  EIGEN_STRONG_INLINE bool isInitialized() const {
    return initialized_;
  }

  void initialize() {
    if (!initialized_) {
      // Attempt to behave correctly when multiple threads call this function
      // simultaneously. This would be trivial with std::mutex, but mutexes
      // don't compile with nvcc, so we use atomics and thread fences instead.
      if (first_.exchange(false)) {
        // We're the first thread to reach this point.
        int num_devices;
        gpuError_t status = gpuGetDeviceCount(&num_devices);
        if (status != gpuSuccess) {
          std::cerr << "Failed to get the number of GPU devices: "
                    << gpuGetErrorString(status)
                    << std::endl;
          gpu_assert(status == gpuSuccess);
        }
        device_properties_ = new gpuDeviceProp_t[num_devices];
        for (int i = 0; i < num_devices; ++i) {
          status = gpuGetDeviceProperties(&device_properties_[i], i);
          if (status != gpuSuccess) {
            std::cerr << "Failed to initialize GPU device #" << i << ": "
                      << gpuGetErrorString(status)
                      << std::endl;
            gpu_assert(status == gpuSuccess);
          }
        }

        std::atomic_thread_fence(std::memory_order_release);
        initialized_ = true;
      } else {
        // Wait for the other thread to finish the initialization.
        while (!initialized_) {
          std::atomic_thread_fence(std::memory_order_acquire);
          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
      }
    }
  }

 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
};
EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
}

EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
}
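// Illustrative use of the per-device properties cache (a sketch, not code
// from this file): the first caller pays the one-time initialization cost,
// later callers read the cached gpuDeviceProp_t array directly.
//
//   const gpuDeviceProp_t& props = Eigen::GetGpuDeviceProperties(/*device=*/0);
//   const int max_threads = props.maxThreadsPerBlock;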
static const gpuStream_t default_stream = gpuStreamDefault;
class GpuStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuGetDevice(&device_);
  }
  // Use the default stream on the specified device.
  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. It is the caller's responsibility to ensure
  // that the stream can run on the specified device. If no device is
  // specified, the stream is assumed to be associated with the current
  // GPU device.
  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      gpuGetDevice(&device_);
    } else {
      int num_devices;
      gpuError_t err = gpuGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
      gpu_assert(device < num_devices);
      device_ = device;
    }
  }
  virtual ~GpuStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const {
    return GetGpuDeviceProperties(device_);
  }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    void* result;
    err = gpuMalloc(&result, num_bytes);
    gpu_assert(err == gpuSuccess);
    gpu_assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    gpu_assert(buffer != NULL);
    err = gpuFree(buffer);
    gpu_assert(err == gpuSuccess);
  }
  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
    }
    return semaphore_;
  }

 private:
  const gpuStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
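// Illustrative construction of a GpuStreamDevice on a user-managed stream
// (a sketch; assumes a CUDA build, where gpuStream_t resolves to
// cudaStream_t, and that the stream outlives the device object):
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   Eigen::GpuStreamDevice stream_device(&stream, /*device=*/0);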
struct GpuDevice {
  // The StreamInterface is not owned: the caller is responsible for its
  // initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
    stream_->deallocate(buffer);
  }
  template <typename Type>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
                                    stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst);
    EIGEN_UNUSED_VARIABLE(src);
    EIGEN_UNUSED_VARIABLE(n);
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on HIP/CUDA devices.
    return firstLevelCacheSize();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuStreamSynchronize(stream_->stream());
    if (err != gpuSuccess) {
      std::cerr << "Error detected in GPU stream: "
                << gpuGetErrorString(err)
                << std::endl;
      gpu_assert(err == gpuSuccess);
    }
#else
    gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }
  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // Returns true unless the GPU runtime reported an error for this stream.
  inline bool ok() const {
#ifdef EIGEN_GPUCC
    gpuError_t error = gpuStreamQuery(stream_->stream());
    return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
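// Illustrative host-side use of GpuDevice (a sketch; the sizes and the
// host_src buffer are hypothetical). All operations are queued on the
// underlying stream, so synchronize() is needed before reading results:
//
//   Eigen::GpuStreamDevice stream;  // default stream on the current device
//   Eigen::GpuDevice device(&stream);
//   const size_t bytes = 1024 * sizeof(float);
//   float* dst = static_cast<float*>(device.allocate(bytes));
//   device.memcpyHostToDevice(dst, host_src, bytes);
//   device.synchronize();
//   device.deallocate(dst);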
#if defined(EIGEN_HIPCC)

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);

#else

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  gpu_assert(cudaGetLastError() == cudaSuccess);

#endif
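// Illustrative launch through the macro (a sketch: my_kernel and its
// arguments are hypothetical). The macro expands to a stream launch plus a
// gpu_assert on the last recorded launch error:
//
//   LAUNCH_GPU_KERNEL(my_kernel, /*gridsize=*/32, /*blocksize=*/256,
//                     /*sharedmem=*/0, device, arg1, arg2);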
// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  gpu_assert(status == gpuSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
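// Illustrative call (a sketch; assumes a CUDA build, where gpuSharedMemConfig
// resolves to cudaSharedMemConfig, so the CUDA enum values are valid):
//
//   Eigen::setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);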
387#include "TensorGpuHipCudaUndefines.h"