tesseract 5.2.0
Loading...
Searching...
No Matches
simddetect.cpp
Go to the documentation of this file.
1
2// File: simddetect.cpp
3// Description: Architecture detector.
4// Author: Stefan Weil (based on code from Ray Smith)
5//
6// (C) Copyright 2014, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
17
18#ifdef HAVE_CONFIG_H
19# include "config_auto.h" // for HAVE_AVX, ...
20#endif
21#include <numeric> // for std::inner_product
22#include "dotproduct.h"
23#include "intsimdmatrix.h" // for IntSimdMatrix
24#include "params.h" // for STRING_VAR
25#include "simddetect.h"
26#include "tprintf.h" // for tprintf
27
28#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
29// The GNU compiler g++ fails to compile with the Accelerate framework
30// (tested with versions 10 and 11), so unconditionally disable it.
31#undef HAVE_FRAMEWORK_ACCELERATE
32#endif
33
34#if defined(HAVE_FRAMEWORK_ACCELERATE)
35
36// Use Apple Accelerate framework.
37// https://developer.apple.com/documentation/accelerate/simd
38
39#include <Accelerate/Accelerate.h>
40
41#endif
42
43#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
44# define HAS_CPUID
45#endif
46
47#if defined(HAS_CPUID)
48# if defined(__GNUC__)
49# include <cpuid.h>
50# elif defined(_WIN32)
51# include <intrin.h>
52# endif
53#endif
54
55#if defined(HAVE_NEON) && !defined(__aarch64__)
56# if defined(HAVE_ANDROID_GETCPUFAMILY)
57# include <cpu-features.h>
58# elif defined(HAVE_GETAUXVAL)
59# include <asm/hwcap.h>
60# include <sys/auxv.h>
61# elif defined(HAVE_ELF_AUX_INFO)
62# include <sys/auxv.h>
63# include <sys/elf.h>
64# endif
65#endif
66
67namespace tesseract {
68
69// Computes and returns the dot product of the two n-vectors u and v.
70// Note: because the order of addition is different among the different dot
71// product functions, the results can (and do) vary slightly (although they
72// agree to within about 4e-15). This produces different results when running
73// training, despite all random inputs being precisely equal.
74// To get consistent results, use just one of these dot product functions.
75// On a test multi-layer network, serial is 57% slower than SSE, and AVX
76// is about 8% faster than SSE. This suggests that the time is memory
77// bandwidth constrained and could benefit from holding the reused vector
78// in AVX registers.
80
// Config variable "dotproduct" (default "auto") naming the dot product
// implementation to use; it is compared against fixed names elsewhere in
// this file.
81static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
82
// The single detector instance. Its constructor performs the feature
// detection, so detection runs when this object is initialized.
// NOTE(review): the constructor assigns to `dotproduct`, so it presumably
// relies on that variable being defined earlier in this translation unit.
83SIMDDetect SIMDDetect::detector;
84
// Definitions of the static feature flags. Which set exists depends on the
// target: aarch64 and NEON builds define only neon_available_, all other
// builds define the x86 flags. Flags defined without an initializer have
// static storage and therefore start out false.
85#if defined(__aarch64__)
86// ARMv8 always has NEON.
87bool SIMDDetect::neon_available_ = true;
88#elif defined(HAVE_NEON)
89// If true, then NEON has been detected.
90bool SIMDDetect::neon_available_;
91#else
92// If true, then the corresponding AVX variant (AVX, AVX2, AVX512F, AVX512BW) has been detected.
93bool SIMDDetect::avx_available_;
94bool SIMDDetect::avx2_available_;
95bool SIMDDetect::avx512F_available_;
96bool SIMDDetect::avx512BW_available_;
97// If true, then FMA has been detected.
98bool SIMDDetect::fma_available_;
99// If true, then SSE4.1 has been detected.
100bool SIMDDetect::sse_available_;
101#endif
102
103#if defined(HAVE_FRAMEWORK_ACCELERATE)
104static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {
105 TFloat total = 0;
106 const int stride = 1;
107#if defined(FAST_FLOAT)
108 vDSP_dotpr(u, stride, v, stride, &total, n);
109#else
110 vDSP_dotprD(u, stride, v, stride, &total, n);
111#endif
112 return total;
113}
114#endif
115
116// Computes and returns the dot product of the two n-vectors u and v.
117static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
118 TFloat total = 0;
119 for (int k = 0; k < n; ++k) {
120 total += u[k] * v[k];
121 }
122 return total;
123}
124
125// Compute dot product using std::inner_product.
126static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {
127 return std::inner_product(u, u + n, v, static_cast<TFloat>(0));
128}
129
// Installs f as the active dot product implementation by storing it in the
// DotProduct function pointer.
// NOTE(review): m is not used by the visible body. This listing skips source
// line 132 (between the assignment and the closing brace), which presumably
// consumes m - confirm against the original file.
130static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
131 DotProduct = f;
133}
134
135// Constructor.
136// Tests the architecture in a system-dependent way to detect AVX, SSE and
137// any other available SIMD equipment.
138// __GNUC__ is also defined by compilers that include GNU extensions such as
139// clang.
// Steps, in order: install the generic dot product as the fallback, query
// the CPU features (CPUID when HAS_CPUID is defined, OS facilities for
// non-aarch64 NEON builds), walk an if/else-if chain keyed on the detected
// flags, and finally apply the DOTPRODUCT environment variable, if set,
// through Update().
// NOTE(review): this listing skips source lines 236, 241, 246, 251 and 256
// (one inside each branch of the selection chain), so the statements that
// act on a detected feature are not visible here - check the original file.
140SIMDDetect::SIMDDetect() {
141 // The fallback is a generic dot product calculation.
142 SetDotProduct(DotProductGeneric);
143
144#if defined(HAS_CPUID)
145# if defined(__GNUC__)
146 unsigned int eax, ebx, ecx, edx;
 // Query CPUID leaf 1; the feature bits tested below are read from ECX.
147 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
148 // Note that these tests all use hex because the older compilers don't have
149 // the newer flags.
150# if defined(HAVE_SSE4_1)
 // ECX bit 19.
151 sse_available_ = (ecx & 0x00080000) != 0;
152# endif
153# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
 // Executes xgetbv with ECX = 0 and returns the value delivered in EAX;
 // EDX is declared as clobbered and its contents are discarded.
154 auto xgetbv = []() {
155 uint32_t xcr0;
156 __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
157 return xcr0;
158 };
 // ECX bit 27 must be set and bits 1 and 2 of the xgetbv result must both
 // be set before any of the flags below are trusted.
159 if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
160 // OSXSAVE bit is set, XMM state and YMM state are fine.
161# if defined(HAVE_FMA)
 // ECX bit 12.
162 fma_available_ = (ecx & 0x00001000) != 0;
163# endif
164# if defined(HAVE_AVX)
 // ECX bit 28.
165 avx_available_ = (ecx & 0x10000000) != 0;
166 if (avx_available_) {
167 // There is supposed to be a __get_cpuid_count function, but this is all
168 // there is in my cpuid.h. It is a macro for an asm statement and cannot
169 // be used inside an if.
 // Leaf 7, sub-leaf 0: EBX bit 5 -> AVX2, bit 16 -> AVX512F,
 // bit 30 -> AVX512BW. This overwrites eax/ebx/ecx/edx from leaf 1.
 // NOTE(review): unlike the _WIN32 branch below, this path is guarded by
 // HAVE_AVX (not HAVE_AVX2) and does not check that leaf 7 is supported
 // before querying it - confirm that this difference is intended.
170 __cpuid_count(7, 0, eax, ebx, ecx, edx);
171 avx2_available_ = (ebx & 0x00000020) != 0;
172 avx512F_available_ = (ebx & 0x00010000) != 0;
173 avx512BW_available_ = (ebx & 0x40000000) != 0;
174 }
175# endif
176 }
177# endif
178 }
179# elif defined(_WIN32)
 // cpuInfo receives the four registers of a CPUID query; the code below
 // reads index 0 as the highest supported leaf (after leaf 0), index 1 for
 // the leaf 7 bits and index 2 for the leaf 1 bits.
180 int cpuInfo[4];
181 int max_function_id;
182 __cpuid(cpuInfo, 0);
183 max_function_id = cpuInfo[0];
184 if (max_function_id >= 1) {
185 __cpuid(cpuInfo, 1);
186# if defined(HAVE_SSE4_1)
187 sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
188# endif
189# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
 // Same gate as in the __GNUC__ branch, using the _xgetbv intrinsic.
190 if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
191 // OSXSAVE bit is set, XMM state and YMM state are fine.
192# if defined(HAVE_FMA)
193 fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
194# endif
195# if defined(HAVE_AVX)
196 avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
197# endif
198# if defined(HAVE_AVX2)
 // Leaf 7 is only queried when the CPU reports that it supports it.
199 if (max_function_id >= 7) {
200 __cpuid(cpuInfo, 7);
201 avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
202 avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
203 avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
204 }
205# endif
206 }
207# endif
208 }
209# else
210# error "I don't know how to test for SIMD with this compiler"
211# endif
212#endif
213
 // NEON detection for builds that are not aarch64 (there neon_available_ is
 // defined as true). Each variant masks a capability word with the NEON bit;
 // a non-zero result converts to true.
214#if defined(HAVE_NEON) && !defined(__aarch64__)
215# if defined(HAVE_ANDROID_GETCPUFAMILY)
216 {
217 AndroidCpuFamily family = android_getCpuFamily();
218 if (family == ANDROID_CPU_FAMILY_ARM)
219 neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
220 }
221# elif defined(HAVE_GETAUXVAL)
222 neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
223# elif defined(HAVE_ELF_AUX_INFO)
 // The return value of elf_aux_info is not checked; hwcap keeps its initial
 // value 0 (NEON reported as absent) unless the call writes to it.
224 unsigned long hwcap = 0;
225 elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
226 neon_available_ = hwcap & HWCAP_NEON;
227# endif
228#endif
229
230 // Select code for calculation of dot product based on autodetection.
 // The first flag that is set wins, in the order AVX512F, AVX2, AVX, SSE,
 // NEON. The leading `if (false)` lets every candidate be written as
 // `} else if (...) {` inside its own #if block. fma_available_ is not
 // consulted by this chain.
 // NOTE(review): HAVE_AVX512F alone does not define HAS_CPUID (see the
 // HAS_CPUID definition near the top of the file), so the AVX512F flag is
 // only ever set when one of the other x86 HAVE_* macros is defined too.
231 if (false) {
232 // This is a dummy to support conditional compilation.
233#if defined(HAVE_AVX512F)
234 } else if (avx512F_available_) {
235 // AVX512F detected.
237#endif
238#if defined(HAVE_AVX2)
239 } else if (avx2_available_) {
240 // AVX2 detected.
242#endif
243#if defined(HAVE_AVX)
244 } else if (avx_available_) {
245 // AVX detected.
247#endif
248#if defined(HAVE_SSE4_1)
249 } else if (sse_available_) {
250 // SSE detected.
252#endif
253#if defined(HAVE_NEON) || defined(__aarch64__)
254 } else if (neon_available_) {
255 // NEON detected.
257#endif
258 }
259
 // A DOTPRODUCT environment variable, when present, is copied into the
 // config variable and applied immediately via Update().
260 const char *dotproduct_env = getenv("DOTPRODUCT");
261 if (dotproduct_env != nullptr) {
262 // Override automatic settings by value from environment variable.
263 dotproduct = dotproduct_env;
264 Update();
265 }
266}
267
 // NOTE(review): the listing drops source line 268 just above this point.
 // The cross-reference index at the end of this page attributes that line to
 // the definition of Update(), so the statements below are presumably its
 // body. Source lines 280, 285, 291, 297, 303 and 313 (one inside each of the
 // native/avx2/avx/fma/sse/neon branches) are missing as well, so the
 // statements that install those implementations are not visible here.
269 // Select code for calculation of dot product based on the
270 // value of the config variable if that value is not empty.
 // dotproduct_method is the name written back to the config variable at the
 // end. It starts as "generic" and is only changed by branches that assign
 // it, so "auto" and unsupported values both end up reported as "generic"
 // without this code changing the installed function.
271 const char *dotproduct_method = "generic";
272 if (dotproduct == "auto") {
273 // Automatic detection. Nothing to be done.
274 } else if (dotproduct == "generic") {
275 // Generic code selected by config variable.
276 SetDotProduct(DotProductGeneric);
277 dotproduct_method = "generic";
278 } else if (dotproduct == "native") {
279 // Native optimized code selected by config variable.
281 dotproduct_method = "native";
 // NOTE(review): the visible conditions of the avx2/avx/fma/sse branches
 // depend only on the HAVE_* build macros and the requested name; they do
 // not test the corresponding *_available_ runtime flags.
282#if defined(HAVE_AVX2)
283 } else if (dotproduct == "avx2") {
284 // AVX2 selected by config variable.
286 dotproduct_method = "avx2";
287#endif
288#if defined(HAVE_AVX)
289 } else if (dotproduct == "avx") {
290 // AVX selected by config variable.
292 dotproduct_method = "avx";
293#endif
294#if defined(HAVE_FMA)
295 } else if (dotproduct == "fma") {
296 // FMA selected by config variable.
298 dotproduct_method = "fma";
299#endif
300#if defined(HAVE_SSE4_1)
301 } else if (dotproduct == "sse") {
302 // SSE selected by config variable.
304 dotproduct_method = "sse";
305#endif
306#if defined(HAVE_FRAMEWORK_ACCELERATE)
 // NOTE(review): this branch does not assign dotproduct_method, so selecting
 // "accelerate" rewrites the config variable to "generic" below even though
 // DotProductAccelerate is installed - confirm whether that is intended.
307 } else if (dotproduct == "accelerate") {
308 SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);
309#endif
310#if defined(HAVE_NEON) || defined(__aarch64__)
 // Unlike the branches above, "neon" is honored only when NEON was detected;
 // otherwise the request falls through to the unsupported-value warning.
311 } else if (dotproduct == "neon" && neon_available_) {
312 // NEON selected by config variable.
314 dotproduct_method = "neon";
315#endif
316 } else if (dotproduct == "std::inner_product") {
317 // std::inner_product selected by config variable.
318 SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);
319 dotproduct_method = "std::inner_product";
320 } else {
321 // Unsupported value of config variable.
 // The list of supported values is assembled from adjacent string literals
 // selected by the same HAVE_* macros as the branches above.
 // NOTE(review): "neon" is accepted above but never appears in this list.
322 tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
323 dotproduct.c_str());
324 tprintf(
325 "Supported values for dotproduct: auto generic native"
326#if defined(HAVE_AVX2)
327 " avx2"
328#endif
329#if defined(HAVE_AVX)
330 " avx"
331#endif
332#if defined(HAVE_FMA)
333 " fma"
334#endif
335#if defined(HAVE_SSE4_1)
336 " sse"
337#endif
338#if defined(HAVE_FRAMEWORK_ACCELERATE)
339 " accelerate"
340#endif
341 " std::inner_product.\n");
342 }
343
 // Write the resolved name back into the config variable.
344 dotproduct.set_value(dotproduct_method);
345}
346
347} // namespace tesseract
#define HAVE_AVX2
Definition: config_auto.h:26
#define HAVE_SSE4_1
Definition: config_auto.h:74
#define HAVE_AVX
Definition: config_auto.h:23
#define HAVE_FMA
Definition: config_auto.h:44
#define STRING_VAR(name, val, comment)
Definition: params.h:362
TFloat DotProductAVX512F(const TFloat *u, const TFloat *v, int n)
TFloat(*)(const TFloat *, const TFloat *, int) DotProductFunction
Definition: simddetect.h:26
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n)
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n)
Definition: dotproduct.cpp:22
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n)
DotProductFunction DotProduct
Definition: simddetect.cpp:79
double TFloat
Definition: tesstypes.h:39
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n)
static const IntSimdMatrix intSimdMatrixAVX2
static const IntSimdMatrix * intSimdMatrix
static const IntSimdMatrix intSimdMatrixSSE
static const IntSimdMatrix intSimdMatrixNEON
static TESS_API void Update()
Definition: simddetect.cpp:268