28#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
31#undef HAVE_FRAMEWORK_ACCELERATE
34#if defined(HAVE_FRAMEWORK_ACCELERATE)
39#include <Accelerate/Accelerate.h>
43#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
55#if defined(HAVE_NEON) && !defined(__aarch64__)
56# if defined(HAVE_ANDROID_GETCPUFAMILY)
57# include <cpu-features.h>
58# elif defined(HAVE_GETAUXVAL)
59# include <asm/hwcap.h>
61# elif defined(HAVE_ELF_AUX_INFO)
// File-local configuration variable `dotproduct`, declared through the
// STRING_VAR(name, val, comment) macro: the value is "auto" and the third
// argument is the descriptive comment text.
// Elsewhere in this file the variable is compared against method names,
// overwritten from the DOTPRODUCT environment variable, and updated with
// set_value().
81static STRING_VAR(dotproduct,
"auto",
"Function used for calculation of dot product");
// Definitions of the static CPU-feature flags of SIMDDetect.
// NOTE(review): the #endif lines closing the conditionals in this group are
// not visible in this excerpt.
85#if defined(__aarch64__)
// AArch64 builds: the NEON flag is set to true right at its definition.
87bool SIMDDetect::neon_available_ =
true;
88#elif defined(HAVE_NEON)
// Other NEON-enabled builds: no initializer, so the flag starts out false
// (zero-initialized static storage); the constructor assigns it from the
// platform's capability query.
90bool SIMDDetect::neon_available_;
// x86 feature flags. They have no initializer either, so they start out false;
// the constructor assigns them from the CPUID results.
93bool SIMDDetect::avx_available_;
94bool SIMDDetect::avx2_available_;
95bool SIMDDetect::avx512F_available_;
96bool SIMDDetect::avx512BW_available_;
98bool SIMDDetect::fma_available_;
100bool SIMDDetect::sse_available_;
// Dot product computed through Apple's Accelerate framework; this code is only
// compiled when HAVE_FRAMEWORK_ACCELERATE is defined.
// NOTE(review): the enclosing function header, the declaration of `total`, the
// return statement and the matching #else/#endif lines are not visible in this
// excerpt.
103#if defined(HAVE_FRAMEWORK_ACCELERATE)
// Both input vectors are traversed with a stride of one element.
106 const int stride = 1;
107#if defined(FAST_FLOAT)
// vDSP_dotpr is the single-precision vDSP routine; it stores the dot product
// of the n-element vectors u and v in `total`.
// Presumably FAST_FLOAT means TFloat is float -- TODO confirm.
108 vDSP_dotpr(u, stride, v, stride, &total, n);
// vDSP_dotprD is the double-precision counterpart, called with the same
// arguments (presumably the non-FAST_FLOAT branch; its #else is not shown).
110 vDSP_dotprD(u, stride, v, stride, &total, n);
// Scalar accumulation loop: adds the element-wise products u[k] * v[k] for
// k in [0, n) to `total`.
// NOTE(review): the enclosing function header, the declaration of `total` and
// the closing braces are not visible in this excerpt; presumably this is the
// body of the portable dot-product fallback -- confirm against the full file.
119 for (
int k = 0; k < n; ++k) {
120 total += u[k] * v[k];
// Returns the dot product of the n-element ranges starting at u and v using
// std::inner_product. The initial value is cast to TFloat so that the
// accumulation is carried out in TFloat.
// NOTE(review): the enclosing function header is not visible in this excerpt.
127 return std::inner_product(u, u + n, v,
static_cast<TFloat>(0));
// Constructor. Visible steps:
//   1. install DotProductGeneric as the dot-product function,
//   2. on x86 (HAS_CPUID) query CPUID and set the sse/fma/avx/avx2/avx512
//      availability flags,
//   3. on 32-bit ARM builds with HAVE_NEON query the OS for NEON support,
//   4. walk an else-if chain over the flags in the order
//      avx512F -> avx2 -> avx -> sse -> neon.
// NOTE(review): this excerpt omits many original lines (branch bodies, closing
// braces, #else/#endif), so what each branch of step 4 selects is not visible.
140SIMDDetect::SIMDDetect() {
// Portable default; stays in effect unless a later step replaces it.
142 SetDotProduct(DotProductGeneric);
144#if defined(HAS_CPUID)
// GNU-compatible compilers: use __get_cpuid / __cpuid_count.
145# if defined(__GNUC__)
146 unsigned int eax, ebx, ecx, edx;
// Query CPUID leaf 1; the feature tests below run only if the call reports
// success (non-zero result).
147 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
150# if defined(HAVE_SSE4_1)
// Leaf 1, ECX bit 19 (SSE4.1 feature bit).
151 sse_available_ = (ecx & 0x00080000) != 0;
153# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
// Inline assembly: executes xgetbv with ECX = 0 and stores EAX in xcr0; EDX is
// declared as clobbered.
// NOTE(review): the declaration of xcr0 and the function wrapping this
// statement are not visible here -- presumably it is the xgetbv() helper
// called below.
156 __asm__(
"xgetbv" :
"=a"(xcr0) :
"c"(0) :
"%edx");
// Requires ECX bit 27 (OSXSAVE) and bits 1 and 2 of the xgetbv() result
// (mask 6, the SSE and AVX register state) before any AVX-family flag is set.
159 if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
161# if defined(HAVE_FMA)
// Leaf 1, ECX bit 12.
162 fma_available_ = (ecx & 0x00001000) != 0;
164# if defined(HAVE_AVX)
// Leaf 1, ECX bit 28.
165 avx_available_ = (ecx & 0x10000000) != 0;
166 if (avx_available_) {
// CPUID leaf 7, sub-leaf 0: EBX bit 5 -> AVX2, bit 16 -> AVX-512F,
// bit 30 -> AVX-512BW. The call overwrites eax/ebx/ecx/edx from leaf 1.
170 __cpuid_count(7, 0, eax, ebx, ecx, edx);
171 avx2_available_ = (ebx & 0x00000020) != 0;
172 avx512F_available_ = (ebx & 0x00010000) != 0;
173 avx512BW_available_ = (ebx & 0x40000000) != 0;
// Windows toolchains: the same bit tests applied to a cpuInfo array.
// NOTE(review): the declaration of cpuInfo / max_function_id and the calls
// that fill cpuInfo are not visible in this excerpt; judging by the masks,
// cpuInfo[2] plays the role of ECX and cpuInfo[1] the role of EBX above.
179# elif defined(_WIN32)
// cpuInfo[0] is taken as the highest supported CPUID function id.
183 max_function_id = cpuInfo[0];
184 if (max_function_id >= 1) {
186# if defined(HAVE_SSE4_1)
187 sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
189# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
// Same OSXSAVE + XCR0 (mask 6) precondition as in the GNU branch, using the
// _xgetbv(0) intrinsic.
190 if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
192# if defined(HAVE_FMA)
193 fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
195# if defined(HAVE_AVX)
196 avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
198# if defined(HAVE_AVX2)
// The leaf-7 bits are only examined when function id 7 is supported.
199 if (max_function_id >= 7) {
201 avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
202 avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
203 avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
// Any other compiler with HAS_CPUID defined is rejected at compile time.
210# error "I don't know how to test for SIMD with this compiler"
// NEON detection for non-AArch64 ARM builds (on AArch64 the flag is already
// defined as true). One of three OS facilities is used, chosen at build time.
214#if defined(HAVE_NEON) && !defined(__aarch64__)
215# if defined(HAVE_ANDROID_GETCPUFAMILY)
// Android cpu-features API: NEON is only checked when the family is ARM.
217 AndroidCpuFamily family = android_getCpuFamily();
218 if (family == ANDROID_CPU_FAMILY_ARM)
219 neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
221# elif defined(HAVE_GETAUXVAL)
// getauxval: test the HWCAP_NEON bit of the AT_HWCAP auxiliary value.
222 neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
223# elif defined(HAVE_ELF_AUX_INFO)
// elf_aux_info: hwcap stays 0 (NEON reported unavailable) if the call does not
// fill it in; the return value is not checked.
224 unsigned long hwcap = 0;
225 elf_aux_info(AT_HWCAP, &hwcap,
sizeof hwcap);
226 neon_available_ = hwcap & HWCAP_NEON;
// Selection chain over the detected flags; each branch is compiled only when
// the corresponding HAVE_* macro is defined.
// NOTE(review): the statement that opens this chain and the bodies of the
// branches are not visible in this excerpt.
233#if defined(HAVE_AVX512F)
234 }
else if (avx512F_available_) {
238#if defined(HAVE_AVX2)
239 }
else if (avx2_available_) {
244 }
else if (avx_available_) {
248#if defined(HAVE_SSE4_1)
249 }
else if (sse_available_) {
253#if defined(HAVE_NEON) || defined(__aarch64__)
254 }
else if (neon_available_) {
// Applies the user's choice of dot-product implementation.
// Visible steps:
//   1. if the DOTPRODUCT environment variable is set, copy it into the
//      `dotproduct` config variable,
//   2. match `dotproduct` against the known method names in an else-if chain,
//   3. for an unmatched value print a warning listing the supported values,
//   4. store the resulting method name back with dotproduct.set_value().
// NOTE(review): the enclosing function header is not visible in this excerpt
// (presumably SIMDDetect::Update() -- confirm), and several branch bodies,
// #if guards and #endif lines are missing.
260 const char *dotproduct_env = getenv(
"DOTPRODUCT");
261 if (dotproduct_env !=
nullptr) {
// The environment variable overrides the configured value.
263 dotproduct = dotproduct_env;
// Name reported back at the end; "generic" unless a branch below changes it.
271 const char *dotproduct_method =
"generic";
// "auto": the body of this branch is not visible in this excerpt.
272 if (dotproduct ==
"auto") {
274 }
else if (dotproduct ==
"generic") {
// Explicit request for the portable implementation.
276 SetDotProduct(DotProductGeneric);
277 dotproduct_method =
"generic";
278 }
else if (dotproduct ==
"native") {
281 dotproduct_method =
"native";
// The "avx2" branch exists only in builds with HAVE_AVX2.
282#if defined(HAVE_AVX2)
283 }
else if (dotproduct ==
"avx2") {
286 dotproduct_method =
"avx2";
289 }
else if (dotproduct ==
"avx") {
292 dotproduct_method =
"avx";
295 }
else if (dotproduct ==
"fma") {
298 dotproduct_method =
"fma";
// The "sse" branch exists only in builds with HAVE_SSE4_1.
300#if defined(HAVE_SSE4_1)
301 }
else if (dotproduct ==
"sse") {
304 dotproduct_method =
"sse";
// The "accelerate" branch exists only in builds with
// HAVE_FRAMEWORK_ACCELERATE; its body is not visible in this excerpt.
306#if defined(HAVE_FRAMEWORK_ACCELERATE)
307 }
else if (dotproduct ==
"accelerate") {
// "neon" is accepted only when NEON was detected (neon_available_ is true);
// otherwise the value falls through to the remaining branches.
310#if defined(HAVE_NEON) || defined(__aarch64__)
311 }
else if (dotproduct ==
"neon" && neon_available_) {
314 dotproduct_method =
"neon";
316 }
else if (dotproduct ==
"std::inner_product") {
319 dotproduct_method =
"std::inner_product";
// Unrecognized value: warn and list the supported values. The list is
// assembled from string literals, some of them behind #if guards.
// NOTE(review): the remaining tprintf arguments and most pieces of the list
// are not visible in this excerpt.
322 tprintf(
"Warning, ignoring unsupported config variable value: dotproduct=%s\n",
325 "Supported values for dotproduct: auto generic native"
338#
if defined(HAVE_FRAMEWORK_ACCELERATE)
341 " std::inner_product.\n");
// Write the method name chosen above back into the config variable.
344 dotproduct.set_value(dotproduct_method);
#define STRING_VAR(name, val, comment)
TFloat DotProductAVX512F(const TFloat *u, const TFloat *v, int n)
TFloat(*)(const TFloat *, const TFloat *, int) DotProductFunction
void tprintf(const char *format,...)
TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n)
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n)
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n)
DotProductFunction DotProduct
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n)
static const IntSimdMatrix intSimdMatrixAVX2
static const IntSimdMatrix * intSimdMatrix
static const IntSimdMatrix intSimdMatrixSSE
static const IntSimdMatrix intSimdMatrixNEON
static TESS_API void Update()