From 41ccfee7a41c81d09d388ccdf9a20da30c49043b Mon Sep 17 00:00:00 2001
From: Paul Davis
Date: Tue, 11 Aug 2015 22:56:55 -0400
Subject: [PATCH] clean up FPU code with some ideas from Chromium and the web

---
 libs/pbd/fpu.cc    | 301 +++++++++++++++++++++++++--------------------
 libs/pbd/pbd/fpu.h |   7 +-
 2 files changed, 176 insertions(+), 132 deletions(-)

diff --git a/libs/pbd/fpu.cc b/libs/pbd/fpu.cc
index 93f75f7e2a..5c98ddc3c9 100644
--- a/libs/pbd/fpu.cc
+++ b/libs/pbd/fpu.cc
@@ -29,6 +29,7 @@
 #include <intrin.h>
 #endif
 
+#include "pbd/compose.h"
 #include "pbd/fpu.h"
 #include "pbd/error.h"
 
@@ -37,178 +38,216 @@ using namespace PBD;
 using namespace std;
 
-FPU::FPU ()
+FPU* FPU::_instance (0);
+
+#ifndef COMPILER_MSVC
+
+/* use __cpuid() as the name to match the MSVC intrinsic */
+
+static void
+__cpuid(int regs[4], int cpuid_leaf)
+{
+	int eax, ebx, ecx, edx;
+	asm volatile (
+#if defined(__i386__)
+		"pushl %%ebx;\n\t"
+#endif
+		"movl %4, %%eax;\n\t"
+		"cpuid;\n\t"
+		"movl %%eax, %0;\n\t"
+		"movl %%ebx, %1;\n\t"
+		"movl %%ecx, %2;\n\t"
+		"movl %%edx, %3;\n\t"
+#if defined(__i386__)
+		"popl %%ebx;\n\t"
+#endif
+		:"=m" (eax), "=m" (ebx), "=m" (ecx), "=m" (edx)
+		:"r" (cpuid_leaf)
+		:"%eax",
+#if !defined(__i386__)
+		"%ebx",
+#endif
+		"%ecx", "%edx");
+
+	regs[0] = eax;
+	regs[1] = ebx;
+	regs[2] = ecx;
+	regs[3] = edx;
+}
+
+static uint64_t
+_xgetbv (uint32_t xcr)
 {
-	unsigned long cpuflags = 0;
+	uint32_t eax, edx;
+	__asm__ volatile ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (xcr));
+	return (static_cast<uint64_t>(edx) << 32) | eax;
+}
+
+#define _XCR_XFEATURE_ENABLED_MASK 0
 
-	_flags = Flags (0);
+#endif /* !COMPILER_MSVC */
+
+FPU*
+FPU::instance()
+{
+	if (!_instance) {
+		_instance = new FPU;
+	}
+
+	return _instance;
+}
+
+FPU::FPU ()
+	: _flags ((Flags) 0)
+{
+	if (_instance) {
+		error << _("FPU object instantiated more than once") << endmsg;
+	}
 
 #if !( (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) ) // !ARCH_X86
+	/* Non-Intel architecture, nothing to do here */
 	return;
 #else
 
-#ifdef PLATFORM_WINDOWS
+	/* Get the CPU vendor just for kicks */
 
-	// Get CPU flags using Microsoft function
-	// It works for both 64 and 32 bit systems
-	// no need to use assembler for getting info from register, this function does this for us
-	int cpuInfo[4];
-	__cpuid (cpuInfo, 1);
-	cpuflags = cpuInfo[3];
+	// __cpuid with an InfoType argument of 0 returns the number of
+	// valid Ids in CPUInfo[0] and the CPU identification string in
+	// the other three array elements. The CPU identification string is
+	// not in linear order. The code below arranges the information
+	// in a human readable form. The human readable order is CPUInfo[1] |
+	// CPUInfo[3] | CPUInfo[2]. CPUInfo[2] and CPUInfo[3] are swapped
+	// before using memcpy to copy these three array elements to cpu_string.
 
-#else
+	int cpu_info[4];
+	char cpu_string[48];
+	string cpu_vendor;
 
-#ifndef _LP64 /* *nix; 32 bit version. This odd macro constant is required because we need something that identifies this as a 32 bit
-		 build on Linux and on OS X. Anything that serves this purpose will do, but this is the best thing we've identified
-		 so far.
-	      */
-
-	asm volatile (
-		"mov $1, %%eax\n"
-		"pushl %%ebx\n"
-		"cpuid\n"
-		"movl %%edx, %0\n"
-		"popl %%ebx\n"
-		: "=r" (cpuflags)
-		:
-		: "%eax", "%ecx", "%edx"
-		);
+	__cpuid (cpu_info, 0);
+
+	int num_ids = cpu_info[0];
+	std::swap(cpu_info[2], cpu_info[3]);
+	memcpy(cpu_string, &cpu_info[1], 3 * sizeof(cpu_info[1]));
+	cpu_vendor.assign(cpu_string, 3 * sizeof(cpu_info[1]));
+
+	info << string_compose (_("CPU vendor: %1"), cpu_vendor) << endmsg;
+
+	if (num_ids > 0) {
 
-#else /* *nix; 64 bit version */
+		/* Now get CPU/FPU flags */
 
-	/* asm notes: although we explicitly save&restore rbx, we must tell
-	   gcc that ebx,rbx is clobbered so that it doesn't try to use it as an intermediate
-	   register when storing rbx. gcc 4.3 didn't make this "mistake", but gcc 4.4
-	   does, at least on x86_64.
-	*/
-
-	asm volatile (
-		"pushq %%rbx\n"
-		"movq $1, %%rax\n"
-		"cpuid\n"
-		"movq %%rdx, %0\n"
-		"popq %%rbx\n"
-		: "=r" (cpuflags)
-		:
-		: "%rax", "%rbx", "%rcx", "%rdx"
-		);
-
-#endif /* _LP64 */
-#endif /* PLATFORM_WINDOWS */
-
-#ifndef __APPLE__
-	/* must check for both AVX and OSXSAVE support in cpuflags before
-	 * attempting to use AVX related instructions.
-	 */
-	if ((cpuflags & (1<<27)) /* AVX */ && (cpuflags & (1<<28) /* (OS)XSAVE */)) {
-
-		std::cerr << "Looks like AVX\n";
-
-		/* now check if YMM resters state is saved: which means OS does
-		 * know about new YMM registers and saves them during context
-		 * switches it's true for most cases, but we must be sure
-		 *
-		 * giving 0 as the argument to _xgetbv() fetches the
-		 * XCR_XFEATURE_ENABLED_MASK, which we need to check for
-		 * the 2nd and 3rd bits, indicating correct register save/restore.
-		 */
-
-		uint64_t xcrFeatureMask = 0;
-
-#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4
-		unsigned int eax, edx, index = 0;
-		asm volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
-		xcrFeatureMask = ((unsigned long long)edx << 32) | eax;
-#elif defined (COMPILER_MSVC)
-		xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
-#endif
-		if (xcrFeatureMask & 0x6) {
-			std::cerr << "Definitely AVX\n";
+		__cpuid (cpu_info, 1);
+
+		if ((cpu_info[2] & (1<<27)) /* AVX */ &&
+		    (cpu_info[2] & (1<<28) /* (OS)XSAVE */) &&
+		    (_xgetbv (_XCR_XFEATURE_ENABLED_MASK) & 0x6)) { /* OS really supports XSAVE */
+			info << _("AVX-capable processor") << endmsg;
 			_flags = Flags (_flags | (HasAVX) );
 		}
-	}
-#endif /* !__APPLE__ */
 
-	if (cpuflags & (1<<25)) {
-		_flags = Flags (_flags | (HasSSE|HasFlushToZero));
-	}
+		if (cpu_info[3] & (1<<25)) {
+			_flags = Flags (_flags | (HasSSE|HasFlushToZero));
+		}
 
-	if (cpuflags & (1<<26)) {
-		_flags = Flags (_flags | HasSSE2);
-	}
+		if (cpu_info[3] & (1<<26)) {
+			_flags = Flags (_flags | HasSSE2);
+		}
 
-	if (cpuflags & (1 << 24)) {
+		/* Figure out CPU/FPU denormal handling capabilities */
+
+		if (cpu_info[3] & (1 << 24)) {
 
-		char** fxbuf = 0;
+			char** fxbuf = 0;
 
-		/* DAZ wasn't available in the first version of SSE. Since
-		   setting a reserved bit in MXCSR causes a general protection
-		   fault, we need to be able to check the availability of this
-		   feature without causing problems. To do this, one needs to
-		   set up a 512-byte area of memory to save the SSE state to,
-		   using fxsave, and then one needs to inspect bytes 28 through
-		   31 for the MXCSR_MASK value. If bit 6 is set, DAZ is
-		   supported, otherwise, it isn't.
-		*/
+			/* DAZ wasn't available in the first version of SSE. Since
+			   setting a reserved bit in MXCSR causes a general protection
+			   fault, we need to be able to check the availability of this
+			   feature without causing problems. To do this, one needs to
+			   set up a 512-byte area of memory to save the SSE state to,
+			   using fxsave, and then one needs to inspect bytes 28 through
+			   31 for the MXCSR_MASK value. If bit 6 is set, DAZ is
+			   supported, otherwise, it isn't.
+			*/
 
 #ifndef HAVE_POSIX_MEMALIGN
 # ifdef PLATFORM_WINDOWS
-		fxbuf = (char **) _aligned_malloc (sizeof (char *), 16);
-		assert (fxbuf);
-		*fxbuf = (char *) _aligned_malloc (512, 16);
-		assert (*fxbuf);
+			fxbuf = (char **) _aligned_malloc (sizeof (char *), 16);
+			assert (fxbuf);
+			*fxbuf = (char *) _aligned_malloc (512, 16);
+			assert (*fxbuf);
 # else
 # warning using default malloc for aligned memory
-		fxbuf = (char **) malloc (sizeof (char *));
-		assert (fxbuf);
-		*fxbuf = (char *) malloc (512);
-		assert (*fxbuf);
+			fxbuf = (char **) malloc (sizeof (char *));
+			assert (fxbuf);
+			*fxbuf = (char *) malloc (512);
+			assert (*fxbuf);
 # endif
 #else
-	(void) posix_memalign ((void **) &fxbuf, 16, sizeof (char *));
-	assert (fxbuf);
-	(void) posix_memalign ((void **) fxbuf, 16, 512);
-	assert (*fxbuf);
+			(void) posix_memalign ((void **) &fxbuf, 16, sizeof (char *));
+			assert (fxbuf);
+			(void) posix_memalign ((void **) fxbuf, 16, 512);
+			assert (*fxbuf);
 #endif
 
-	memset (*fxbuf, 0, 512);
+			memset (*fxbuf, 0, 512);
 
 #ifdef COMPILER_MSVC
-	char *buf = *fxbuf;
-	__asm {
-		mov eax, buf
-		fxsave [eax]
-	};
+			char *buf = *fxbuf;
+			__asm {
+				mov eax, buf
+				fxsave [eax]
+			};
 #else
-	asm volatile (
-		"fxsave (%0)"
-		:
-		: "r" (*fxbuf)
-		: "memory"
-		);
+			asm volatile (
+				"fxsave (%0)"
+				:
+				: "r" (*fxbuf)
+				: "memory"
+				);
 #endif
 
-	uint32_t mxcsr_mask = *((uint32_t*) &((*fxbuf)[28]));
+			uint32_t mxcsr_mask = *((uint32_t*) &((*fxbuf)[28]));
 
-	/* if the mask is zero, set its default value (from intel specs) */
+			/* if the mask is zero, set its default value (from intel specs) */
 
-	if (mxcsr_mask == 0) {
-		mxcsr_mask = 0xffbf;
-	}
+			if (mxcsr_mask == 0) {
+				mxcsr_mask = 0xffbf;
+			}
 
-	if (mxcsr_mask & (1<<6)) {
-		_flags = Flags (_flags | HasDenormalsAreZero);
-	}
+			if (mxcsr_mask & (1<<6)) {
+				_flags = Flags (_flags | HasDenormalsAreZero);
+			}
 
 #if !defined HAVE_POSIX_MEMALIGN && defined PLATFORM_WINDOWS
-	_aligned_free (*fxbuf);
-	_aligned_free (fxbuf);
+			_aligned_free (*fxbuf);
+			_aligned_free (fxbuf);
 #else
-	free (*fxbuf);
-	free (fxbuf);
+			free (*fxbuf);
+			free (fxbuf);
 #endif
-	}
+		}
 #endif
+
+		/* finally get the CPU brand */
+
+		__cpuid (cpu_info, 0x80000000);
+
+		const int parameter_end = 0x80000004;
+		string cpu_brand;
+
+		if (cpu_info[0] >= parameter_end) {
+			char* cpu_string_ptr = cpu_string;
+
+			for (int parameter = 0x80000002; parameter <= parameter_end &&
+				     cpu_string_ptr < &cpu_string[sizeof(cpu_string)]; parameter++) {
+				__cpuid(cpu_info, parameter);
+				memcpy(cpu_string_ptr, cpu_info, sizeof(cpu_info));
+				cpu_string_ptr += sizeof(cpu_info);
+			}
+			cpu_brand.assign(cpu_string, cpu_string_ptr - cpu_string);
+			info << string_compose (_("CPU brand: %1"), cpu_brand) << endmsg;
+		}
+	}
 }
 
 FPU::~FPU ()
diff --git a/libs/pbd/pbd/fpu.h b/libs/pbd/pbd/fpu.h
index 260cf4db85..4ab1a83561 100644
--- a/libs/pbd/pbd/fpu.h
+++ b/libs/pbd/pbd/fpu.h
@@ -35,9 +35,10 @@ class LIBPBD_API FPU {
 	};
 
   public:
-	FPU ();
 	~FPU ();
 
+	static FPU* instance();
+
 	bool has_flush_to_zero () const { return _flags & HasFlushToZero; }
 	bool has_denormals_are_zero () const { return _flags & HasDenormalsAreZero; }
 	bool has_sse () const { return _flags & HasSSE; }
@@ -46,6 +47,10 @@ class LIBPBD_API FPU {
 
   private:
 	Flags _flags;
+
+	static FPU* _instance;
+
+	FPU ();
 };
 
 }
-- 
2.30.2
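
Notes on the techniques used in this patch follow, each with a standalone sketch. The sketches assume GCC or Clang on x86/x86-64 and use the <cpuid.h> helper __get_cpuid() rather than the patch's hand-rolled __cpuid(); all helper names below (xgetbv0, os_supports_avx, cpu_has_daz, setup_fpu) are illustrative, not part of the patch.

First, the vendor string. CPUID leaf 0 returns the highest supported basic leaf in EAX and the 12-byte vendor string ("GenuineIntel", "AuthenticAMD", ...) split across EBX, EDX, ECX, in that order; that register order is why the constructor swaps cpu_info[2] and cpu_info[3] before the memcpy. (The pushl/popl %%ebx dance in the patch's own __cpuid exists because in 32-bit PIC code %ebx holds the GOT pointer and cannot simply be clobbered.)

    // sketch: read the CPUID vendor string (GCC/Clang, x86 only)
    #include <cpuid.h>
    #include <cstdio>
    #include <cstring>

    int main ()
    {
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        if (!__get_cpuid (0, &eax, &ebx, &ecx, &edx)) {
            return 1;
        }

        /* EBX, EDX, ECX hold the vendor string, in that order */
        memcpy (vendor + 0, &ebx, 4);
        memcpy (vendor + 4, &edx, 4);
        memcpy (vendor + 8, &ecx, 4);
        vendor[12] = '\0';

        printf ("vendor: %s, max basic leaf: %u\n", vendor, eax);
        return 0;
    }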
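
Second, the AVX test. Three conditions must hold before AVX instructions can be used: CPUID leaf 1 must advertise AVX, it must advertise OSXSAVE (the OS has enabled XSAVE, so executing xgetbv will not fault), and XCR0, read via xgetbv with ECX = 0 (the patch's _XCR_XFEATURE_ENABLED_MASK), must have bits 1 and 2 set, meaning the OS saves and restores XMM and YMM state across context switches. For reference, Intel's manuals document ECX bit 27 as OSXSAVE and bit 28 as AVX; the patch's inline comments label them the other way round, but since the code requires both bits the combined test behaves the same. A sketch of the same check:

    // sketch: the three-part AVX availability test (GCC/Clang, x86)
    #include <cpuid.h>
    #include <cstdint>

    /* xgetbv with ECX = 0 reads XCR0, the XFEATURE_ENABLED_MASK */
    static uint64_t xgetbv0 ()
    {
        uint32_t eax, edx;
        __asm__ volatile ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
        return (static_cast<uint64_t> (edx) << 32) | eax;
    }

    bool os_supports_avx ()
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)) {
            return false;
        }

        bool osxsave = ecx & (1u << 27); /* OS has enabled XSAVE */
        bool avx     = ecx & (1u << 28); /* CPU implements AVX */

        /* only execute xgetbv once OSXSAVE says it won't fault;
           XCR0 bits 1 and 2 (0x6): XMM and YMM state are saved
           and restored on context switches */
        return osxsave && avx && ((xgetbv0 () & 0x6) == 0x6);
    }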
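
Third, the DAZ probe, which is the subtlest part. MXCSR's DAZ bit (bit 6) is reserved on early SSE processors, and writing a reserved MXCSR bit raises a general protection fault, so you cannot simply try it. Instead, fxsave dumps the FPU/SSE state into a 16-byte-aligned, 512-byte image, and bytes 28 through 31 of that image hold MXCSR_MASK: bit 6 set there means DAZ can be written safely. A compact sketch; it uses C++11 alignas for the save area where the patch heap-allocates via posix_memalign/_aligned_malloc, presumably to stay friendly to pre-C++11 compilers:

    // sketch: probe MXCSR_MASK for DAZ support via fxsave (GCC/Clang, x86)
    #include <cstdint>
    #include <cstring>

    bool cpu_has_daz ()
    {
        /* fxsave requires a 16-byte-aligned, 512-byte save area */
        alignas (16) unsigned char fxbuf[512];

        /* zero it first so a stale value cannot masquerade as the mask */
        memset (fxbuf, 0, sizeof (fxbuf));

        __asm__ volatile ("fxsave (%0)" : : "r" (fxbuf) : "memory");

        uint32_t mxcsr_mask;
        memcpy (&mxcsr_mask, fxbuf + 28, 4); /* bytes 28..31 of the image */

        if (mxcsr_mask == 0) {
            mxcsr_mask = 0xffbf; /* default mask, per Intel's specs */
        }

        return mxcsr_mask & (1u << 6); /* bit 6 set => DAZ supported */
    }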
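
Fourth, the brand string. Extended leaf 0x80000000 reports the highest supported extended leaf in EAX; if it reaches 0x80000004, leaves 0x80000002 through 0x80000004 each return 16 bytes of the 48-byte, NUL-padded brand string, already in readable order (no register swapping, unlike the vendor string). The constructor's loop, as a standalone sketch:

    // sketch: read the 48-byte CPU brand string (GCC/Clang, x86)
    #include <cpuid.h>
    #include <cstdio>
    #include <cstring>

    int main ()
    {
        unsigned int regs[4];
        char brand[49] = { 0 }; /* 48 bytes plus a forced NUL */

        /* EAX of leaf 0x80000000 = highest extended leaf available */
        if (!__get_cpuid (0x80000000, &regs[0], &regs[1], &regs[2], &regs[3])
            || regs[0] < 0x80000004) {
            return 1; /* brand string not supported */
        }

        for (unsigned int i = 0; i < 3; ++i) {
            __get_cpuid (0x80000002 + i, &regs[0], &regs[1], &regs[2], &regs[3]);
            memcpy (brand + 16 * i, regs, 16);
        }

        printf ("brand: %s\n", brand);
        return 0;
    }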
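
Finally, the API change: the FPU constructor becomes private and callers move to the lazily-constructed FPU::instance() singleton, so the cpuid/fxsave probing runs once per process. How calling code would look after this patch, as a sketch (setup_fpu and the MXCSR remarks are illustrative):

    // sketch: querying the singleton from client code
    #include "pbd/fpu.h"

    void setup_fpu ()
    {
        PBD::FPU* fpu = PBD::FPU::instance ();

        if (fpu->has_flush_to_zero ()) {
            /* safe to set the FTZ bit in MXCSR */
        }
        if (fpu->has_denormals_are_zero ()) {
            /* safe to set the DAZ bit in MXCSR */
        }
    }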