+/* Storage for the shared FPU object: null until FPU::instance() allocates it, and null again after FPU::destroy(). */
+FPU* FPU::_instance = 0;
+
+#if ( (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) ) // ARCH_X86
+#ifndef PLATFORM_WINDOWS
+
+/* use __cpuid() as the name to match the MSVC/mingw intrinsic.
+ *
+ * Executes the CPUID instruction with `cpuid_leaf' loaded into EAX and
+ * stores the resulting EAX, EBX, ECX and EDX values into regs[0] .. regs[3],
+ * in that order.
+ *
+ * This definition is only compiled when PLATFORM_WINDOWS is not defined.
+ *
+ * NOTE(review): ECX is not loaded before CPUID is issued, so a leaf that
+ * selects a sub-leaf through ECX would see whatever value happens to be
+ * there - confirm that no caller relies on such a leaf.
+ */
+
+static void
+__cpuid(int regs[4], int cpuid_leaf)
+{
+ asm volatile (
+#if defined(__i386__)
+ "pushl %%ebx;\n\t" /* i386 only: EBX is saved by hand here instead of being listed as a clobber */
+#endif
+ "cpuid;\n\t" /* the requested leaf arrives in EAX (input operand "a") */
+ "movl %%eax, (%1);\n\t" /* %1 is the `regs' pointer (input operand "S") */
+ "movl %%ebx, 4(%1);\n\t"
+ "movl %%ecx, 8(%1);\n\t"
+ "movl %%edx, 12(%1);\n\t"
+#if defined(__i386__)
+ "popl %%ebx;\n\t" /* i386 only: restore the EBX value pushed above */
+#endif
+ :"=a" (cpuid_leaf) /* %eax clobbered by CPUID */
+ :"S" (regs), "a" (cpuid_leaf) /* inputs: destination array, leaf number */
+ :
+#if !defined(__i386__)
+ "%ebx", /* not saved by hand on non-i386 builds, so declared as clobbered */
+#endif
+ "%ecx", "%edx", "memory"); /* "memory": the asm writes through `regs' */
+}
+
+#endif /* !PLATFORM_WINDOWS */
+
+#ifndef COMPILER_MSVC
+
+/*
+ * Read the extended control register selected by `xcr' using the XGETBV
+ * instruction and return its value as one 64-bit quantity (EDX supplies
+ * the upper 32 bits, EAX the lower 32 bits).
+ *
+ * On Apple builds the instruction is never issued and 0 is returned.
+ */
+static uint64_t
+_xgetbv (uint32_t xcr)
+{
+#ifdef __APPLE__
+	/* it would be nice to make this work on OS X, but as long as we use
+	   veclib we don't really need to know about SSE/AVX on that platform.
+	*/
+	return 0;
+#else
+	uint32_t low_half;
+	uint32_t high_half;
+
+	/* XGETBV takes the register index in ECX and answers in EDX:EAX */
+	__asm__ volatile ("xgetbv" : "=a" (low_half), "=d" (high_half) : "c" (xcr));
+
+	const uint64_t upper_bits = static_cast<uint64_t>(high_half) << 32;
+	return upper_bits | low_half;
+#endif
+}
+
+#elif _MSC_VER < 1600
+
+/* '_xgetbv()' was only available from VC10 onwards, so this stand-in is
+ * compiled for MSVC builds where _MSC_VER < 1600.
+ *
+ * It always returns 0 and never reads the register: the inline assembly
+ * below sits after the return statement and is kept only as a sketch of
+ * what a real implementation might look like.
+ */
+__declspec(noinline) static uint64_t
+_xgetbv (uint32_t xcr)
+{
+ return 0; // unconditional: nothing after this line is executed
+
+ // N.B. The following would probably work for a pre-VC10 build,
+ // although it might suffer from optimization issues. We'd need
+ // to place this function into its own (unoptimized) source file.
+ __asm {
+ mov ecx, [xcr]
+ __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0 /*xgetbv*/
+ }
+}
+
+#endif /* !COMPILER_MSVC */
+#endif /* ARCH_X86 */
+
+#ifndef _XCR_XFEATURE_ENABLED_MASK /* only when nothing earlier in this translation unit defined it */
+#define _XCR_XFEATURE_ENABLED_MASK 0 /* fallback value; presumably the register index handed to _xgetbv() - confirm at the call site */
+#endif /* _XCR_XFEATURE_ENABLED_MASK */
+
+/*
+ * Return the shared FPU object, allocating it on the first call (or on
+ * the first call after FPU::destroy()).
+ *
+ * No locking is performed here.
+ */
+FPU*
+FPU::instance()
+{
+	if (_instance) {
+		/* already built: hand back the existing object */
+		return _instance;
+	}
+
+	_instance = new FPU;
+	return _instance;
+}
+
+/*
+ * Delete the shared FPU object, if one exists, and reset the pointer so
+ * that a later FPU::instance() call allocates a fresh one.
+ */
+void
+FPU::destroy ()
+{
+	if (_instance) {
+		delete _instance;
+		_instance = 0;
+	}
+}
+