libs/soundtouch/mmx_gcc.cpp

   1 ////////////////////////////////////////////////////////////////////////////////
   2 ///
   3 /// gcc version of the MMX optimized routines. All MMX optimized functions
   4 /// have been gathered into this single source code file, regardless to their
   5 /// class or original source code file, in order to ease porting the library
   6 /// to other compiler and processor platforms.
   7 ///
   8 /// This file is to be compiled on any platform with the GNU C compiler.
   9 /// Compiler. Please see 'mmx_win.cpp' for the x86 Windows version of this
  10 /// file.
  11 ///
  12 /// Author        : Copyright (c) Olli Parviainen
  13 /// Author e-mail : oparviai @ iki.fi
  14 /// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
  15 ///
  16 ////////////////////////////////////////////////////////////////////////////////
  17 //
  18 // Last changed  : $Date$
  19 // File revision : $Revision$
  20 //
  21 // $Id$
  22 //
  23 ////////////////////////////////////////////////////////////////////////////////
  24 //
  25 // License :
  26 //
  27 //  SoundTouch audio processing library
  28 //  Copyright (c) Olli Parviainen
  29 //
  30 //  This library is free software; you can redistribute it and/or
  31 //  modify it under the terms of the GNU Lesser General Public
  32 //  License as published by the Free Software Foundation; either
  33 //  version 2.1 of the License, or (at your option) any later version.
  34 //
  35 //  This library is distributed in the hope that it will be useful,
  36 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  37 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  38 //  Lesser General Public License for more details.
  39 //
  40 //  You should have received a copy of the GNU Lesser General Public
  41 //  License along with this library; if not, write to the Free Software
  42 //  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  43 //
  44 ////////////////////////////////////////////////////////////////////////////////
  45
  46 #include <stdexcept>
  47 #include <string>
  48 #include "cpu_detect.h"
  49
  50 #ifndef __GNUC__
  51 #error "wrong platform - this source code file is for the GNU C compiler."
  52 #endif
  53
  54 using namespace std;
  55 using namespace soundtouch;
  56
  57
  58 #ifdef ALLOW_MMX
  59 // MMX routines available only with integer sample type
  60
  61 //////////////////////////////////////////////////////////////////////////////
  62 //
  63 // implementation of MMX optimized functions of class 'TDStretch'
  64 //
  65 // NOTE: ebx in gcc 3.x is not preserved if -fPIC and -DPIC
  66 //  gcc-3.4 correctly flags this error and wont let you continue.
  67 //  gcc-2.95 preserves esi correctly
  68 //
  69 //////////////////////////////////////////////////////////////////////////////
  70
  71 #include "TDStretch.h"
  72 #include <limits.h>
  73
  74 // these are declared in 'TDStretch.cpp'
  75 extern int scanOffsets[4][24];
  76
  77 // Calculates cross correlation of two buffers
  78 long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
  79 {
  80 #ifdef __i386__
  81     int corr;
  82     uint local_overlapLength = overlapLength;
  83     uint local_overlapDividerBits = overlapDividerBits;
  84
  85     asm volatile(
  86         // Calculate cross-correlation between the tempOffset and tmpbid_buffer.
  87
  88         // Process 4 parallel batches of 2 * stereo samples each during one
  89         // round to improve CPU-level parallellization.
  90
  91         // load address of sloped pV2 buffer to eax
  92         // load address of mixing point of the sample data buffer to edi
  93         // load counter to ecx = overlapLength / 8 - 1
  94         // empty the mm0
  95
  96         // prepare to the first round by loading
  97         // load mm1 = eax[0]
  98         // load mm2 = eax[1];
  99
 100         "\n\tmovl        %1, %%eax"
 101         "\n\tmovl        %2, %%edi"
 102
 103         "\n\tmovq        (%%eax), %%mm1"
 104         "\n\tmovl        %3, %%ecx"
 105
 106         "\n\tmovq        8(%%eax), %%mm2"
 107         "\n\tshr         $3, %%ecx"
 108
 109         "\n\tpxor        %%mm0, %%mm0"
 110         "\n\tsub         $1, %%ecx"
 111
 112         "\n\tmovd        %4, %%mm5"
 113
 114         "\n1:"
 115         // multiply-add mm1 = mm1 * edi[0]
 116         // multiply-add mm2 = mm2 * edi[1]
 117         //
 118         // add mm2 += mm1
 119         // mm2 >>= mm5 (=overlapDividerBits)
 120         // add mm0 += mm2
 121         //
 122         // load mm3 = eax[2]
 123         // multiply-add mm3 = mm3 * edi[2]
 124         //
 125         // load mm4 = eax[3]
 126         // multiply-add mm4 = mm4 * edi[3]
 127         //
 128         // add mm3 += mm4
 129         // mm3 >>= mm5 (=overlapDividerBits)
 130         // add mm0 += mm3
 131         //
 132         // add eax += 4
 133         // add edi += 4
 134         // load mm1 = eax[0] (~eax[4])
 135         // load mm2 = eax[1] (~eax[5])
 136         //
 137         // loop
 138
 139         "\n\tpmaddwd     (%%edi), %%mm1"   // qword ptr [edi]
 140         "\n\tmovq        16(%%eax), %%mm3" // qword ptr [eax+16]
 141
 142         "\n\tpmaddwd     8(%%edi), %%mm2"  // qword ptr [edi+8]
 143         "\n\tmovq        24(%%eax), %%mm4" // qword ptr [eax+24]
 144
 145         "\n\tpmaddwd     16(%%edi), %%mm3" // qword ptr [edi+16]
 146         "\n\tpaddd       %%mm1, %%mm2"
 147
 148         "\n\tpmaddwd     24(%%edi), %%mm4" // qword ptr [edi+24]
 149         "\n\tmovq        32(%%eax), %%mm1" // qword ptr [eax+32]
 150
 151         "\n\tpsrad       %%mm5, %%mm2"
 152         "\n\tadd         $32, %%eax"
 153
 154         "\n\tpaddd       %%mm4, %%mm3"
 155         "\n\tpaddd       %%mm2, %%mm0"
 156
 157         "\n\tmovq        8(%%eax), %%mm2"  // qword ptr [eax+8]
 158         "\n\tpsrad       %%mm5, %%mm3"
 159
 160         "\n\tadd         $32, %%edi"
 161         "\n\tpaddd       %%mm3, %%mm0"
 162
 163         "\n\tdec         %%ecx"
 164         "\n\tjnz         1b"
 165
 166         // Finalize the last partial loop:
 167
 168         "\n\tmovq        16(%%eax), %%mm3" // qword ptr [eax+16]
 169         "\n\tpmaddwd     (%%edi), %%mm1"   // qword ptr [edi]
 170
 171         "\n\tmovq        24(%%eax), %%mm4" // qword ptr [eax+24]
 172         "\n\tpmaddwd     8(%%edi), %%mm2"  // qword ptr [edi+8]
 173
 174         "\n\tpmaddwd     16(%%edi), %%mm3" // qword ptr [edi+16]
 175         "\n\tpaddd       %%mm1, %%mm2"
 176
 177         "\n\tpmaddwd     24(%%edi), %%mm4" // qword ptr [edi+24]
 178         "\n\tpsrad       %%mm5, %%mm2"
 179
 180         "\n\tpaddd       %%mm4, %%mm3"
 181         "\n\tpaddd       %%mm2, %%mm0"
 182
 183         "\n\tpsrad       %%mm5, %%mm3"
 184         "\n\tpaddd       %%mm3, %%mm0"
 185
 186         // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
 187         // and finally store the result into the variable "corr"
 188
 189         "\n\tmovq        %%mm0, %%mm1"
 190         "\n\tpsrlq       $32, %%mm1"
 191         "\n\tpaddd       %%mm1, %%mm0"
 192         "\n\tmovd        %%mm0, %0"
 193       : "=rm" (corr)
 194       : "rim" (pV1), "rim" (pV2), "rim" (local_overlapLength),
 195         "rim" (local_overlapDividerBits)
 196       : "%ecx", "%eax", "%edi"
 197     );
 198     return corr;
 199
 200     // Note: Warning about the missing EMMS instruction is harmless
 201     // as it'll be called elsewhere.
 202 #else
 203     throw runtime_error("MMX not supported");
 204 #endif
 205 }
 206
 207 void TDStretchMMX::clearCrossCorrState()
 208 {
 209 #ifdef __i386__
 210     asm volatile("EMMS");
 211 #endif
 212 }
 213
 214 // MMX-optimized version of the function overlapStereo
 215 void TDStretchMMX::overlapStereo(short *output, const short *input) const
 216 {
 217 #ifdef __i386__
 218     short *local_midBuffer = pMidBuffer;
 219     uint local_overlapLength = overlapLength;
 220     uint local_overlapDividerBits = overlapDividerBits;
 221
 222     asm volatile(
 223         "\n\t"
 224         // load sliding mixing value counter to mm6 and mm7
 225         // load counter value to ecx = overlapLength / 4
 226         // load divider-shifter value to esi
 227         // load mixing value adder to mm5
 228         // load address of midBuffer to eax
 229         // load address of inputBuffer added with ovlOffset to edi
 230         // load address of end of the outputBuffer to edx
 231         //
 232         // We need to preserve esi, since gcc uses it for the
 233         // stack frame.
 234
 235         "movl        %0, %%eax\n\t"               // ecx = 0x0000 OVL_
 236         "movl        $0x0002fffe, %%edi\n\t"      // ecx = 0x0002 fffe
 237
 238         "movl        %1, %%esi\n\t"
 239         "movd        %%eax, %%mm6\n\t"            // mm6 = 0x0000 0000 0000 OVL_
 240
 241         "movl        %%eax, %%ecx\n\t"
 242         "sub         $1, %%eax\n\t"
 243
 244         "punpckldq   %%mm6, %%mm6\n\t"            // mm6 = 0x0000 OVL_ 0000 OVL_
 245
 246         "or          $0x00010000, %%eax\n\t"      // eax = 0x0001 overlapLength-1
 247
 248         "movd        %%edi, %%mm5\n\t"            // mm5 = 0x0000 0000 0002 fffe
 249         "movd        %%eax, %%mm7\n\t"            // mm7 = 0x0000 0000 0001 01ff
 250
 251         "movl        %3, %%edi\n\t"
 252
 253         "movl        %4, %%eax\n\t"               // dword ptr local_midBuffer
 254         "punpckldq   %%mm5, %%mm5\n\t"            // mm5 = 0x0002 fffe 0002 fffe
 255
 256         "shr         $2, %%ecx\n\t"               // ecx = overlapLength / 2
 257         "punpckldq   %%mm7, %%mm7\n\t"            // mm7 = 0x0001 01ff 0001 01ff
 258
 259         "movl        %2, %%edx\n"
 260
 261         "2:\n\t"
 262         // Process two parallel batches of 2+2 stereo samples during each round
 263         // to improve CPU-level parallellization.
 264         //
 265         // Load [eax] into mm0 and mm1
 266         // Load [edi] into mm3
 267         // unpack words of mm0, mm1 and mm3 into mm0 and mm1
 268         // multiply-add mm0*mm6 and mm1*mm7, store results into mm0 and mm1
 269         // divide mm0 and mm1 by 512 (=right-shift by overlapDividerBits)
 270         // pack the result into mm0 and store into [edx]
 271         //
 272         // Load [eax+8] into mm2 and mm3
 273         // Load [edi+8] into mm4
 274         // unpack words of mm2, mm3 and mm4 into mm2 and mm3
 275         // multiply-add mm2*mm6 and mm3*mm7, store results into mm2 and mm3
 276         // divide mm2 and mm3 by 512 (=right-shift by overlapDividerBits)
 277         // pack the result into mm2 and store into [edx+8]
 278
 279
 280         "movq        (%%eax), %%mm0\n\t"             // mm0 = m1l m1r m0l m0r
 281         "add         $16, %%edx\n\t"
 282
 283         "movq        (%%edi), %%mm3\n\t"             // mm3 = i1l i1r i0l i0r
 284         "movq        %%mm0, %%mm1\n\t"               // mm1 = m1l m1r m0l m0r
 285
 286         "movq        8(%%eax), %%mm2\n\t"            // mm2 = m3l m3r m2l m2r
 287         "punpcklwd   %%mm3, %%mm0\n\t"               // mm0 = i0l m0l i0r m0r
 288
 289         "movq        8(%%edi), %%mm4\n\t"            // mm4 = i3l i3r i2l i2r
 290         "punpckhwd   %%mm3, %%mm1\n\t"               // mm1 = i1l m1l i1r m1r
 291
 292         "movq        %%mm2, %%mm3\n\t"               // mm3 = m3l m3r m2l m2r
 293         "punpcklwd   %%mm4, %%mm2\n\t"               // mm2 = i2l m2l i2r m2r
 294
 295         "pmaddwd     %%mm6, %%mm0\n\t"               // mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60
 296         "punpckhwd   %%mm4, %%mm3\n\t"               // mm3 = i3l m3l i3r m3r
 297
 298         "movd        %%esi, %%mm4\n\t"               // mm4 = overlapDividerBits
 299
 300         "pmaddwd     %%mm7, %%mm1\n\t"               // mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70
 301         "paddw       %%mm5, %%mm6\n\t"
 302
 303         "paddw       %%mm5, %%mm7\n\t"
 304         "psrad       %%mm4, %%mm0\n\t"               // mmo >>= overlapDividerBits
 305
 306         "pmaddwd     %%mm6, %%mm2\n\t"               // mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60
 307         "psrad       %%mm4, %%mm1\n\t"               // mm1 >>= overlapDividerBits
 308
 309         "pmaddwd     %%mm7, %%mm3\n\t"               // mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70
 310         "psrad       %%mm4, %%mm2\n\t"               // mm2 >>= overlapDividerBits
 311
 312         "packssdw    %%mm1, %%mm0\n\t"               // mm0 = mm1h mm1l mm0h mm0l
 313         "psrad       %%mm4, %%mm3\n\t"               // mm3 >>= overlapDividerBits
 314
 315         "add         $16, %%eax\n\t"
 316         "paddw       %%mm5, %%mm6\n\t"
 317
 318         "packssdw    %%mm3, %%mm2\n\t"               // mm2 = mm2h mm2l mm3h mm3l
 319         "paddw       %%mm5, %%mm7\n\t"
 320
 321         "movq        %%mm0, -16(%%edx)\n\t"
 322         "add         $16, %%edi\n\t"
 323
 324         "movq        %%mm2, -8(%%edx)\n\t"
 325         "dec         %%ecx\n\t"
 326
 327         "jnz         2b\n\t"
 328
 329         "emms\n\t"
 330
 331       :
 332       : "rim" (local_overlapLength),
 333         "rim" (local_overlapDividerBits),
 334         "rim" (output),
 335         "rim" (input),
 336         "rim" (local_midBuffer)
 337       /* input */
 338       : "%edi", "%ecx", "%edx", "%eax", "%esi" /* regs */
 339     );
 340 #else
 341     throw runtime_error("MMX not supported");
 342 #endif
 343 }
 344
 345
 346 //////////////////////////////////////////////////////////////////////////////
 347 //
 348 // implementation of MMX optimized functions of class 'FIRFilter'
 349 //
 350 //////////////////////////////////////////////////////////////////////////////
 351
 352 #include "FIRFilter.h"
 353
 354 FIRFilterMMX::FIRFilterMMX() : FIRFilter()
 355 {
 356     filterCoeffsUnalign = NULL;
 357 }
 358
 359
 360 FIRFilterMMX::~FIRFilterMMX()
 361 {
 362     delete[] filterCoeffsUnalign;
 363 }
 364
 365
 366 #if 1
 367 // (overloaded) Calculates filter coefficients for MMX routine
 368 void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
 369 {
 370 #ifdef __i386__
 371     uint i;
 372     FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
 373
 374     // Ensure that filter coeffs array is aligned to 16-byte boundary
 375     delete[] filterCoeffsUnalign;
 376     filterCoeffsUnalign = new short[2 * newLength + 8];
 377     filterCoeffsAlign = (short *)(((uint)filterCoeffsUnalign + 15) & -16);
 378
 379     // rearrange the filter coefficients for mmx routines
 380     for (i = 0;i < length; i += 4)
 381     {
 382         filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
 383         filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
 384         filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
 385         filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
 386
 387         filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
 388         filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
 389         filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
 390         filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
 391     }
 392 #else
 393     throw runtime_error("MMX not supported");
 394 #endif
 395 }
 396
 397
 398
 399 // mmx-optimized version of the filter routine for stereo sound
 400 uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
 401 {
 402 #ifdef __i386__
 403     // Create stack copies of the needed member variables for asm routines :
 404     uint local_length = length;
 405     uint local_lengthDiv8 = lengthDiv8;
 406     uint local_resultDivider = resultDivFactor;
 407     short *local_filterCoeffs = (short*)filterCoeffsAlign;
 408     short *local_src = (short *)src;
 409
 410     asm volatile(
 411         "\n\t"
 412         // Load (num_samples-aa_filter_length)/2 to edi as a i
 413         // Load a pointer to samples to esi
 414         // Load a pointer to destination to edx
 415
 416         "movl        %0, %%edi\n\t"
 417         "subl        %2, %%edi\n\t"
 418         "movl        %3, %%edx\n\t"
 419         "sar         $1, %%edi\n"
 420
 421         // Load filter length/8 to ecx
 422         // Load pointer to samples from esi to ebx
 423         // Load counter from edi to ecx
 424         // Load [ebx] to mm3
 425         // Load pointer to filter coefficients to eax
 426         "3:\n\t"
 427         "movl        %1, %%esi\n\t"
 428         "pxor        %%mm0, %%mm0\n\t"
 429
 430         "movl        %4, %%ecx\n\t"
 431         "pxor        %%mm7, %%mm7\n\t"
 432
 433         "movq        (%%esi), %%mm1\n\t"            // mm1 = l1 r1 l0 r0
 434         "movl        %5, %%eax\n"
 435         "4:\n\t"
 436
 437         "movq        8(%%esi), %%mm2\n\t"           // mm2 = l3 r3 l2 r2
 438         "movq        %%mm1, %%mm4\n\t"              // mm4 = l1 r1 l0 r0
 439
 440         "movq        16(%%esi), %%mm3\n\t"          // mm3 = l5 r5 l4 r4
 441         "punpckhwd   %%mm2, %%mm1\n\t"              // mm1 = l3 l1 r3 r1
 442
 443         "movq        %%mm2, %%mm6\n\t"              // mm6 = l3 r3 l2 r2
 444         "punpcklwd   %%mm2, %%mm4\n\t"              // mm4 = l2 l0 r2 r0
 445
 446         "movq        (%%eax), %%mm2\n\t"            // mm2 = f2 f0 f2 f0
 447         "movq        %%mm1, %%mm5\n\t"              // mm5 = l3 l1 r3 r1
 448
 449         "punpcklwd   %%mm3, %%mm6\n\t"              // mm6 = l4 l2 r4 r2
 450         "pmaddwd     %%mm2, %%mm4\n\t"              // mm4 = l2*f2+l0*f0 r2*f2+r0*f0
 451
 452         "pmaddwd     %%mm2, %%mm5\n\t"              // mm5 = l3*f2+l1*f0 r3*f2+l1*f0
 453         "movq        8(%%eax), %%mm2\n\t"           // mm2 = f3 f1 f3 f1
 454
 455         "paddd       %%mm4, %%mm0\n\t"              // mm0 += s02*f02
 456         "movq        %%mm3, %%mm4\n\t"              // mm4 = l1 r1 l0 r0
 457
 458         "pmaddwd     %%mm2, %%mm1\n\t"              // mm1 = l3*f3+l1*f1 r3*f3+l1*f1
 459         "paddd       %%mm5, %%mm7\n\t"              // mm7 += s13*f02
 460
 461         "pmaddwd     %%mm2, %%mm6\n\t"              // mm6 = l4*f3+l2*f1 r4*f3+f4*f1
 462         "movq        24(%%esi), %%mm2\n\t"          // mm2 = l3 r3 l2 r2
 463
 464         "paddd       %%mm1, %%mm0\n\t"              // mm0 += s31*f31
 465         "movq        32(%%esi), %%mm1\n\t"          // mm1 = l5 r5 l4 r4
 466
 467         "paddd       %%mm6, %%mm7\n\t"              // mm7 += s42*f31
 468         "punpckhwd   %%mm2, %%mm3\n\t"              // mm3 = l3 l1 r3 r1
 469
 470         "movq        %%mm2, %%mm6\n\t"              // mm6 = l3 r3 l2 r2
 471         "punpcklwd   %%mm2, %%mm4\n\t"              // mm4 = l2 l0 r2 r0
 472
 473         "movq        16(%%eax), %%mm2\n\t"          // mm2 = f2 f0 f2 f0
 474         "movq        %%mm3, %%mm5\n\t"              // mm5 = l3 l1 r3 r1
 475
 476         "punpcklwd   %%mm1, %%mm6\n\t"              // mm6 = l4 l2 r4 r2
 477         "add         $32, %%eax\n\t"
 478
 479         "pmaddwd     %%mm2, %%mm4\n\t"              // mm4 = l2*f2+l0*f0 r2*f2+r0*f0
 480         "add         $32, %%esi\n\t"
 481
 482         "pmaddwd     %%mm2, %%mm5\n\t"              // mm5 = l3*f2+l1*f0 r3*f2+l1*f0
 483         "movq        -8(%%eax), %%mm2\n\t"          // mm2 = f3 f1 f3 f1
 484
 485         "paddd       %%mm4, %%mm0\n\t"              // mm0 += s02*f02
 486         "pmaddwd     %%mm2, %%mm3\n\t"              // mm3 = l3*f3+l1*f1 r3*f3+l1*f1
 487
 488         "paddd       %%mm5, %%mm7\n\t"              // mm7 += s13*f02
 489         "pmaddwd     %%mm2, %%mm6\n\t"              // mm6 = l4*f3+l2*f1 r4*f3+f4*f1
 490
 491         "paddd       %%mm3, %%mm0\n\t"              // mm0 += s31*f31
 492         "paddd       %%mm6, %%mm7\n\t"              // mm7 += s42*f31
 493
 494         "dec         %%ecx\n\t"
 495         "jnz         4b\n\t"
 496
 497         // Divide mm0 and mm7 by 8192 (= right-shift by 13),
 498         // pack and store to [edx]
 499         "movd        %6, %%mm4\n\t"
 500
 501         "psrad       %%mm4, %%mm0\n\t"              // divide the result
 502
 503         "add         $8, %%edx\n\t"
 504         "psrad       %%mm4, %%mm7\n\t"              // divide the result
 505
 506         "add         $8, %1\n\t"
 507         "packssdw    %%mm7, %%mm0\n\t"
 508
 509         "movq        %%mm0, -8(%%edx)\n\t"
 510         "dec         %%edi\n\t"
 511
 512         "jnz         3b\n\t"
 513
 514         "emms\n\t"
 515
 516       :
 517       : "rim" (numSamples),
 518         "rim" (local_src),
 519         "rim" (local_length),
 520         "rim" (dest),
 521         "rim" (local_lengthDiv8),
 522         "rim" (local_filterCoeffs),
 523         "rim" (local_resultDivider) /* input */
 524       : "%eax", "%ecx", "%edx", "%edi", "%esi" /* regs */
 525     );
 526     return (numSamples & 0xfffffffe) - local_length;
 527 #else
 528     throw runtime_error("MMX not supported");
 529     return 0;
 530 #endif
 531 }
 532 #endif
 533
 534 #endif // ALLOW_MMX