libs/soundtouch/mmx_win.cpp

   1 ////////////////////////////////////////////////////////////////////////////////
   2 ///
   3 /// Win32 version of the MMX optimized routines. All MMX optimized functions
   4 /// have been gathered into this single source code file, regardless to their
   5 /// class or original source code file, in order to ease porting the library
   6 /// to other compiler and processor platforms.
   7 ///
   8 /// This file is to be compiled in Windows platform with Microsoft Visual C++
   9 /// Compiler. Please see 'mmx_gcc.cpp' for the gcc compiler version for all
  10 /// GNU platforms.
  11 ///
  12 /// Author        : Copyright (c) Olli Parviainen
  13 /// Author e-mail : oparviai @ iki.fi
  14 /// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
  15 ///
  16 ////////////////////////////////////////////////////////////////////////////////
  17 //
  18 // Last changed  : $Date$
  19 // File revision : $Revision$
  20 //
  21 // $Id$
  22 //
  23 ////////////////////////////////////////////////////////////////////////////////
  24 //
  25 // License :
  26 //
  27 //  SoundTouch audio processing library
  28 //  Copyright (c) Olli Parviainen
  29 //
  30 //  This library is free software; you can redistribute it and/or
  31 //  modify it under the terms of the GNU Lesser General Public
  32 //  License as published by the Free Software Foundation; either
  33 //  version 2.1 of the License, or (at your option) any later version.
  34 //
  35 //  This library is distributed in the hope that it will be useful,
  36 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  37 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  38 //  Lesser General Public License for more details.
  39 //
  40 //  You should have received a copy of the GNU Lesser General Public
  41 //  License along with this library; if not, write to the Free Software
  42 //  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  43 //
  44 ////////////////////////////////////////////////////////////////////////////////
  45
  46 #include "STTypes.h"
  47
  48 #ifndef WIN32
  49 #error "wrong platform - this source code file is exclusively for Win32 platform"
  50 #endif
  51
  52 using namespace soundtouch;
  53
  54 #ifdef ALLOW_MMX
  55 // MMX routines available only with integer sample type
  56
  57 //////////////////////////////////////////////////////////////////////////////
  58 //
  59 // implementation of MMX optimized functions of class 'TDStretchMMX'
  60 //
  61 //////////////////////////////////////////////////////////////////////////////
  62
  63 #include "TDStretch.h"
  64 #include <limits.h>
  65
  66 // these are declared in 'TDStretch.cpp'
  67 extern int scanOffsets[4][24];
  68
  69 // Calculates cross correlation of two buffers
  70 long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
  71 {
  72     long corr;
  73     uint local_overlapLength = overlapLength;
  74     uint local_overlapDividerBits = overlapDividerBits;
  75
  76     _asm
  77     {
  78         ; Calculate cross-correlation between the tempOffset and tmpbid_buffer.
  79         ;
  80         ; Process 4 parallel batches of 2 * stereo samples each during one
  81         ; round to improve CPU-level parallellization.
  82         ;
  83         ; load address of sloped pV2 buffer to eax
  84         ; load address of mixing point of the sample data buffer to ebx
  85         ; load counter to ecx = overlapLength / 8 - 1
  86         ; empty the mm0
  87         ;
  88         ; prepare to the first round by loading
  89         ; load mm1 = eax[0]
  90         ; load mm2 = eax[1];
  91
  92         mov         eax, dword ptr pV1
  93         mov         ebx, dword ptr pV2
  94
  95         movq        mm1, qword ptr [eax]
  96         mov         ecx, local_overlapLength
  97
  98         movq        mm2, qword ptr [eax+8]
  99         shr         ecx, 3
 100
 101         pxor        mm0, mm0
 102         sub         ecx, 1
 103
 104         movd        mm5, local_overlapDividerBits
 105
 106     loop1:
 107         ; multiply-add mm1 = mm1 * ebx[0]
 108         ; multiply-add mm2 = mm2 * ebx[1]
 109         ;
 110         ; add mm2 += mm1
 111         ; mm2 >>= mm5 (=overlapDividerBits)
 112         ; add mm0 += mm2
 113         ;
 114         ; load mm3 = eax[2]
 115         ; multiply-add mm3 = mm3 * ebx[2]
 116         ;
 117         ; load mm4 = eax[3]
 118         ; multiply-add mm4 = mm4 * ebx[3]
 119         ;
 120         ; add mm3 += mm4
 121         ; mm3 >>= mm5 (=overlapDividerBits)
 122         ; add mm0 += mm3
 123         ;
 124         ; add eax += 4;
 125         ; add ebx += 4
 126         ; load mm1 = eax[0] (~eax[4])
 127         ; load mm2 = eax[1] (~eax[5])
 128         ;
 129         ; loop
 130
 131         pmaddwd     mm1, qword ptr [ebx]
 132         movq        mm3, qword ptr [eax+16]
 133
 134         pmaddwd     mm2, qword ptr [ebx+8]
 135         movq        mm4, qword ptr [eax+24]
 136
 137         pmaddwd     mm3, qword ptr [ebx+16]
 138         paddd       mm2, mm1
 139
 140         pmaddwd     mm4, qword ptr [ebx+24]
 141         movq        mm1, qword ptr [eax+32]
 142
 143         psrad       mm2, mm5
 144         add         eax, 32
 145
 146         paddd       mm3, mm4
 147         paddd       mm0, mm2
 148
 149         movq        mm2, qword ptr [eax+8]
 150         psrad       mm3, mm5
 151
 152         add         ebx, 32
 153         paddd       mm0, mm3
 154
 155         dec         ecx
 156         jnz         loop1
 157
 158         ; Finalize the last partial loop:
 159
 160         movq        mm3, qword ptr [eax+16]
 161         pmaddwd     mm1, qword ptr [ebx]
 162
 163         movq        mm4, qword ptr [eax+24]
 164         pmaddwd     mm2, qword ptr [ebx+8]
 165
 166         pmaddwd     mm3, qword ptr [ebx+16]
 167         paddd       mm2, mm1
 168
 169         pmaddwd     mm4, qword ptr [ebx+24]
 170         psrad       mm2, mm5
 171
 172         paddd       mm3, mm4
 173         paddd       mm0, mm2
 174
 175         psrad       mm3, mm5
 176         paddd       mm0, mm3
 177
 178         ; copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
 179         ; and finally store the result into the variable "corr"
 180
 181         movq        mm1, mm0
 182         psrlq       mm1, 32
 183         paddd       mm0, mm1
 184         movd        corr, mm0
 185     }
 186     return corr;
 187
 188     // Note: Warning about the missing EMMS instruction is harmless
 189     // as it'll be called elsewhere.
 190 }
 191
 192
 193
 194 void TDStretchMMX::clearCrossCorrState()
 195 {
 196     _asm EMMS;
 197 }
 198
 199
 200
 201
 202
 203 // MMX-optimized version of the function overlapStereo
 204 void TDStretchMMX::overlapStereo(short *output, const short *input) const
 205 {
 206     short *local_midBuffer = pMidBuffer;
 207     uint local_overlapLength = overlapLength;
 208     uint local_overlapDividerBits = overlapDividerBits;
 209
 210     _asm
 211     {
 212         ; load sliding mixing value counter to mm6 and mm7
 213         ; load counter value to ecx = overlapLength / 4
 214         ; load divider-shifter value to esi
 215         ; load mixing value adder to mm5
 216         ; load address of midBuffer to eax
 217         ; load address of inputBuffer added with ovlOffset to ebx
 218         ; load address of end of the outputBuffer to edx
 219
 220         mov         eax, local_overlapLength        ; ecx = 0x0000 OVL_
 221         mov         edi, 0x0002fffe     ; ecx = 0x0002 fffe
 222
 223         mov            esi, local_overlapDividerBits
 224         movd        mm6, eax            ; mm6 = 0x0000 0000 0000 OVL_
 225
 226         mov         ecx, eax;
 227         sub         eax, 1
 228
 229         punpckldq   mm6, mm6            ; mm6 = 0x0000 OVL_ 0000 OVL_
 230         mov         edx, output
 231
 232         or          eax, 0x00010000     ; eax = 0x0001 overlapLength-1
 233         mov         ebx, dword ptr input
 234
 235         movd        mm5, edi            ; mm5 = 0x0000 0000 0002 fffe
 236         movd        mm7, eax            ; mm7 = 0x0000 0000 0001 01ff
 237
 238         mov         eax, dword ptr local_midBuffer
 239         punpckldq   mm5, mm5            ; mm5 = 0x0002 fffe 0002 fffe
 240
 241         shr         ecx, 2              ; ecx = overlapLength / 2
 242         punpckldq   mm7, mm7            ; mm7 = 0x0001 01ff 0001 01ff
 243
 244     loop1:
 245         ; Process two parallel batches of 2+2 stereo samples during each round
 246         ; to improve CPU-level parallellization.
 247         ;
 248         ; Load [eax] into mm0 and mm1
 249         ; Load [ebx] into mm3
 250         ; unpack words of mm0, mm1 and mm3 into mm0 and mm1
 251         ; multiply-add mm0*mm6 and mm1*mm7, store results into mm0 and mm1
 252         ; divide mm0 and mm1 by 512 (=right-shift by overlapDividerBits)
 253         ; pack the result into mm0 and store into [edx]
 254         ;
 255         ; Load [eax+8] into mm2 and mm3
 256         ; Load [ebx+8] into mm4
 257         ; unpack words of mm2, mm3 and mm4 into mm2 and mm3
 258         ; multiply-add mm2*mm6 and mm3*mm7, store results into mm2 and mm3
 259         ; divide mm2 and mm3 by 512 (=right-shift by overlapDividerBits)
 260         ; pack the result into mm2 and store into [edx+8]
 261
 262
 263         movq        mm0, qword ptr [eax]    ; mm0 = m1l m1r m0l m0r
 264         add         edx, 16
 265
 266         movq        mm3, qword ptr [ebx]    ; mm3 = i1l i1r i0l i0r
 267         movq        mm1, mm0                ; mm1 = m1l m1r m0l m0r
 268
 269         movq        mm2, qword ptr [eax+8]  ; mm2 = m3l m3r m2l m2r
 270         punpcklwd   mm0, mm3                ; mm0 = i0l m0l i0r m0r
 271
 272         movq        mm4, qword ptr [ebx+8]  ; mm4 = i3l i3r i2l i2r
 273         punpckhwd   mm1, mm3                ; mm1 = i1l m1l i1r m1r
 274
 275         movq        mm3, mm2                ; mm3 = m3l m3r m2l m2r
 276         punpcklwd   mm2, mm4                ; mm2 = i2l m2l i2r m2r
 277
 278         pmaddwd     mm0, mm6                ; mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60
 279         punpckhwd   mm3, mm4                ; mm3 = i3l m3l i3r m3r
 280
 281         movd        mm4, esi                ; mm4 = overlapDividerBits
 282
 283         pmaddwd     mm1, mm7                ; mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70
 284         paddw       mm6, mm5
 285
 286         paddw       mm7, mm5
 287         psrad       mm0, mm4                ; mmo >>= overlapDividerBits
 288
 289         pmaddwd     mm2, mm6                ; mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60
 290         psrad       mm1, mm4                ; mm1 >>= overlapDividerBits
 291
 292         pmaddwd     mm3, mm7                ; mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70
 293         psrad       mm2, mm4                ; mm2 >>= overlapDividerBits
 294
 295         packssdw    mm0, mm1                ; mm0 = mm1h mm1l mm0h mm0l
 296         psrad       mm3, mm4                ; mm3 >>= overlapDividerBits
 297
 298         add         eax, 16
 299         paddw       mm6, mm5
 300
 301         packssdw    mm2, mm3                ; mm2 = mm2h mm2l mm3h mm3l
 302         paddw       mm7, mm5
 303
 304         movq        qword ptr [edx-16], mm0
 305         add         ebx, 16
 306
 307         movq        qword ptr [edx-8], mm2
 308         dec         ecx
 309
 310         jnz         loop1
 311
 312         emms
 313     }
 314 }
 315
 316
 317 //////////////////////////////////////////////////////////////////////////////
 318 //
 319 // implementation of MMX optimized functions of class 'FIRFilter'
 320 //
 321 //////////////////////////////////////////////////////////////////////////////
 322
 323 #include "FIRFilter.h"
 324
 325
 326 FIRFilterMMX::FIRFilterMMX() : FIRFilter()
 327 {
 328     filterCoeffsUnalign = NULL;
 329 }
 330
 331
 332 FIRFilterMMX::~FIRFilterMMX()
 333 {
 334     delete[] filterCoeffsUnalign;
 335 }
 336
 337
 338 // (overloaded) Calculates filter coefficients for MMX routine
 339 void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
 340 {
 341     uint i;
 342     FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
 343
 344     // Ensure that filter coeffs array is aligned to 16-byte boundary
 345     delete[] filterCoeffsUnalign;
 346     filterCoeffsUnalign = new short[2 * newLength + 8];
 347     filterCoeffsAlign = (short *)(((uint)filterCoeffsUnalign + 15) & -16);
 348
 349     // rearrange the filter coefficients for mmx routines
 350     for (i = 0;i < length; i += 4)
 351     {
 352         filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
 353         filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
 354         filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
 355         filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
 356
 357         filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
 358         filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
 359         filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
 360         filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
 361     }
 362 }
 363
 364
 365
 366 // mmx-optimized version of the filter routine for stereo sound
 367 uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
 368 {
 369     // Create stack copies of the needed member variables for asm routines :
 370     uint local_length = length;
 371     uint local_lengthDiv8 = lengthDiv8;
 372     uint local_resultDivider = resultDivFactor;
 373     short *local_filterCoeffs = (short*)filterCoeffsAlign;
 374
 375     if (local_length < 2) return 0;
 376
 377     _asm
 378     {
 379         ; Load (num_samples-aa_filter_length)/2 to edi as a i
 380         ; Load a pointer to samples to esi
 381         ; Load a pointer to destination to edx
 382
 383         mov         edi, numSamples
 384         mov         esi, dword ptr src
 385         sub         edi, local_length
 386         mov         edx, dword ptr dest
 387         sar         edi, 1
 388
 389         ; Load filter length/8 to ecx
 390         ; Load pointer to samples from esi to ebx
 391         ; Load counter from edi to ecx
 392         ; Load [ebx] to mm3
 393         ; Load pointer to filter coefficients to eax
 394 loop1:
 395         mov         ebx, esi
 396         pxor        mm0, mm0
 397
 398         mov         ecx, local_lengthDiv8
 399         pxor        mm7, mm7
 400
 401         movq        mm1, [ebx]              ; mm1 = l1 r1 l0 r0
 402         mov         eax, local_filterCoeffs
 403 loop2:
 404
 405         movq        mm2, [ebx+8]            ; mm2 = l3 r3 l2 r2
 406         movq        mm4, mm1                ; mm4 = l1 r1 l0 r0
 407
 408         movq        mm3, [ebx+16]           ; mm3 = l5 r5 l4 r4
 409         punpckhwd   mm1, mm2                ; mm1 = l3 l1 r3 r1
 410
 411         movq        mm6, mm2                ; mm6 = l3 r3 l2 r2
 412         punpcklwd   mm4, mm2                ; mm4 = l2 l0 r2 r0
 413
 414         movq        mm2, qword ptr [eax]    ; mm2 = f2 f0 f2 f0
 415         movq        mm5, mm1                ; mm5 = l3 l1 r3 r1
 416
 417         punpcklwd   mm6, mm3                ; mm6 = l4 l2 r4 r2
 418         pmaddwd     mm4, mm2                ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0
 419
 420         pmaddwd     mm5, mm2                ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0
 421         movq        mm2, qword ptr [eax+8]  ; mm2 = f3 f1 f3 f1
 422
 423         paddd       mm0, mm4                ; mm0 += s02*f02
 424         movq        mm4, mm3                ; mm4 = l1 r1 l0 r0
 425
 426         pmaddwd     mm1, mm2                ; mm1 = l3*f3+l1*f1 r3*f3+l1*f1
 427         paddd       mm7, mm5                ; mm7 += s13*f02
 428
 429         pmaddwd     mm6, mm2                ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1
 430         movq        mm2, [ebx+24]           ; mm2 = l3 r3 l2 r2
 431
 432         paddd       mm0, mm1                ; mm0 += s31*f31
 433         movq        mm1, [ebx+32]           ; mm1 = l5 r5 l4 r4
 434
 435         paddd       mm7, mm6                ; mm7 += s42*f31
 436         punpckhwd   mm3, mm2                ; mm3 = l3 l1 r3 r1
 437
 438         movq        mm6, mm2                ; mm6 = l3 r3 l2 r2
 439         punpcklwd   mm4, mm2                ; mm4 = l2 l0 r2 r0
 440
 441         movq        mm2, qword ptr [eax+16] ; mm2 = f2 f0 f2 f0
 442         movq        mm5, mm3                ; mm5 = l3 l1 r3 r1
 443
 444         punpcklwd   mm6, mm1                ; mm6 = l4 l2 r4 r2
 445         add         eax, 32
 446
 447         pmaddwd     mm4, mm2                ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0
 448         add         ebx, 32
 449
 450         pmaddwd     mm5, mm2                ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0
 451         movq        mm2, qword ptr [eax-8]  ; mm2 = f3 f1 f3 f1
 452
 453         paddd       mm0, mm4                ; mm0 += s02*f02
 454         pmaddwd     mm3, mm2                ; mm3 = l3*f3+l1*f1 r3*f3+l1*f1
 455
 456         paddd       mm7, mm5                ; mm7 += s13*f02
 457         pmaddwd     mm6, mm2                ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1
 458
 459         paddd       mm0, mm3                ; mm0 += s31*f31
 460         paddd       mm7, mm6                ; mm7 += s42*f31
 461
 462         dec         ecx
 463         jnz         loop2
 464
 465         ; Divide mm0 and mm7 by 8192 (= right-shift by 13),
 466         ; pack and store to [edx]
 467         movd        mm4, local_resultDivider;
 468
 469         psrad       mm0, mm4                ; divider the result
 470
 471         add         edx, 8
 472         psrad       mm7, mm4                ; divider the result
 473
 474         add         esi, 8
 475         packssdw    mm0, mm7
 476
 477         movq        qword ptr [edx-8], mm0
 478         dec         edi
 479
 480         jnz         loop1
 481
 482         emms
 483     }
 484     return (numSamples & 0xfffffffe) - local_length;
 485 }
 486
 487 #endif  // ALLOW_MMX