1 ////////////////////////////////////////////////////////////////////////////////
3 /// Win32 version of the MMX optimized routines. All MMX optimized functions
4 /// have been gathered into this single source code file, regardless to their
5 /// class or original source code file, in order to ease porting the library
6 /// to other compiler and processor platforms.
8 /// This file is to be compiled in Windows platform with Microsoft Visual C++
9 /// Compiler. Please see 'mmx_gcc.cpp' for the gcc compiler version for all
12 /// Author : Copyright (c) Olli Parviainen
13 /// Author e-mail : oparviai @ iki.fi
14 /// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
16 ////////////////////////////////////////////////////////////////////////////////
18 // Last changed : $Date$
19 // File revision : $Revision$
23 ////////////////////////////////////////////////////////////////////////////////
27 // SoundTouch audio processing library
28 // Copyright (c) Olli Parviainen
30 // This library is free software; you can redistribute it and/or
31 // modify it under the terms of the GNU Lesser General Public
32 // License as published by the Free Software Foundation; either
33 // version 2.1 of the License, or (at your option) any later version.
35 // This library is distributed in the hope that it will be useful,
36 // but WITHOUT ANY WARRANTY; without even the implied warranty of
37 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38 // Lesser General Public License for more details.
40 // You should have received a copy of the GNU Lesser General Public
41 // License along with this library; if not, write to the Free Software
42 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
44 ////////////////////////////////////////////////////////////////////////////////
49 #error "wrong platform - this source code file is exclusively for Win32 platform"
52 using namespace soundtouch;
55 // MMX routines available only with integer sample type
57 //////////////////////////////////////////////////////////////////////////////
59 // implementation of MMX optimized functions of class 'TDStretchMMX'
61 //////////////////////////////////////////////////////////////////////////////
63 #include "TDStretch.h"
66 // these are declared in 'TDStretch.cpp'
67 extern int scanOffsets[4][24];
69 // Calculates cross correlation of two buffers
70 long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
73 uint local_overlapLength = overlapLength;
74 uint local_overlapDividerBits = overlapDividerBits;
78 ; Calculate cross-correlation between the tempOffset and tmpbid_buffer.
80 ; Process 4 parallel batches of 2 * stereo samples each during one
81 ; round to improve CPU-level parallellization.
83 ; load address of sloped pV2 buffer to eax
84 ; load address of mixing point of the sample data buffer to ebx
85 ; load counter to ecx = overlapLength / 8 - 1
88 ; prepare to the first round by loading
92 mov eax, dword ptr pV1
93 mov ebx, dword ptr pV2
95 movq mm1, qword ptr [eax]
96 mov ecx, local_overlapLength
98 movq mm2, qword ptr [eax+8]
104 movd mm5, local_overlapDividerBits
107 ; multiply-add mm1 = mm1 * ebx[0]
108 ; multiply-add mm2 = mm2 * ebx[1]
111 ; mm2 >>= mm5 (=overlapDividerBits)
115 ; multiply-add mm3 = mm3 * ebx[2]
118 ; multiply-add mm4 = mm4 * ebx[3]
121 ; mm3 >>= mm5 (=overlapDividerBits)
126 ; load mm1 = eax[0] (~eax[4])
127 ; load mm2 = eax[1] (~eax[5])
131 pmaddwd mm1, qword ptr [ebx]
132 movq mm3, qword ptr [eax+16]
134 pmaddwd mm2, qword ptr [ebx+8]
135 movq mm4, qword ptr [eax+24]
137 pmaddwd mm3, qword ptr [ebx+16]
140 pmaddwd mm4, qword ptr [ebx+24]
141 movq mm1, qword ptr [eax+32]
149 movq mm2, qword ptr [eax+8]
158 ; Finalize the last partial loop:
160 movq mm3, qword ptr [eax+16]
161 pmaddwd mm1, qword ptr [ebx]
163 movq mm4, qword ptr [eax+24]
164 pmaddwd mm2, qword ptr [ebx+8]
166 pmaddwd mm3, qword ptr [ebx+16]
169 pmaddwd mm4, qword ptr [ebx+24]
178 ; copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
179 ; and finally store the result into the variable "corr"
188 // Note: Warning about the missing EMMS instruction is harmless
189 // as it'll be called elsewhere.
194 void TDStretchMMX::clearCrossCorrState()
203 // MMX-optimized version of the function overlapStereo
204 void TDStretchMMX::overlapStereo(short *output, const short *input) const
206 short *local_midBuffer = pMidBuffer;
207 uint local_overlapLength = overlapLength;
208 uint local_overlapDividerBits = overlapDividerBits;
212 ; load sliding mixing value counter to mm6 and mm7
213 ; load counter value to ecx = overlapLength / 4
214 ; load divider-shifter value to esi
215 ; load mixing value adder to mm5
216 ; load address of midBuffer to eax
217 ; load address of inputBuffer added with ovlOffset to ebx
218 ; load address of end of the outputBuffer to edx
220 mov eax, local_overlapLength ; ecx = 0x0000 OVL_
221 mov edi, 0x0002fffe ; ecx = 0x0002 fffe
223 mov esi, local_overlapDividerBits
224 movd mm6, eax ; mm6 = 0x0000 0000 0000 OVL_
229 punpckldq mm6, mm6 ; mm6 = 0x0000 OVL_ 0000 OVL_
232 or eax, 0x00010000 ; eax = 0x0001 overlapLength-1
233 mov ebx, dword ptr input
235 movd mm5, edi ; mm5 = 0x0000 0000 0002 fffe
236 movd mm7, eax ; mm7 = 0x0000 0000 0001 01ff
238 mov eax, dword ptr local_midBuffer
239 punpckldq mm5, mm5 ; mm5 = 0x0002 fffe 0002 fffe
241 shr ecx, 2 ; ecx = overlapLength / 2
242 punpckldq mm7, mm7 ; mm7 = 0x0001 01ff 0001 01ff
245 ; Process two parallel batches of 2+2 stereo samples during each round
246 ; to improve CPU-level parallellization.
248 ; Load [eax] into mm0 and mm1
249 ; Load [ebx] into mm3
250 ; unpack words of mm0, mm1 and mm3 into mm0 and mm1
251 ; multiply-add mm0*mm6 and mm1*mm7, store results into mm0 and mm1
252 ; divide mm0 and mm1 by 512 (=right-shift by overlapDividerBits)
253 ; pack the result into mm0 and store into [edx]
255 ; Load [eax+8] into mm2 and mm3
256 ; Load [ebx+8] into mm4
257 ; unpack words of mm2, mm3 and mm4 into mm2 and mm3
258 ; multiply-add mm2*mm6 and mm3*mm7, store results into mm2 and mm3
259 ; divide mm2 and mm3 by 512 (=right-shift by overlapDividerBits)
260 ; pack the result into mm2 and store into [edx+8]
263 movq mm0, qword ptr [eax] ; mm0 = m1l m1r m0l m0r
266 movq mm3, qword ptr [ebx] ; mm3 = i1l i1r i0l i0r
267 movq mm1, mm0 ; mm1 = m1l m1r m0l m0r
269 movq mm2, qword ptr [eax+8] ; mm2 = m3l m3r m2l m2r
270 punpcklwd mm0, mm3 ; mm0 = i0l m0l i0r m0r
272 movq mm4, qword ptr [ebx+8] ; mm4 = i3l i3r i2l i2r
273 punpckhwd mm1, mm3 ; mm1 = i1l m1l i1r m1r
275 movq mm3, mm2 ; mm3 = m3l m3r m2l m2r
276 punpcklwd mm2, mm4 ; mm2 = i2l m2l i2r m2r
278 pmaddwd mm0, mm6 ; mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60
279 punpckhwd mm3, mm4 ; mm3 = i3l m3l i3r m3r
281 movd mm4, esi ; mm4 = overlapDividerBits
283 pmaddwd mm1, mm7 ; mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70
287 psrad mm0, mm4 ; mmo >>= overlapDividerBits
289 pmaddwd mm2, mm6 ; mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60
290 psrad mm1, mm4 ; mm1 >>= overlapDividerBits
292 pmaddwd mm3, mm7 ; mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70
293 psrad mm2, mm4 ; mm2 >>= overlapDividerBits
295 packssdw mm0, mm1 ; mm0 = mm1h mm1l mm0h mm0l
296 psrad mm3, mm4 ; mm3 >>= overlapDividerBits
301 packssdw mm2, mm3 ; mm2 = mm2h mm2l mm3h mm3l
304 movq qword ptr [edx-16], mm0
307 movq qword ptr [edx-8], mm2
317 //////////////////////////////////////////////////////////////////////////////
319 // implementation of MMX optimized functions of class 'FIRFilter'
321 //////////////////////////////////////////////////////////////////////////////
323 #include "FIRFilter.h"
326 FIRFilterMMX::FIRFilterMMX() : FIRFilter()
328 filterCoeffsUnalign = NULL;
332 FIRFilterMMX::~FIRFilterMMX()
334 delete[] filterCoeffsUnalign;
338 // (overloaded) Calculates filter coefficients for MMX routine
339 void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
342 FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
344 // Ensure that filter coeffs array is aligned to 16-byte boundary
345 delete[] filterCoeffsUnalign;
346 filterCoeffsUnalign = new short[2 * newLength + 8];
347 filterCoeffsAlign = (short *)(((uint)filterCoeffsUnalign + 15) & -16);
349 // rearrange the filter coefficients for mmx routines
350 for (i = 0;i < length; i += 4)
352 filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
353 filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
354 filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
355 filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
357 filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
358 filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
359 filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
360 filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
366 // mmx-optimized version of the filter routine for stereo sound
367 uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
369 // Create stack copies of the needed member variables for asm routines :
370 uint local_length = length;
371 uint local_lengthDiv8 = lengthDiv8;
372 uint local_resultDivider = resultDivFactor;
373 short *local_filterCoeffs = (short*)filterCoeffsAlign;
375 if (local_length < 2) return 0;
379 ; Load (num_samples-aa_filter_length)/2 to edi as a i
380 ; Load a pointer to samples to esi
381 ; Load a pointer to destination to edx
384 mov esi, dword ptr src
385 sub edi, local_length
386 mov edx, dword ptr dest
389 ; Load filter length/8 to ecx
390 ; Load pointer to samples from esi to ebx
391 ; Load counter from edi to ecx
393 ; Load pointer to filter coefficients to eax
398 mov ecx, local_lengthDiv8
401 movq mm1, [ebx] ; mm1 = l1 r1 l0 r0
402 mov eax, local_filterCoeffs
405 movq mm2, [ebx+8] ; mm2 = l3 r3 l2 r2
406 movq mm4, mm1 ; mm4 = l1 r1 l0 r0
408 movq mm3, [ebx+16] ; mm3 = l5 r5 l4 r4
409 punpckhwd mm1, mm2 ; mm1 = l3 l1 r3 r1
411 movq mm6, mm2 ; mm6 = l3 r3 l2 r2
412 punpcklwd mm4, mm2 ; mm4 = l2 l0 r2 r0
414 movq mm2, qword ptr [eax] ; mm2 = f2 f0 f2 f0
415 movq mm5, mm1 ; mm5 = l3 l1 r3 r1
417 punpcklwd mm6, mm3 ; mm6 = l4 l2 r4 r2
418 pmaddwd mm4, mm2 ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0
420 pmaddwd mm5, mm2 ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0
421 movq mm2, qword ptr [eax+8] ; mm2 = f3 f1 f3 f1
423 paddd mm0, mm4 ; mm0 += s02*f02
424 movq mm4, mm3 ; mm4 = l1 r1 l0 r0
426 pmaddwd mm1, mm2 ; mm1 = l3*f3+l1*f1 r3*f3+l1*f1
427 paddd mm7, mm5 ; mm7 += s13*f02
429 pmaddwd mm6, mm2 ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1
430 movq mm2, [ebx+24] ; mm2 = l3 r3 l2 r2
432 paddd mm0, mm1 ; mm0 += s31*f31
433 movq mm1, [ebx+32] ; mm1 = l5 r5 l4 r4
435 paddd mm7, mm6 ; mm7 += s42*f31
436 punpckhwd mm3, mm2 ; mm3 = l3 l1 r3 r1
438 movq mm6, mm2 ; mm6 = l3 r3 l2 r2
439 punpcklwd mm4, mm2 ; mm4 = l2 l0 r2 r0
441 movq mm2, qword ptr [eax+16] ; mm2 = f2 f0 f2 f0
442 movq mm5, mm3 ; mm5 = l3 l1 r3 r1
444 punpcklwd mm6, mm1 ; mm6 = l4 l2 r4 r2
447 pmaddwd mm4, mm2 ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0
450 pmaddwd mm5, mm2 ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0
451 movq mm2, qword ptr [eax-8] ; mm2 = f3 f1 f3 f1
453 paddd mm0, mm4 ; mm0 += s02*f02
454 pmaddwd mm3, mm2 ; mm3 = l3*f3+l1*f1 r3*f3+l1*f1
456 paddd mm7, mm5 ; mm7 += s13*f02
457 pmaddwd mm6, mm2 ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1
459 paddd mm0, mm3 ; mm0 += s31*f31
460 paddd mm7, mm6 ; mm7 += s42*f31
465 ; Divide mm0 and mm7 by 8192 (= right-shift by 13),
466 ; pack and store to [edx]
467 movd mm4, local_resultDivider;
469 psrad mm0, mm4 ; divider the result
472 psrad mm7, mm4 ; divider the result
477 movq qword ptr [edx-8], mm0
484 return (numSamples & 0xfffffffe) - local_length;