Copyright (C) 2005 Paul Davis

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#;----------------------------------------------------------------------
#; x86_sse_mix_buffers_with_gain
#; C equivalent: for (i = 0; i < nframes; i++) dst[i] += src[i] * gain;
#; ABI: IA-32 cdecl, args read %ebp-relative; SSE1 vector path for the
#; 16-byte-aligned middle, scalar pre/post loops for the ragged edges.
#; NOTE(review): this excerpt is incomplete — the prologue (push %ebp /
#; movl %esp,%ebp, callee-saved register saves), the labels it jumps to
#; (.MBWG_NONALIGN, .MBWG_END, the aligned-loop head, .MBWG_NONALIGNLOOP)
#; and the epilogue are not visible here.  Comments describe only what
#; the visible lines do; do not assemble this fragment as-is.
#;----------------------------------------------------------------------
22 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
24 .globl x86_sse_mix_buffers_with_gain
25 .type x86_sse_mix_buffers_with_gain,@function
27 x86_sse_mix_buffers_with_gain:
28 #; 8(%ebp) = float *dst = %edi
29 #; 12(%ebp) = float *src = %esi
30 #; 16(%ebp) = long nframes = %ecx
31 #; 20(%ebp) = float gain = st(0)
43 #; if nframes == 0, go to end
44 movl 16(%ebp), %ecx #; nframes
#; (the test/jz on %ecx that this comment refers to is missing here)
48 #; Check for alignment
50 movl 8(%ebp), %edi #; dst
51 movl 12(%ebp), %esi #; src
#; NOTE(review): %eax/%ebx are used below but the moves that copy
#; dst/src into them are missing from this excerpt.
#; $12 masks bits 2-3: floats are 4-byte aligned, so this yields the
#; pointer's offset (0/4/8/12) within a 16-byte line.
54 andl $12, %eax #; mask alignemnt offset
57 andl $12, %ebx #; mask alignment offset
#; (a cmp of the two offsets is missing) — if dst and src sit at
#; different offsets they can never both be 16-byte aligned at once,
#; so fall back to the scalar loop for the whole buffer.
60 jne .MBWG_NONALIGN #; if not aligned, calculate manually
66 #; Pre-loop, we need to run 1-3 frames "manually" without
#; SSE until %edi/%esi reach a 16-byte boundary (loop label missing).
69 movss 20(%ebp), %xmm1 #; xmm1
#; (scalar movss/mulss/addss/movss body of the pre-loop is missing)
78 addl $4, %edi #; dst++
79 addl $4, %esi #; src++
80 decl %ecx #; nframes--
84 #; je .MBWG_END #; if we run out of frames, go to end
#; NOTE(review): %ebx presumably accumulates +4 per pre-loop pass
#; (that add is missing); 16 means alignment has been reached.
88 cmp $16, %ebx #; test if we've reached 16 byte alignment
94 cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then
95 jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
97 #; copy gain to fill %xmm1
#; broadcast the scalar gain into all four lanes of %xmm1
99 shufps $0x00, %xmm1, %xmm1
#; --- main vector loop (its label is missing from this excerpt) ---
#; 4 frames per iteration; movaps requires both pointers 16-byte aligned
104 movaps (%esi), %xmm0 #; source => xmm0
105 mulps %xmm1, %xmm0 #; apply gain to source
106 addps (%edi), %xmm0 #; mix with destination
107 movaps %xmm0, (%edi) #; copy result to destination
109 addl $16, %edi #; dst+=4
110 addl $16, %esi #; src+=4
112 subl $4, %ecx #; nframes-=4
#; (the jge/jnz back-edge of the vector loop is missing)
119 #; if there are remaining frames, the nonalign code will do nicely
120 #; for the rest 1-3 frames.
#; --- scalar fallback (.MBWG_NONALIGN / .MBWG_NONALIGNLOOP) ---
125 movss 20(%ebp), %xmm1 #; gain => xmm1
#; (the .MBWG_NONALIGNLOOP label and its scalar body are missing;
#; only the loop's back-edge is visible below)
138 jnz .MBWG_NONALIGNLOOP
#; (.MBWG_END label, register restores and ret are missing)
152 .size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
#;----------------------------------------------------------------------
#; x86_sse_mix_buffers_no_gain
#; C equivalent: for (i = 0; i < nframes; i++) dst[i] += src[i];
#; Same structure as x86_sse_mix_buffers_with_gain, minus the multiply:
#; scalar pre-loop to reach 16-byte alignment, addps main loop,
#; scalar tail for the last 1-3 frames.
#; NOTE(review): excerpt is incomplete — prologue/epilogue, the
#; .MBNG_* labels jumped to below, and several loop back-edges are
#; not visible here.  Comments describe only the visible lines.
#;----------------------------------------------------------------------
157 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
159 .globl x86_sse_mix_buffers_no_gain
160 .type x86_sse_mix_buffers_no_gain,@function
162 x86_sse_mix_buffers_no_gain:
163 #; 8(%ebp) = float *dst = %edi
164 #; 12(%ebp) = float *src = %esi
165 #; 16(%ebp) = long nframes = %ecx
170 #; save the registers
#; (the pushl instructions this comment refers to are missing)
179 #; if nframes == 0, go to end
180 movl 16(%ebp), %ecx #; nframes
#; (the test/jz on %ecx is missing here)
184 #; Check for alignment
186 movl 8(%ebp), %edi #; dst
187 movl 12(%ebp), %esi #; src
#; NOTE(review): the moves copying dst/src into %eax/%ebx are missing.
#; $12 extracts the offset (0/4/8/12) within a 16-byte line.
190 andl $12, %eax #; mask alignemnt offset
193 andl $12, %ebx #; mask alignment offset
#; (cmp of the two offsets is missing) — different offsets mean the
#; pointers can never be simultaneously aligned: take the scalar path.
196 jne .MBNG_NONALIGN #; if not aligned, calculate manually
201 #; Pre-loop, we need to run 1-3 frames "manually" without
#; SSE until both pointers hit a 16-byte boundary (label + scalar
#; movss/addss/movss body are missing from this excerpt).
210 addl $4, %edi #; dst++
211 addl $4, %esi #; src++
212 decl %ecx #; nframes--
#; (the jz-to-end and the +4 accumulation into %ebx are missing)
216 cmp $16, %ebx #; test if we've reached 16 byte alignment
221 cmp $4, %ecx #; if there are frames left, but less than 4
222 jnge .MBNG_NONALIGN #; we can't run SSE
#; --- main vector loop (label missing): 4 frames per iteration ---
226 movaps (%esi), %xmm0 #; source => xmm0
227 addps (%edi), %xmm0 #; mix with destination
228 movaps %xmm0, (%edi) #; copy result to destination
230 addl $16, %edi #; dst+=4
231 addl $16, %esi #; src+=4
233 subl $4, %ecx #; nframes-=4
#; (vector-loop back-edge missing)
240 #; if there are remaining frames, the nonalign code will do nicely
241 #; for the rest 1-3 frames.
#; --- scalar fallback (.MBNG_NONALIGN; loop label/back-edge missing) ---
246 movss (%esi), %xmm0 #; src => xmm0
247 addss (%edi), %xmm0 #; xmm0 += dst
248 movss %xmm0, (%edi) #; xmm0 => dst
#; (pointer increments, counter decrement, end label, register
#; restores and ret are missing from this excerpt)
268 .size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
#;----------------------------------------------------------------------
#; x86_sse_apply_gain_to_buffer
#; C equivalent: for (i = 0; i < nframes; i++) buf[i] *= gain;
#; In-place scale: scalar pre-loop to a 16-byte boundary, mulps main
#; loop (4 floats/iteration), scalar post-loop for nframes % 4.
#; NOTE(review): excerpt is incomplete — prologue/epilogue, the
#; .AG_SSE / .AGLP_START / .AGPOST_START / .AG_END label definitions,
#; the scalar loop bodies and the vector loop itself are not visible.
#;----------------------------------------------------------------------
273 #; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
275 .globl x86_sse_apply_gain_to_buffer
276 .type x86_sse_apply_gain_to_buffer,@function
278 x86_sse_apply_gain_to_buffer:
279 #; 8(%ebp) = float *buf = %edi
280 #; 12(%ebp) = long nframes = %ecx
281 #; 16(%ebp) = float gain = st(0)
291 #; if nframes == 0, go to end
292 movl 12(%ebp), %ecx #; nframes
#; (the test/jz on %ecx is missing here)
296 #; create the gain buffer in %xmm1
#; load scalar gain, then broadcast it to all four lanes
297 movss 16(%ebp), %xmm1
298 shufps $0x00, %xmm1, %xmm1
300 #; Check for alignment
302 movl 8(%ebp), %edi #; buf
303 movl %edi, %edx #; buf => %edx
#; bits 0-1 are always clear for a float*, so this yields 0/4/8/12
304 andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
305 jz .AG_SSE #; if buffer IS aligned
308 #; we iterate 1-3 times, doing normal x87 float comparison
309 #; so we reach a 16 byte aligned "buf" (=%edi) value
#; (.AGLP_START label is missing from this excerpt)
313 #; Load next value from the buffer
#; (scalar load/mul/store instructions are missing here)
318 #; increment buffer, decrement counter
319 addl $4, %edi #; buf++;
321 decl %ecx #; nframes--
322 jz .AG_END #; if we run out of frames, we go to the end
#; walk %edx (12/8/4) up to 16; comment says "byte" but the unit is
#; one 4-byte float, i.e. one non-aligned frame less
324 addl $4, %edx #; one non-aligned byte less
#; (cmp $16, %edx presumably precedes this jne; it is missing)
326 jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
330 #; We have reached the 16 byte aligned "buf" ("edi") value
#; (.AG_SSE label definition is missing)
332 #; Figure out how many loops we should do
333 movl %ecx, %eax #; copy remaining nframes to %eax for division
#; divl uses %edx:%eax as the dividend — %edx must be zeroed first
334 movl $0, %edx #; 0 the edx register
#; NOTE(review): as shown this divides by the buffer pointer!  The
#; missing surrounding lines presumably save %edi and load it with 4
#; (pushl %edi / movl $4, %edi ... popl %edi) so that
#; %eax = nframes / 4 — confirm against the full source.
339 divl %edi #; %edx = remainder == 0
342 #; %eax = SSE iterations
#; (the mulps vector loop — movaps/mulps/movaps, pointer advance,
#; decl %eax / jnz — is missing from this excerpt)
354 #; subl $4, %ecx #; nframes-=4
359 #; Next we need to post-process all remaining frames
360 #; the remaining frame count is in %ecx
362 #; if no remaining frames, jump to the end
364 andl $3, %ecx #; nframes % 4
#; (jz .AG_END and the .AGPOST_START label with its scalar
#; load/mul/store body are missing)
373 #; increment buffer, decrement counter
374 addl $4, %edi #; buf++;
376 decl %ecx #; nframes--
#; note: jnz loops while frames REMAIN (comment text is inverted)
377 jnz .AGPOST_START #; if we run out of frames, we go to the end
#; (.AG_END label, register restores and ret are missing)
388 .size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
#;----------------------------------------------------------------------
#; x86_sse_compute_peak
#; C equivalent: for (i = 0; i < nframes; i++)
#;                   current = fmaxf(current, fabsf(buf[i]));
#;               return current;
#; Strategy: scalar pre-loop to a 16-byte boundary, then maxps over
#; |buf[i]| four lanes at a time, a horizontal max of %xmm0, and a
#; scalar post-loop for nframes % 4.  Result is written back to the
#; "current" argument slot and (presumably, lines missing) reloaded
#; onto the x87 stack for the float return value.
#; NOTE(review): excerpt is incomplete — prologue/epilogue, the
#; .CP_SSE / .LP_START / .CP_END label definitions, the andps with the
#; abs mask, and both loop bodies are not visible here.
#;----------------------------------------------------------------------
393 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
395 .globl x86_sse_compute_peak
396 .type x86_sse_compute_peak,@function
402 x86_sse_compute_peak:
403 #; 8(%ebp) = float *buf = %edi
404 #; 12(%ebp) = long nframes = %ecx
405 #; 16(%ebp) = float current = st(0)
415 #; Load "current" in xmm0
416 movss 16(%ebp), %xmm0
418 #; if nframes == 0, go to end
419 movl 12(%ebp), %ecx #; nframes
#; (the test/jz on %ecx is missing here)
423 #; create the "abs" mask in %xmm2
#; abs_mask is defined elsewhere in the file (not in this excerpt);
#; presumably 0x7fffffff, which clears the IEEE-754 sign bit when
#; ANDed — TODO confirm against the data section
424 movss abs_mask, %xmm2
425 shufps $0x00, %xmm2, %xmm2
427 #; Check for alignment
429 movl 8(%ebp), %edi #; buf
430 movl %edi, %edx #; buf => %edx
#; bits 0-1 are always clear for a float*, so this yields 0/4/8/12
431 andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
432 jz .CP_SSE #; if buffer IS aligned
435 #; we iterate 1-3 times, doing normal x87 float comparison
436 #; so we reach a 16 byte aligned "buf" (=%edi) value
#; (.LP_START label is missing from this excerpt)
440 #; Load next value from the buffer
#; (scalar movss / andps %xmm2 / maxss %xmm0 sequence is missing)
445 #; increment buffer, decrement counter
446 addl $4, %edi #; buf++;
448 decl %ecx #; nframes--
449 jz .CP_END #; if we run out of frames, we go to the end
#; walk %edx (12/8/4) up to 16; unit is one 4-byte float, not a byte
451 addl $4, %edx #; one non-aligned byte less
#; (cmp $16, %edx presumably precedes this jne; it is missing)
453 jne .LP_START #; if more non-aligned frames exist, we do a do-over
457 #; We have reached the 16 byte aligned "buf" ("edi") value
#; (.CP_SSE label definition is missing)
459 #; Figure out how many loops we should do
460 movl %ecx, %eax #; copy remaining nframes to %eax for division
462 shr $2,%eax #; unsigned divide by 4
465 #; %eax = SSE iterations
467 #; current maximum is at %xmm0, but we need to ..
468 shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
470 #;prefetcht0 16(%edi)
#; (the vector loop — movaps / andps %xmm2 / maxps %xmm0, pointer
#; advance, decl %eax / jnz — is missing from this excerpt)
483 #; Calculate the maximum value contained in the 4 FP's in %xmm0
#; NOTE(review): the movaps %xmm0, %xmm1 that seeds %xmm1 before these
#; shuffles, and the second maxps after L168's shuffle, are missing.
485 shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
486 maxps %xmm1, %xmm0 #; maximums of the two pairs
488 shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
491 #; now every float in %xmm0 is the same value, current maximum value
493 #; Next we need to post-process all remaining frames
494 #; the remaining frame count is in %ecx
496 #; if no remaining frames, jump to the end
498 andl $3, %ecx #; nframes % 4
#; (jz .CP_END and the scalar post-loop body — movss / andps / maxss —
#; plus its label are missing)
507 addl $4, %edi #; buf++;
509 decl %ecx #; nframes--;
#; (the jnz back-edge and .CP_END label are missing)
514 #; Load the value from xmm0 to the float stack for returning
#; store the peak back into the argument slot; the flds 16(%ebp) that
#; actually places it in st(0) for the cdecl float return is missing
515 movss %xmm0, 16(%ebp)
524 .size x86_sse_compute_peak, .-x86_sse_compute_peak