2 Copyright (C) 2005 Paul Davis
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 Author: Sampo Savolainen
24 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
#;
#; Accumulate "src" into "dst" with a constant gain:
#;   dst[i] += src[i] * gain   for i = 0 .. nframes-1
#; IA-32 cdecl: arguments are read relative to %ebp, so a standard stack
#; frame (push %ebp; movl %esp,%ebp) is assumed to be set up in the
#; prologue (not visible in this excerpt).  Strategy: run 0-3 frames
#; scalar until both pointers are 16-byte aligned, then 4 frames per
#; iteration with aligned SSE, then a scalar tail for the remainder.
26 .globl x86_sse_mix_buffers_with_gain
27 .type x86_sse_mix_buffers_with_gain,@function
29 x86_sse_mix_buffers_with_gain:
30 #; 8(%ebp) = float *dst = %edi
31 #; 12(%ebp) = float *src = %esi
32 #; 16(%ebp) = long nframes = %ecx
33 #; 20(%ebp) = float gain = st(0)
45 #; if nframes == 0, go to end
46 movl 16(%ebp), %ecx #; nframes
50 #; Check for alignment
52 movl 8(%ebp), %edi #; dst
53 movl 12(%ebp), %esi #; src
56 andl $12, %eax #; mask alignment offset of dst (result = 0, 4, 8 or 12)
59 andl $12, %ebx #; mask alignment offset of src
62 jne .MBWG_NONALIGN #; dst and src misaligned relative to each other: all-scalar path
68 #; Pre-loop, we need to run 1-3 frames "manually" without
#; SSE, one float at a time, until the pointers are 16-byte aligned
71 movss 20(%ebp), %xmm1 #; gain => xmm1 (scalar, for the pre-loop)
80 addl $4, %edi #; dst++
81 addl $4, %esi #; src++
82 decl %ecx #; nframes--
86 #; je .MBWG_END #; if we run out of frames, go to end
90 cmp $16, %ebx #; test if we've reached 16 byte alignment
96 cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then
97 jnge .MBWG_NONGN #; placeholder - see note
99 #; broadcast gain into all 4 lanes of %xmm1 for the vector loop
100 movss 20(%ebp), %xmm1
101 shufps $0x00, %xmm1, %xmm1
#; Main SSE loop: 4 frames per iteration, aligned loads/stores
106 movaps (%esi), %xmm0 #; source => xmm0
107 mulps %xmm1, %xmm0 #; apply gain to source
108 addps (%edi), %xmm0 #; mix with destination
109 movaps %xmm0, (%edi) #; copy result to destination
111 addl $16, %edi #; dst+=4
112 addl $16, %esi #; src+=4
114 subl $4, %ecx #; nframes-=4
121 #; if there are remaining frames, the nonalign code will do nicely
122 #; for the rest 1-3 frames.
127 movss 20(%ebp), %xmm1 #; gain => xmm1 (scalar again, for the per-frame tail loop)
140 jnz .MBWG_NONALIGNLOOP #; loop back while frames remain
154 .size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
159 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
#;
#; Accumulate "src" into "dst" at unity gain:
#;   dst[i] += src[i]   for i = 0 .. nframes-1
#; IA-32 cdecl: arguments are read relative to %ebp (standard stack frame
#; assumed; prologue not visible in this excerpt).  Same alignment
#; strategy as x86_sse_mix_buffers_with_gain: scalar pre-loop to reach
#; 16-byte alignment, aligned SSE main loop (4 frames/iteration),
#; scalar tail for the remaining 1-3 frames.
161 .globl x86_sse_mix_buffers_no_gain
162 .type x86_sse_mix_buffers_no_gain,@function
164 x86_sse_mix_buffers_no_gain:
165 #; 8(%ebp) = float *dst = %edi
166 #; 12(%ebp) = float *src = %esi
167 #; 16(%ebp) = long nframes = %ecx
172 #; save the registers
181 #; if nframes == 0, go to end
182 movl 16(%ebp), %ecx #; nframes
186 #; Check for alignment
188 movl 8(%ebp), %edi #; dst
189 movl 12(%ebp), %esi #; src
192 andl $12, %eax #; mask alignment offset of dst (result = 0, 4, 8 or 12)
195 andl $12, %ebx #; mask alignment offset of src
198 jne .MBNG_NONALIGN #; dst and src misaligned relative to each other: all-scalar path
203 #; Pre-loop, we need to run 1-3 frames "manually" without
#; SSE, one float at a time, until the pointers are 16-byte aligned
212 addl $4, %edi #; dst++
213 addl $4, %esi #; src++
214 decl %ecx #; nframes--
218 cmp $16, %ebx #; test if we've reached 16 byte alignment
223 cmp $4, %ecx #; if there are frames left, but less than 4
224 jnge .MBNG_NONALIGN #; we can't run SSE
#; Main SSE loop: 4 frames per iteration, aligned loads/stores
228 movaps (%esi), %xmm0 #; source => xmm0
229 addps (%edi), %xmm0 #; mix with destination
230 movaps %xmm0, (%edi) #; copy result to destination
232 addl $16, %edi #; dst+=4
233 addl $16, %esi #; src+=4
235 subl $4, %ecx #; nframes-=4
242 #; if there are remaining frames, the nonalign code will do nicely
243 #; for the rest 1-3 frames.
#; Scalar tail / non-aligned loop body: one frame at a time
248 movss (%esi), %xmm0 #; src => xmm0
249 addss (%edi), %xmm0 #; xmm0 += dst
250 movss %xmm0, (%edi) #; xmm0 => dst
270 .size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
275 #; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
#;
#; In-place gain:  buf[i] *= gain   for i = 0 .. nframes-1
#; IA-32 cdecl: arguments are read relative to %ebp (standard stack frame
#; assumed; prologue not visible in this excerpt).  Scalar pre-loop until
#; "buf" is 16-byte aligned, then 4 frames per SSE iteration, then a
#; scalar post-loop for the remaining nframes % 4 frames.
277 .globl x86_sse_apply_gain_to_buffer
278 .type x86_sse_apply_gain_to_buffer,@function
280 x86_sse_apply_gain_to_buffer:
281 #; 8(%ebp) = float *buf = %edi
282 #; 12(%ebp) = long nframes = %ecx
283 #; 16(%ebp) = float gain = st(0)
293 #; if nframes == 0, go to end
294 movl 12(%ebp), %ecx #; nframes
298 #; create the gain buffer in %xmm1 (broadcast the scalar gain to all 4 lanes)
299 movss 16(%ebp), %xmm1
300 shufps $0x00, %xmm1, %xmm1
302 #; Check for alignment
304 movl 8(%ebp), %edi #; buf
305 movl %edi, %edx #; buf => %edx
306 andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
307 jz .AG_SSE #; if buffer IS aligned
310 #; we iterate 1-3 times, doing normal x87 float comparison
311 #; so we reach a 16 byte aligned "buf" (=%edi) value
315 #; Load next value from the buffer
320 #; increment buffer, decrement counter
321 addl $4, %edi #; buf++;
323 decl %ecx #; nframes--
324 jz .AG_END #; if we run out of frames, we go to the end
326 addl $4, %edx #; one non-aligned byte less
328 jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
332 #; We have reached the 16 byte aligned "buf" ("edi") value
334 #; Figure out how many loops we should do
335 movl %ecx, %eax #; copy remaining nframes to %eax for division
336 movl $0, %edx #; 0 the edx register (divl divides %edx:%eax)
341 divl %edi #; %edx = remainder == 0
#; NOTE(review): the divisor here is %edi, which above holds "buf".
#; Unless a line hidden from this excerpt reloads %edi with $4 first,
#; this does not compute nframes/4 -- compare x86_sse_compute_peak,
#; which uses "shr $2,%eax".  Confirm against the full source.
344 #; %eax = SSE iterations
356 #; subl $4, %ecx #; nframes-=4
361 #; Next we need to post-process all remaining frames
362 #; the remaining frame count is in %ecx
364 #; if no remaining frames, jump to the end
366 andl $3, %ecx #; nframes % 4
375 #; increment buffer, decrement counter
376 addl $4, %edi #; buf++;
378 decl %ecx #; nframes--
379 jnz .AGPOST_START #; loop back while tail frames remain
390 .size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
395 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
#;
#; Returns max(current, |buf[i]|) over i = 0 .. nframes-1 (peak meter).
#; IA-32 cdecl: arguments read relative to %ebp (standard stack frame
#; assumed; prologue not visible in this excerpt).  The result is written
#; back to 16(%ebp) at the end -- presumably re-loaded onto the x87
#; stack (st(0)) in the epilogue for the float return; confirm in the
#; full source.  Scalar pre-loop until "buf" is 16-byte aligned, then a
#; 4-wide SSE max loop, a horizontal max reduction, then a scalar tail.
397 .globl x86_sse_compute_peak
398 .type x86_sse_compute_peak,@function
400 x86_sse_compute_peak:
401 #; 8(%ebp) = float *buf = %edi
402 #; 12(%ebp) = long nframes = %ecx
403 #; 16(%ebp) = float current = st(0)
413 #; Load "current" in xmm0
414 movss 16(%ebp), %xmm0
416 #; if nframes == 0, go to end
417 movl 12(%ebp), %ecx #; nframes
421 #; create the "abs" mask in %xmm2 (andps with it clears each lane's sign bit)
425 shufps $0x00, %xmm2, %xmm2
427 #; Check for alignment
429 movl 8(%ebp), %edi #; buf
430 movl %edi, %edx #; buf => %edx
431 andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
432 jz .CP_SSE #; if buffer IS aligned
435 #; we iterate 1-3 times, doing normal x87 float comparison
436 #; so we reach a 16 byte aligned "buf" (=%edi) value
440 #; Load next value from the buffer
445 #; increment buffer, decrement counter
446 addl $4, %edi #; buf++;
448 decl %ecx #; nframes--
449 jz .CP_END #; if we run out of frames, we go to the end
451 addl $4, %edx #; one non-aligned byte less
453 jne .LP_START #; if more non-aligned frames exist, we do a do-over
457 #; We have reached the 16 byte aligned "buf" ("edi") value
459 #; Figure out how many loops we should do
460 movl %ecx, %eax #; copy remaining nframes to %eax for division
462 shr $2,%eax #; unsigned divide by 4: %eax = number of 4-frame SSE iterations
465 #; %eax = SSE iterations
467 #; current maximum is at %xmm0, but we need to ..
468 shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
470 #;prefetcht0 16(%edi)
483 #; Horizontal reduction: fold the 4 per-lane maxima in %xmm0 into one
485 shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
486 maxps %xmm1, %xmm0 #; maximums of the two pairs
488 shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
491 #; now every float in %xmm0 is the same value, current maximum value
493 #; Next we need to post-process all remaining frames
494 #; the remaining frame count is in %ecx
496 #; if no remaining frames, jump to the end
498 andl $3, %ecx #; nframes % 4
507 addl $4, %edi #; buf++;
509 decl %ecx #; nframes--;
514 #; Store the final peak from xmm0 back over the "current" argument slot
514 #; so it can be returned (via the float stack) to the caller
515 movss %xmm0, 16(%ebp)
524 .size x86_sse_compute_peak, .-x86_sse_compute_peak
#; Mark this object as not requiring an executable stack, so the linker
#; does not force an executable stack on the final binary.
528 .section .note.GNU-stack,"",%progbits