2 * Copyright (C) 2005-2006 Sampo Savolainen <v2@iki.fi>
3 * Copyright (C) 2006-2008 Paul Davis <paul@linuxaudiosystems.com>
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#;----------------------------------------------------------------------------
#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain)
#;
#; 32-bit x86, cdecl: all arguments on the stack, addressed through %ebp.
#; Effect: dst[i] += src[i] * gain for i in [0, nframes).
#; Strategy: run single frames scalar until the pointers reach 16-byte
#; alignment, then process 4 frames per iteration with aligned SSE
#; (movaps/mulps/addps), then fall through to a scalar loop for the
#; remaining 0-3 frames.
#;
#; NOTE(review): this excerpt is non-contiguous — the prologue (frame
#; setup, callee-saved register pushes), the definitions of the
#; .MBWG_NONALIGN / .MBWG_NONALIGNLOOP / .MBWG_END labels, the aligned-loop
#; label and counter test, and the epilogue are on lines not visible here.
#; Comments below only describe what the visible instructions do.
#;----------------------------------------------------------------------------
21 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
23 .globl x86_sse_mix_buffers_with_gain
24 .type x86_sse_mix_buffers_with_gain,@function
26 x86_sse_mix_buffers_with_gain:
27 #; 8(%ebp) = float *dst = %edi
28 #; 12(%ebp) = float *src = %esi
29 #; 16(%ebp) = long nframes = %ecx
30 #; 20(%ebp) = float gain = st(0)
42 #; if nframes == 0, go to end
43 movl 16(%ebp), %ecx #; nframes
47 #; Check for alignment
49 movl 8(%ebp), %edi #; dst
50 movl 12(%ebp), %esi #; src
#; %eax/%ebx presumably hold copies of dst/src (copied on lines not
#; visible here) — masking the low 4 bits tests 16-byte alignment.
53 andl $12, %eax #; mask alignment offset (0, 4, 8 or 12)
56 andl $12, %ebx #; mask alignment offset (0, 4, 8 or 12)
#; NOTE(review): the comparison of the two offsets that sets the flags for
#; this branch is not visible; presumably dst and src must share the same
#; misalignment for the pre-loop to align both at once.
59 jne .MBWG_NONALIGN #; if not aligned, calculate manually
65 #; Pre-loop, we need to run 1-3 frames "manually" without
#; SSE until both pointers reach a 16-byte boundary.
68 movss 20(%ebp), %xmm1 #; gain => low lane of xmm1
#; (scalar mix of one frame happens on lines not visible here)
77 addl $4, %edi #; dst++
78 addl $4, %esi #; src++
79 decl %ecx #; nframes--
83 #; je .MBWG_END #; if we run out of frames, go to end
87 cmp $16, %ebx #; test if we've reached 16 byte alignment
93 cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then
94 jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
96 #; copy gain to fill %xmm1
98 shufps $0x00, %xmm1, %xmm1 #; broadcast low lane to all 4 lanes
#; Aligned SSE loop body: 4 frames per iteration. The loop label and the
#; backward branch/counter test are on lines not visible here.
103 movaps (%esi), %xmm0 #; source => xmm0
104 mulps %xmm1, %xmm0 #; apply gain to source
105 addps (%edi), %xmm0 #; mix with destination
106 movaps %xmm0, (%edi) #; copy result to destination
108 addl $16, %edi #; dst+=4
109 addl $16, %esi #; src+=4
111 subl $4, %ecx #; nframes-=4
118 #; if there are remaining frames, the nonalign code will do nicely
119 #; for the rest 1-3 frames.
124 movss 20(%ebp), %xmm1 #; gain => xmm1 (reload for the scalar tail loop)
#; (scalar tail loop body is on lines not visible here)
137 jnz .MBWG_NONALIGNLOOP
151 .size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
#;----------------------------------------------------------------------------
#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes)
#;
#; 32-bit x86, cdecl: all arguments on the stack, addressed through %ebp.
#; Effect: dst[i] += src[i] for i in [0, nframes).
#; Strategy mirrors mix_buffers_with_gain: scalar pre-loop until 16-byte
#; alignment, aligned SSE loop at 4 frames per iteration (movaps/addps),
#; scalar tail for the remaining 0-3 frames.
#;
#; NOTE(review): this excerpt is non-contiguous — the prologue/register
#; saves, the .MBNG_* label definitions, the loop branches and the
#; epilogue are on lines not visible here. Comments below only describe
#; what the visible instructions do.
#;----------------------------------------------------------------------------
156 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
158 .globl x86_sse_mix_buffers_no_gain
159 .type x86_sse_mix_buffers_no_gain,@function
161 x86_sse_mix_buffers_no_gain:
162 #; 8(%ebp) = float *dst = %edi
163 #; 12(%ebp) = float *src = %esi
164 #; 16(%ebp) = long nframes = %ecx
169 #; save the registers
178 #; if nframes == 0, go to end
179 movl 16(%ebp), %ecx #; nframes
183 #; Check for alignment
185 movl 8(%ebp), %edi #; dst
186 movl 12(%ebp), %esi #; src
#; %eax/%ebx presumably hold copies of dst/src (copied on lines not
#; visible here) — masking the low 4 bits tests 16-byte alignment.
189 andl $12, %eax #; mask alignment offset (0, 4, 8 or 12)
192 andl $12, %ebx #; mask alignment offset (0, 4, 8 or 12)
195 jne .MBNG_NONALIGN #; if not aligned, calculate manually
200 #; Pre-loop, we need to run 1-3 frames "manually" without
#; SSE until both pointers reach a 16-byte boundary.
#; (scalar mix of one frame happens on lines not visible here)
209 addl $4, %edi #; dst++
210 addl $4, %esi #; src++
211 decl %ecx #; nframes--
215 cmp $16, %ebx #; test if we've reached 16 byte alignment
220 cmp $4, %ecx #; if there are frames left, but less than 4
221 jnge .MBNG_NONALIGN #; we can't run SSE
#; Aligned SSE loop body: 4 frames per iteration. The loop label and the
#; backward branch/counter test are on lines not visible here.
225 movaps (%esi), %xmm0 #; source => xmm0
226 addps (%edi), %xmm0 #; mix with destination
227 movaps %xmm0, (%edi) #; copy result to destination
229 addl $16, %edi #; dst+=4
230 addl $16, %esi #; src+=4
232 subl $4, %ecx #; nframes-=4
239 #; if there are remaining frames, the nonalign code will do nicely
240 #; for the rest 1-3 frames.
#; Scalar tail: one frame per iteration (pointer bump / counter test are
#; on lines not visible here).
245 movss (%esi), %xmm0 #; src => xmm0
246 addss (%edi), %xmm0 #; xmm0 += dst
247 movss %xmm0, (%edi) #; xmm0 => dst
267 .size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
#;----------------------------------------------------------------------------
#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain)
#;
#; 32-bit x86, cdecl: all arguments on the stack, addressed through %ebp.
#; Effect: buf[i] *= gain for i in [0, nframes).
#; Strategy: broadcast gain into %xmm1, run 1-3 scalar (x87) iterations
#; until buf is 16-byte aligned, then nframes/4 aligned SSE iterations,
#; then a scalar loop for nframes%4 remaining frames.
#;
#; NOTE(review): this excerpt is non-contiguous — the prologue, the
#; .AG_SSE / .AGLP_START / .AGPOST_START / .AG_END label definitions,
#; the x87 multiply in the scalar loops and the epilogue are on lines
#; not visible here. Comments below only describe visible instructions.
#;----------------------------------------------------------------------------
272 #; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
274 .globl x86_sse_apply_gain_to_buffer
275 .type x86_sse_apply_gain_to_buffer,@function
277 x86_sse_apply_gain_to_buffer:
278 #; 8(%ebp) = float *buf = %edi
279 #; 12(%ebp) = long nframes = %ecx
280 #; 16(%ebp) = float gain = st(0)
290 #; if nframes == 0, go to end
291 movl 12(%ebp), %ecx #; nframes
295 #; create the gain buffer in %xmm1
296 movss 16(%ebp), %xmm1 #; gain => low lane of xmm1
297 shufps $0x00, %xmm1, %xmm1 #; broadcast low lane to all 4 lanes
299 #; Check for alignment
301 movl 8(%ebp), %edi #; buf
302 movl %edi, %edx #; buf => %edx
303 andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
304 jz .AG_SSE #; if buffer IS aligned
307 #; we iterate 1-3 times, doing normal x87 float comparison
308 #; so we reach a 16 byte aligned "buf" (=%edi) value
312 #; Load next value from the buffer
#; (the x87 load/multiply/store of one float happens on lines not visible)
317 #; increment buffer, decrement counter
318 addl $4, %edi #; buf++;
320 decl %ecx #; nframes--
321 jz .AG_END #; if we run out of frames, we go to the end
323 addl $4, %edx #; one non-aligned byte less
#; (the cmp of %edx against 16 that sets flags here is not visible)
325 jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
329 #; We have reached the 16 byte aligned "buf" ("edi") value
331 #; Figure out how many loops we should do
332 movl %ecx, %eax #; copy remaining nframes to %eax for division
333 movl $0, %edx #; 0 the edx register (required before unsigned divl)
#; NOTE(review): dividing by %edi only makes sense if %edi was loaded
#; with the constant 4 on a line not visible here (it held buf above) —
#; confirm against the full source before changing anything.
338 divl %edi #; %eax = nframes / 4, %edx = remainder == 0
341 #; %eax = SSE iterations
#; (the aligned SSE loop — movaps/mulps/movaps per 4 frames — is on
#; lines not visible here)
353 #; subl $4, %ecx #; nframes-=4
358 #; Next we need to post-process all remaining frames
359 #; the remaining frame count is in %ecx
361 #; if no remaining frames, jump to the end
363 andl $3, %ecx #; nframes % 4
#; (the x87 load/multiply/store of one float happens on lines not visible)
372 #; increment buffer, decrement counter
373 addl $4, %edi #; buf++;
375 decl %ecx #; nframes--
376 jnz .AGPOST_START #; if we run out of frames, we go to the end
387 .size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
#;----------------------------------------------------------------------------
#; float x86_sse_compute_peak (float *buf, long nframes, float current)
#;
#; 32-bit x86, cdecl: all arguments on the stack, addressed through %ebp.
#; Effect: returns max(current, |buf[i]|) over i in [0, nframes) — the
#; absolute peak of the buffer folded into the running peak "current".
#; Strategy: keep the running maximum in %xmm0 and an absolute-value
#; bitmask (sign bit cleared) in %xmm2; scalar pre-loop until buf is
#; 16-byte aligned, aligned SSE loop at 4 floats per iteration, horizontal
#; max reduction via shufps/maxps, scalar tail for nframes%4 floats.
#;
#; NOTE(review): this excerpt is non-contiguous — the prologue, the load
#; of the abs-mask constant into %xmm2, the .LP_START / .CP_SSE /
#; .POST_START / .CP_END label definitions, the loop bodies' andps/maxss
#; instructions and the epilogue (presumably an flds of 16(%ebp) to
#; return in st(0)) are on lines not visible here.
#;----------------------------------------------------------------------------
392 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
394 .globl x86_sse_compute_peak
395 .type x86_sse_compute_peak,@function
397 x86_sse_compute_peak:
398 #; 8(%ebp) = float *buf = %edi
399 #; 12(%ebp) = long nframes = %ecx
400 #; 16(%ebp) = float current = st(0)
410 #; Load "current" in xmm0
411 movss 16(%ebp), %xmm0
413 #; if nframes == 0, go to end
414 movl 12(%ebp), %ecx #; nframes
418 #; create the "abs" mask in %xmm2
#; (the movss that loads the 0x7fffffff mask constant is not visible here)
422 shufps $0x00, %xmm2, %xmm2 #; broadcast mask to all 4 lanes
424 #; Check for alignment
426 movl 8(%ebp), %edi #; buf
427 movl %edi, %edx #; buf => %edx
428 andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
429 jz .CP_SSE #; if buffer IS aligned
432 #; we iterate 1-3 times, doing normal x87 float comparison
433 #; so we reach a 16 byte aligned "buf" (=%edi) value
437 #; Load next value from the buffer
#; (the scalar abs + max against %xmm0 happens on lines not visible)
442 #; increment buffer, decrement counter
443 addl $4, %edi #; buf++;
445 decl %ecx #; nframes--
446 jz .CP_END #; if we run out of frames, we go to the end
448 addl $4, %edx #; one non-aligned byte less
#; (the cmp of %edx against 16 that sets flags here is not visible)
450 jne .LP_START #; if more non-aligned frames exist, we do a do-over
454 #; We have reached the 16 byte aligned "buf" ("edi") value
456 #; Figure out how many loops we should do
457 movl %ecx, %eax #; copy remaining nframes to %eax for division
459 shr $2,%eax #; unsigned divide by 4 => SSE iteration count
462 #; %eax = SSE iterations
464 #; current maximum is at %xmm0, but we need to ..
465 shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
467 #;prefetcht0 16(%edi)
#; (the aligned SSE loop — movaps/andps/maxps per 4 floats — is on
#; lines not visible here; it leaves 4 partial maxima in %xmm0)
480 #; Calculate the maximum value contained in the 4 FP's in %xmm0
#; (a movaps %xmm0 => %xmm1 presumably precedes this; not visible)
482 shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
483 maxps %xmm1, %xmm0 #; maximums of the two pairs
485 shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
488 #; now every float in %xmm0 is the same value, current maximum value
490 #; Next we need to post-process all remaining frames
491 #; the remaining frame count is in %ecx
493 #; if no remaining frames, jump to the end
495 andl $3, %ecx #; nframes % 4
#; (the scalar abs + maxss of one float happens on lines not visible)
504 addl $4, %edi #; buf++;
506 decl %ecx #; nframes--;
511 #; Load the value from xmm0 to the float stack for returning
#; NOTE(review): spills the result over the "current" argument slot so a
#; following flds (not visible here) can return it in st(0) per cdecl.
512 movss %xmm0, 16(%ebp)
521 .size x86_sse_compute_peak, .-x86_sse_compute_peak
525 .section .note.GNU-stack,"",%progbits