2 Copyright (C) 2005-2006 Sampo Savolainen, John Rigg
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
#;
#; SysV AMD64 ABI: %rdi = dst, %rsi = src, %rdx = nframes, %xmm0 = gain.
#; Computes dst[i] += src[i] * gain for i in [0, nframes).
#; Strategy: scalar pre-loop until pointers are 16-byte aligned, then a
#; 4-floats-per-iteration SSE loop, then a scalar tail for the leftovers.

.globl x86_sse_mix_buffers_with_gain
.type x86_sse_mix_buffers_with_gain,@function

x86_sse_mix_buffers_with_gain:

#; %rdx unsigned int nframes

#; if nframes == 0, go to end

#; Check for alignment

andq $12, %rax #; mask alignment offset of dst (result: 0, 4, 8 or 12)

andq $12, %rbx #; mask alignment offset of src (result: 0, 4, 8 or 12)

jne .MBWG_NONALIGN #; if not aligned (dst/src offsets differ), calculate manually with scalar ops

#; Pre-loop, we need to run 1-3 frames "manually" without
#; SSE until the pointers reach 16-byte alignment

#; gain is already in %xmm0

addq $4, %rdi #; dst++   (advance one 4-byte float)
addq $4, %rsi #; src++
decq %rdx #; nframes--

cmp $16, %rbx #; test if we've reached 16 byte alignment

cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
jnge .MBWG_NONALIGN #; we jump straight to the "normal" (scalar) code

#; gain is already in %xmm0
shufps $0x00, %xmm0, %xmm0 #; broadcast gain to all four lanes of %xmm0

movaps (%rsi), %xmm1 #; source => xmm1 (aligned 4-float load)
mulps %xmm0, %xmm1 #; apply gain to source
addps (%rdi), %xmm1 #; mix with destination
movaps %xmm1, (%rdi) #; copy result to destination

addq $16, %rdi #; dst += 4 floats (16 bytes)
addq $16, %rsi #; src += 4 floats

subq $4, %rdx #; nframes -= 4

#; if there are remaining frames, the nonalign code will do nicely
#; for the rest 1-3 frames.

#; gain is already in %xmm0

jnz .MBWG_NONALIGNLOOP #; keep doing scalar frames until nframes hits zero

.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
#;
#; SysV AMD64 ABI: %rdi = dst, %rsi = src, %rdx = nframes.
#; Computes dst[i] += src[i] for i in [0, nframes).
#; Same structure as x86_sse_mix_buffers_with_gain: scalar pre-loop to reach
#; 16-byte alignment, packed SSE main loop, scalar tail.

.globl x86_sse_mix_buffers_no_gain
.type x86_sse_mix_buffers_no_gain,@function

x86_sse_mix_buffers_no_gain:

#; %rdx unsigned int nframes

#; save the registers

#; if nframes == 0, go to end

#; Check for alignment

andq $12, %rax #; mask alignment offset of dst (result: 0, 4, 8 or 12)

andq $12, %rbx #; mask alignment offset of src

jne .MBNG_NONALIGN #; if not aligned (dst/src offsets differ), calculate manually with scalar ops

#; Pre-loop, we need to run 1-3 frames "manually" without
#; SSE until the pointers reach 16-byte alignment

addq $4, %rdi #; dst++   (advance one 4-byte float)
addq $4, %rsi #; src++
decq %rdx #; nframes--

cmp $16, %rbx #; test if we've reached 16 byte alignment

cmp $4, %rdx #; if there are frames left, but less than 4
jnge .MBNG_NONALIGN #; we can't run SSE, fall back to scalar code

movaps (%rsi), %xmm0 #; source => xmm0 (aligned 4-float load)
addps (%rdi), %xmm0 #; mix with destination
movaps %xmm0, (%rdi) #; copy result to destination

addq $16, %rdi #; dst += 4 floats (16 bytes)
addq $16, %rsi #; src += 4 floats

subq $4, %rdx #; nframes -= 4

#; if there are remaining frames, the nonalign code will do nicely
#; for the rest 1-3 frames.

movss (%rsi), %xmm0 #; src => xmm0 (scalar)
addss (%rdi), %xmm0 #; xmm0 += dst
movss %xmm0, (%rdi) #; xmm0 => dst

.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
#;
#; SysV AMD64 ABI: %rdi = buf, %rsi = nframes, %xmm0 = gain.
#; Computes buf[i] *= gain in place for i in [0, nframes).

.globl x86_sse_apply_gain_to_buffer
.type x86_sse_apply_gain_to_buffer,@function

x86_sse_apply_gain_to_buffer:

#; %rdi float *buf
#; %rsi unsigned int nframes

#; %xmm1 float buf[0]  (scratch for the current sample)

#; if nframes == 0, go to end
movq %rsi, %rcx #; nframes => %rcx (loop counter)

#; set up the gain buffer (gain is already in %xmm0)
shufps $0x00, %xmm0, %xmm0 #; broadcast gain to all four lanes

#; Check for alignment

movq %rdi, %rdx #; buf => %rdx
andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
jz .AG_SSE #; if buffer IS 16-byte aligned, go straight to the SSE loop

#; Pre-loop: we iterate 1-3 times applying the gain with scalar SSE ops
#; so we reach a 16 byte aligned "buf" (=%rdi) value

#; Load next value from the buffer into %xmm1

#; increment buffer, decrement counter
addq $4, %rdi #; buf++;

decq %rcx #; nframes--
jz .AG_END #; if we run out of frames, we go to the end

addq $4, %rdx #; one non-aligned float (4 bytes) less

jne .AGLP_START #; if more non-aligned frames exist, we do a do-over

#; We have reached the 16 byte aligned "buf" ("rdi") value

#; Figure out how many loops we should do
movq %rcx, %rax #; copy remaining nframes to %rax for division
movq $0, %rdx #; zero %rdx: divq divides the 128-bit value %rdx:%rax

divq %rdi #; %rdx = remainder == 0
#; NOTE(review): %rdi is presumably loaded with the divisor 4 just before this
#; (with buf saved/restored around it) — confirm in the surrounding code.

#; %rax = SSE iterations (4 frames each)

subq $4, %rcx #; nframes -= 4

#; Next we need to post-process all remaining frames
#; the remaining frame count is in %rcx

#; if no remaining frames, jump to the end

andq $3, %rcx #; nframes % 4 = leftover tail frames

#; increment buffer, decrement counter
addq $4, %rdi #; buf++;

decq %rcx #; nframes--
jnz .AGPOST_START #; more tail frames left: loop again

.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
#;
#; SysV AMD64 ABI: %rdi = buf, %rsi = gain_vector, %rdx = nframes.
#; Computes buf[i] *= gain_vector[i] in place for i in [0, nframes).

.globl x86_sse_apply_gain_vector
.type x86_sse_apply_gain_vector,@function

x86_sse_apply_gain_vector:

#; %rsi float *gain_vector
#; %rdx unsigned int nframes

#; if nframes == 0 go to end

jz .AGA_SSE #; if buffers are aligned (to each other, 16-byte), jump to the SSE loop

#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
#; scalar pre-loop: one frame at a time until 16-byte alignment is reached

movss (%rdi), %xmm0 #; buf[i] => xmm0
movss (%rsi), %xmm1 #; gain value => xmm1
mulss %xmm1, %xmm0 #; xmm0 = xmm0 * xmm1
movss %xmm0, (%rdi) #; signal with gain => buf[i]

addq $4, %rdi #; buf++
addq $4, %rsi #; gain_vector++

#; There are frames left for sure, as that is checked in the beginning
#; and within the previous loop. BUT, there might be less than 4 frames

movq %rdx, %rax #; nframes => %rax
shr $2, %rax #; unsigned divide by 4: %rax = full SSE iterations
cmp $0, %rax #; (translated from Finnish: "if it works without this, nice") skip SSE loop when no full groups of 4

andq $3, %rdx #; Remaining tail frames are nframes & 3

#; Inside this loop, we know there are frames left to process
#; but because either there are < 4 frames left, or the buffers
#; are not aligned, we can't use the parallel SSE ops
movss (%rdi), %xmm0 #; buf[i] => xmm0
movss (%rsi), %xmm1 #; gain value => xmm1
mulss %xmm1, %xmm0 #; xmm0 = xmm0 * xmm1
movss %xmm0, (%rdi) #; signal with gain => buf[i]

decq %rdx #; nframes--

.size x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
#; float x86_sse_compute_peak(float *buf, long nframes, float current);
#;
#; SysV AMD64 ABI: %rdi = buf, %rsi = nframes, %xmm0 = current.
#; Returns max(current, |buf[i]|) over i in [0, nframes) in %xmm0.
#; Absolute value is taken by masking off the sign bit with abs_mask.

.globl x86_sse_compute_peak
.type x86_sse_compute_peak,@function

x86_sse_compute_peak:

#; %rdi float *buf
#; %rsi unsigned int nframes
#; %xmm0 float current (running peak)
#; %xmm1 float buf[0]  (scratch for the current sample)

#; if nframes == 0, go to end
movq %rsi, %rcx #; nframes => %rcx (loop counter)

#; create the "abs" mask (sign bit cleared in every lane) in %xmm2
movss abs_mask, %xmm2
#; NOTE(review): absolute addressing, not RIP-relative — abs_mask(%rip) would
#; be needed for PIC/PIE builds; confirm how this file is linked.
shufps $0x00, %xmm2, %xmm2 #; broadcast the mask to all four lanes

#; Check for alignment

#;movq 8(%rbp), %rdi #; buf  (leftover from the 32-bit stack-args version)
movq %rdi, %rdx #; buf => %rdx
andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
jz .CP_SSE #; if buffer IS 16-byte aligned, go straight to the SSE loop

#; Pre-loop: we iterate 1-3 times, doing scalar float comparison
#; so we reach a 16 byte aligned "buf" (=%rdi) value

#; Load next value from the buffer

#; increment buffer, decrement counter
addq $4, %rdi #; buf++;

decq %rcx #; nframes--
jz .CP_END #; if we run out of frames, we go to the end

addq $4, %rdx #; one non-aligned float (4 bytes) less

jne .LP_START #; if more non-aligned frames exist, we do a do-over

#; We have reached the 16 byte aligned "buf" ("rdi") value

#; Figure out how many loops we should do
movq %rcx, %rax #; copy remaining nframes to %rax for division

shr $2,%rax #; unsigned divide by 4: %rax = full SSE iterations

#; %rax = SSE iterations

#; current maximum is at %xmm0, but we need to ..
shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP lanes

#;prefetcht0 16(%rdi)   (disabled prefetch experiment)

#; Calculate the maximum value contained in the 4 FP's in %xmm0
#; (horizontal max via two shuffle+max steps)

shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
maxps %xmm1, %xmm0 #; maximums of the two pairs

shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)

#; now every float in %xmm0 is the same value, current maximum value

#; Next we need to post-process all remaining frames
#; the remaining frame count is in %rcx

#; if no remaining frames, jump to the end

andq $3, %rcx #; nframes % 4 = leftover tail frames

addq $4, %rdi #; buf++;

decq %rcx #; nframes--;

.size x86_sse_compute_peak, .-x86_sse_compute_peak