 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.

 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.

 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#; Microsoft version of SSE sample processing functions

#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
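#; A scalar C sketch of the operation (a reader's gloss, inferred from
#; the SSE code below):
#;
#;     for (unsigned int i = 0; i < nframes; i++)
#;         dst[i] += src[i] * gain;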
.globl x86_sse_mix_buffers_with_gain
.def x86_sse_mix_buffers_with_gain; .scl 2; .type 32;
.endef

x86_sse_mix_buffers_with_gain:
#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes
#; %xmm3 float gain

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *dst
#; %rsi float *src
#; %rdx unsigned int nframes
#; %xmm0 float gain
pushq %rbx #; must be preserved
pushq %rdi #; must be preserved
pushq %rsi #; must be preserved

#; to keep the algorithms universal, move the input params into the Linux-specific registers
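#; Under the two conventions listed above, the parameter moves would
#; presumably be (a sketch, assuming the standard register mapping):
#;     movq  %rcx, %rdi   #; dst
#;     movq  %rdx, %rsi   #; src
#;     movq  %r8,  %rdx   #; nframes
#;     movss %xmm3, %xmm0 #; gain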
#; if nframes == 0, go to end

#; Check for alignment
andq $12, %rax #; mask alignment offset
andq $12, %rbx #; mask alignment offset
jne .MBWG_NONALIGN #; if not aligned, calculate manually
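#; Note: float addresses are always multiples of 4, so (ptr & 12) is the
#; offset (0, 4, 8 or 12) within a 16-byte block; movaps below requires
#; 16-byte alignment, hence the pre-loop that follows.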
#; Pre-loop: we need to run 1-3 frames "manually", without
#; SSE instructions

#; gain is already in %xmm0

addq $4, %rdi #; dst++
addq $4, %rsi #; src++
decq %rdx #; nframes--
cmp $16, %rbx #; test if we've reached 16 byte alignment

cmp $4, %rdx #; we know it's not zero, but if it's not >= 4, then
jnge .MBWG_NONALIGN #; we jump straight to the "normal" code

#; gain is already in %xmm0
shufps $0x00, %xmm0, %xmm0
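#; shufps with selector 0x00 copies lane 0 into all four lanes, turning
#; xmm0 = [gain ? ? ?] into [gain gain gain gain], so one mulps applies
#; the gain to four samples at a time.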
movaps (%rsi), %xmm1 #; source => xmm1
mulps %xmm0, %xmm1 #; apply gain to source
addps (%rdi), %xmm1 #; mix with destination
movaps %xmm1, (%rdi) #; copy result to destination

addq $16, %rdi #; dst += 4 floats (16 bytes)
addq $16, %rsi #; src += 4 floats (16 bytes)

subq $4, %rdx #; nframes -= 4
#; if there are remaining frames, the nonalign code will do nicely
#; for the remaining 1-3 frames.

#; gain is already in %xmm0

jnz .MBWG_NONALIGNLOOP
#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
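#; A scalar C sketch of the operation (a reader's gloss, inferred from
#; the code below):
#;
#;     for (unsigned int i = 0; i < nframes; i++)
#;         dst[i] += src[i];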
.globl x86_sse_mix_buffers_no_gain
.def x86_sse_mix_buffers_no_gain; .scl 2; .type 32;
.endef

x86_sse_mix_buffers_no_gain:
#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *dst
#; %rsi float *src
#; %rdx unsigned int nframes
#; save the registers
pushq %rbx #; must be preserved
pushq %rdi #; must be preserved
pushq %rsi #; must be preserved

#; to keep the algorithms universal, move the input params into the Linux-specific registers
#; if nframes == 0, go to end

#; Check for alignment
andq $12, %rax #; mask alignment offset
andq $12, %rbx #; mask alignment offset
jne .MBNG_NONALIGN #; if not aligned, calculate manually
#; Pre-loop: we need to run 1-3 frames "manually", without
#; SSE instructions

addq $4, %rdi #; dst++
addq $4, %rsi #; src++
decq %rdx #; nframes--
cmp $16, %rbx #; test if we've reached 16 byte alignment

cmp $4, %rdx #; if there are frames left, but less than 4,
jnge .MBNG_NONALIGN #; we can't run SSE
movaps (%rsi), %xmm0 #; source => xmm0
addps (%rdi), %xmm0 #; mix with destination
movaps %xmm0, (%rdi) #; copy result to destination

addq $16, %rdi #; dst += 4 floats (16 bytes)
addq $16, %rsi #; src += 4 floats (16 bytes)

subq $4, %rdx #; nframes -= 4
#; if there are remaining frames, the nonalign code will do nicely
#; for the remaining 1-3 frames.

movss (%rsi), %xmm0 #; src => xmm0
addss (%rdi), %xmm0 #; xmm0 += dst
movss %xmm0, (%rdi) #; xmm0 => dst
#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
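#; A scalar C sketch of the operation (a reader's gloss, inferred from
#; the code below):
#;
#;     for (unsigned int i = 0; i < nframes; i++)
#;         buf[i] *= gain;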
.globl x86_sse_apply_gain_to_buffer
.def x86_sse_apply_gain_to_buffer; .scl 2; .type 32;
.endef

x86_sse_apply_gain_to_buffer:
#; due to Microsoft calling convention
#; %rcx float *buf 32(%rbp)
#; %rdx unsigned int nframes
#; %xmm2 float gain
#; %xmm1 float buf[0]

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *buf 32(%rbp)
#; %rsi unsigned int nframes
#; %xmm0 float gain
#; %xmm1 float buf[0]
#; save the registers
pushq %rdi #; must be preserved
pushq %rsi #; must be preserved

#; to keep the algorithms universal, move the input params into the Linux-specific registers

#; if nframes == 0, go to end
movq %rsi, %rcx #; nframes
#; set up the gain buffer (gain is already in %xmm0)
shufps $0x00, %xmm0, %xmm0 #; spread gain to all four lanes
#; Check for alignment

movq %rdi, %rdx #; buf => %rdx
andq $12, %rdx #; mask bits 2 & 3, result = 0, 4, 8 or 12
jz .AG_SSE #; if buffer IS aligned
#; we iterate 1-3 times, doing a scalar float multiply per frame,
#; until we reach a 16 byte aligned "buf" (=%rdi) value

#; Load next value from the buffer into %xmm1

#; increment buffer, decrement counter
addq $4, %rdi #; buf++;

decq %rcx #; nframes--
jz .AG_END #; if we run out of frames, we go to the end

addq $4, %rdx #; one non-aligned frame (4 bytes) less
cmp $16, %rdx #; reached the next 16 byte boundary?
jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
#; We have reached the 16 byte aligned "buf" ("rdi") value

#; Figure out how many loops we should do
movq %rcx, %rax #; copy remaining nframes to %rax for division

shr $2, %rax #; unsigned divide by 4

#; %rax = SSE iterations
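#; e.g. nframes = 13: %rax = 13 >> 2 = 3 SSE iterations (12 frames);
#; the remaining 13 & 3 = 1 frame is handled by the post-loop below.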
addq $16, %rdi #; buf += 4 floats (16 bytes)
subq $4, %rcx #; nframes -= 4
#; Next we need to post-process all remaining frames
#; the remaining frame count is in %rcx

andq $3, %rcx #; nframes % 4

#; increment buffer, decrement counter
addq $4, %rdi #; buf++;

decq %rcx #; nframes--
jnz .AGPOST_START #; while frames remain, keep looping
#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
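#; A scalar C sketch of the operation (a reader's gloss, inferred from
#; the code below):
#;
#;     for (unsigned int i = 0; i < nframes; i++)
#;         buf[i] *= gain_vector[i];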
.globl x86_sse_apply_gain_vector
.def x86_sse_apply_gain_vector; .scl 2; .type 32;
.endef

x86_sse_apply_gain_vector:
#; due to Microsoft calling convention
#; %rcx float *buf
#; %rdx float *gain_vector
#; %r8 unsigned int nframes

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *buf
#; %rsi float *gain_vector
#; %rdx unsigned int nframes
#; save the registers
pushq %rbx #; must be preserved
pushq %rdi #; must be preserved
pushq %rsi #; must be preserved

#; to keep the algorithms universal, move the input params into the Linux-specific registers

#; if nframes == 0, go to end
jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop

#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount

movss (%rdi), %xmm0 #; buf => xmm0
movss (%rsi), %xmm1 #; gain value => xmm1
mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
movss %xmm0, (%rdi) #; signal with gain => buf

addq $4, %rdi #; buf++
addq $4, %rsi #; gain_vector++
#; There are frames left for sure, as that is checked in the beginning
#; and within the previous loop. BUT, there might be fewer than 4 frames
#; left to process.

movq %rdx, %rax #; nframes => %rax
shr $2, %rax #; unsigned divide by 4

andq $3, %rdx #; remaining frames are nframes & 3
#; Inside this loop, we know there are frames left to process,
#; but because either there are < 4 frames left, or the buffers
#; are not aligned, we can't use the parallel SSE ops

movss (%rdi), %xmm0 #; buf => xmm0
movss (%rsi), %xmm1 #; gain value => xmm1
mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
movss %xmm0, (%rdi) #; signal with gain => buf

decq %rdx #; nframes--
#; float x86_sse_compute_peak(float *buf, long nframes, float current);
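#; A scalar C sketch of the operation (a reader's gloss, inferred from
#; the code below; fabsf/fmaxf stand in for the mask-and-max SSE idiom):
#;
#;     for (long i = 0; i < nframes; i++)
#;         current = fmaxf(current, fabsf(buf[i]));
#;     return current;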
.globl x86_sse_compute_peak
.def x86_sse_compute_peak; .scl 2; .type 32;
.endef

x86_sse_compute_peak:
#; due to Microsoft calling convention
#; %rcx float* buf 32(%rbp)
#; %rdx unsigned int nframes
#; %xmm2 float current
#; %xmm1 float buf[0]

#; due to System V AMD64 (Linux) calling convention
#; %rdi float* buf 32(%rbp)
#; %rsi unsigned int nframes
#; %xmm0 float current
#; %xmm1 float buf[0]
pushq %rdi #; must be preserved
pushq %rsi #; must be preserved

#; to keep the algorithms universal, move the input params into the Linux-specific registers

#; if nframes == 0, go to end
movq %rsi, %rcx #; nframes
#; create the "abs" mask in %xmm2
shufps $0x00, %xmm2, %xmm2 #; spread the mask to all four lanes
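#; %xmm2 presumably holds 0x7fffffff in every lane (the usual "abs" mask);
#; andps with it clears each float's sign bit, i.e. a four-wide fabsf().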
#; Check for alignment

#;movq 8(%rbp), %rdi #; buf
movq %rdi, %rdx #; buf => %rdx
andq $12, %rdx #; mask bits 2 & 3, result = 0, 4, 8 or 12
jz .CP_SSE #; if buffer IS aligned
#; we iterate 1-3 times, doing a scalar float compare per frame,
#; until we reach a 16 byte aligned "buf" (=%rdi) value

#; Load next value from the buffer

#; increment buffer, decrement counter
addq $4, %rdi #; buf++;

decq %rcx #; nframes--
jz .CP_END #; if we run out of frames, we go to the end

addq $4, %rdx #; one non-aligned frame (4 bytes) less
cmp $16, %rdx #; reached the next 16 byte boundary?
jne .LP_START #; if more non-aligned frames exist, we do a do-over
#; We have reached the 16 byte aligned "buf" ("rdi") value

#; Figure out how many loops we should do
movq %rcx, %rax #; copy remaining nframes to %rax for division

shr $2, %rax #; unsigned divide by 4

#; %rax = SSE iterations

#; current maximum is at %xmm0, but we need to ..
shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's

#;prefetcht0 16(%rdi)

subq $4, %rcx #; nframes -= 4
#; Calculate the maximum value contained in the 4 FP's in %xmm0
movaps %xmm0, %xmm1 #; copy to %xmm1 for shuffling
shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
maxps %xmm1, %xmm0 #; maximums of the two pairs
movaps %xmm0, %xmm1 #; copy again
shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
maxps %xmm1, %xmm0 #; maximum of all four lanes
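#; Worked trace of the reduction, with xmm0 = [a b c d]:
#;     shufps 0x4e + maxps => [max(a,c) max(b,d) max(a,c) max(b,d)]
#;     shufps 0xb1 + maxps => max(a,b,c,d) in every lane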
#; now every float in %xmm0 is the same value, the current maximum value
#; Next we need to post-process all remaining frames
#; the remaining frame count is in %rcx

#; if no remaining frames, jump to the end

andq $3, %rcx #; nframes % 4

addq $4, %rdi #; buf++;

decq %rcx #; nframes--;

#; return value is in xmm0