/*
 * Copyright (C) 2015 Paul Davis <paul@linuxaudiosystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#; Microsoft version of AVX sample processing functions

#; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
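#;
#; For orientation, a rough scalar C equivalent of this routine (reference
#; only, not part of the build; the "_ref" name is illustrative and it
#; assumes dst and src do not overlap):
#;
#;   void mix_buffers_with_gain_ref (float *dst, const float *src,
#;                                   unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           dst[i] += src[i] * gain;
#;       }
#;   }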
.globl x86_sse_avx_mix_buffers_with_gain
    .def x86_sse_avx_mix_buffers_with_gain; .scl 2; .type 32; .endef
x86_sse_avx_mix_buffers_with_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes
#; %xmm3 float gain

    #; save the registers
    pushq %rbx #; must be preserved

    #; move the gain to %xmm0 for convenience
    #; if nframes == 0, go to end

    #; Check for alignment

    movq %rcx, %rax #; dst => %rax
    andq $28, %rax #; mask the alignment offset of dst

    movq %rdx, %rbx #; src => %rbx
    andq $28, %rbx #; mask the alignment offset of src

    cmp %rax, %rbx #; compare the offsets
    jne .MBWG_NONALIGN #; if the buffers are not aligned relative to each other, process manually
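    #; Worked example for the mask above: $28 is 0b11100, so for
    #; dst = 0x100014 the masked offset is 0x14 (20 bytes past the previous
    #; 32 byte boundary). The pre-loop below can only reach mutual 32 byte
    #; alignment when dst and src carry the SAME offset, which is what the
    #; comparison checks.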
    #; Pre-loop: we need to run 1-7 frames "manually" without
    #; AVX instructions

    #; gain is already in %xmm0
    addq $4, %rcx #; dst++
    addq $4, %rdx #; src++

    cmp $32, %rbx #; test if we've reached 32 byte alignment
    cmp $8, %r8 #; we know nframes is not zero, but if it is not >= 8
    jl .MBWG_NONALIGN #; we jump straight to the "normal" (non-AVX) code

    #; set up the gain buffer (gain is already in %xmm0)
    vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread the single float across the low 128 bits of %ymm0
    vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the low 128 bits of %ymm0 to its high 128 bits
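    #; e.g. with gain = 0.5f: after vshufps the low four floats of %ymm0
    #; are all 0.5f, and after vperm2f128 all eight floats are 0.5f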
    vmovaps (%rdx), %ymm1 #; src => %ymm1
    vmulps %ymm0, %ymm1, %ymm2 #; apply gain to src
    vaddps (%rcx), %ymm2, %ymm1 #; mix with dst
    vmovaps %ymm1, (%rcx) #; copy the result back to dst

    addq $32, %rcx #; dst += 8 floats
    addq $32, %rdx #; src += 8 floats

    subq $8, %r8 #; nframes -= 8
    #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
    vzeroupper

    #; if there are remaining frames, the non-aligned code below will
    #; handle the remaining 1-7 frames

    #; gain is already in %xmm0

    jnz .MBWG_NONALIGNLOOP
#; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
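#;
#; For orientation, a rough scalar C equivalent (reference only, not part
#; of the build; the "_ref" name is illustrative and it assumes
#; non-overlapping buffers):
#;
#;   void mix_buffers_no_gain_ref (float *dst, const float *src,
#;                                 unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           dst[i] += src[i];
#;       }
#;   }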
.globl x86_sse_avx_mix_buffers_no_gain
    .def x86_sse_avx_mix_buffers_no_gain; .scl 2; .type 32; .endef
x86_sse_avx_mix_buffers_no_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes

    #; save the registers
    pushq %rbx #; must be preserved
    #; if nframes == 0, go to end

    #; Check for alignment

    movq %rcx, %rax #; dst => %rax
    andq $28, %rax #; mask the alignment offset of dst

    movq %rdx, %rbx #; src => %rbx
    andq $28, %rbx #; mask the alignment offset of src

    cmp %rax, %rbx #; compare the offsets
    jne .MBNG_NONALIGN #; if the buffers are not aligned relative to each other, process manually

    cmp $0, %rbx #; check whether we are already 32 byte aligned
    je .MBNG_AVX #; aligned at 32 bytes, proceed to AVX
    #; Pre-loop: we need to run 1-7 frames "manually" without
    #; AVX instructions

    addq $4, %rcx #; dst++
    addq $4, %rdx #; src++

    decq %r8 #; nframes--

    addq $4, %rbx #; we are 4 bytes closer to 32 byte alignment
    cmp $32, %rbx #; test if we've reached 32 byte alignment
    cmp $8, %r8 #; if there are frames left, but fewer than 8
    jl .MBNG_NONALIGN #; we can't run AVX
    vmovaps (%rdx), %ymm0 #; src => %ymm0
    vaddps (%rcx), %ymm0, %ymm1 #; mix with dst
    vmovaps %ymm1, (%rcx) #; copy the result back to dst

    addq $32, %rcx #; dst += 8 floats
    addq $32, %rdx #; src += 8 floats

    subq $8, %r8 #; nframes -= 8
    #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
    vzeroupper

    #; if there are remaining frames, the non-aligned code below will
    #; handle the remaining 1-7 frames

    movss (%rdx), %xmm0 #; src => %xmm0
    addss (%rcx), %xmm0 #; %xmm0 += dst
    movss %xmm0, (%rcx) #; %xmm0 => dst
#; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes);
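#;
#; For orientation, a rough scalar C equivalent (reference only, not part
#; of the build; the "_ref" name is illustrative): with non-overlapping
#; buffers this is memcpy (dst, src, nframes * sizeof (float)):
#;
#;   void copy_vector_ref (float *dst, const float *src, unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           dst[i] = src[i];
#;       }
#;   }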
.globl x86_sse_avx_copy_vector
    .def x86_sse_avx_copy_vector; .scl 2; .type 32; .endef
x86_sse_avx_copy_vector:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes

    #; save the registers
    pushq %rbx #; must be preserved
    #; if nframes == 0, go to end

    #; Check for alignment

    movq %rcx, %rax #; dst => %rax
    andq $28, %rax #; mask the alignment offset of dst

    movq %rdx, %rbx #; src => %rbx
    andq $28, %rbx #; mask the alignment offset of src

    cmp %rax, %rbx #; compare the offsets
    jne .CB_NONALIGN #; if the buffers are not aligned relative to each other, process manually

    cmp $0, %rbx #; check whether we are already 32 byte aligned
    je .CB_AVX #; aligned at 32 bytes, proceed to AVX
    #; Pre-loop: we need to run 1-7 frames "manually" without
    #; AVX instructions

    addq $4, %rcx #; dst++
    addq $4, %rdx #; src++

    decq %r8 #; nframes--

    addq $4, %rbx #; we are 4 bytes closer to 32 byte alignment
    cmp $32, %rbx #; test if we've reached 32 byte alignment

    cmp $8, %r8 #; if there are frames left, but fewer than 8
    jl .CB_NONALIGN #; we can't run AVX
    vmovaps (%rdx), %ymm0 #; src => %ymm0
    vmovaps %ymm0, (%rcx) #; copy to dst

    addq $32, %rcx #; dst += 8 floats
    addq $32, %rdx #; src += 8 floats

    subq $8, %r8 #; nframes -= 8
    #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
    vzeroupper

    #; if there are remaining frames, the non-aligned code below will
    #; handle the remaining 1-7 frames

    movss (%rdx), %xmm0 #; src => %xmm0
    movss %xmm0, (%rcx) #; %xmm0 => dst
#; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
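#;
#; For orientation, a rough scalar C equivalent (reference only, not part
#; of the build; the "_ref" name is illustrative):
#;
#;   void apply_gain_to_buffer_ref (float *buf, unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           buf[i] *= gain;
#;       }
#;   }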
.globl x86_sse_avx_apply_gain_to_buffer
    .def x86_sse_avx_apply_gain_to_buffer; .scl 2; .type 32; .endef
x86_sse_avx_apply_gain_to_buffer:

#; due to Microsoft calling convention
#; %rcx float *buf
#; %rdx unsigned int nframes
#; %xmm2 float gain

    #; move the gain to %xmm0 for convenience
    #; if nframes == 0, go to end

    #; Check for alignment

    movq %rcx, %r8 #; buf => %r8
    andq $28, %r8 #; check alignment with mask 11100
    jz .AG_AVX #; if the buffer IS aligned
    #; we iterate 1-7 times, processing a single float each time,
    #; until "buf" (%rcx) reaches a 32 byte aligned address
.AGLP_START:

    #; Load the next value from the buffer into %xmm1
    movss (%rcx), %xmm1
    mulss %xmm0, %xmm1 #; apply the gain
    movss %xmm1, (%rcx) #; store the result back

    #; increment buffer, decrement counter
    addq $4, %rcx #; buf++

    decq %rdx #; nframes--
    jz .AG_END #; if we run out of frames, we go to the end

    addq $4, %r8 #; we are 4 bytes closer to 32 byte alignment
    cmp $32, %r8 #; have we reached 32 byte alignment?
    jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
    #; We have reached the 32 byte aligned "buf" (%rcx) value
    #; use AVX instructions

    #; Figure out how many loops we should do
    movq %rdx, %rax #; copy remaining nframes to %rax for division

    shr $3, %rax #; unsigned divide by 8

    #; %rax = AVX iterations
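    #; e.g. nframes = 27: %rax = 27 >> 3 = 3 AVX iterations (3 * 8 = 24
    #; frames), leaving 27 - 24 = 3 frames for the post-loop below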
    #; set up the gain buffer (gain is already in %xmm0)
    vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread the single float across the low 128 bits of %ymm0
    vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the low 128 bits of %ymm0 to its high 128 bits

    vmovaps (%rcx), %ymm1 #; buf => %ymm1
    vmulps %ymm0, %ymm1, %ymm2 #; apply the gain
    vmovaps %ymm2, (%rcx) #; store the result back to buf

    addq $32, %rcx #; buf += 8 floats
    subq $8, %rdx #; nframes -= 8
    #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
    vzeroupper

    #; Next we need to post-process all remaining frames;
    #; the remaining frame count is in %rdx

.AGPOST_START:
    movss (%rcx), %xmm1 #; load the next value
    mulss %xmm0, %xmm1 #; apply the gain
    movss %xmm1, (%rcx) #; store it back

    #; increment buffer, decrement counter
    addq $4, %rcx #; buf++

    decq %rdx #; nframes--
    jnz .AGPOST_START #; if frames remain, do another iteration
#; float x86_sse_avx_compute_peak (float *buf, long nframes, float current);
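#;
#; For orientation, a rough scalar C equivalent (reference only, not part
#; of the build; the "_ref" name is illustrative; fabsf/fmaxf come from
#; <math.h>):
#;
#;   float compute_peak_ref (const float *buf, long nframes, float current)
#;   {
#;       for (long i = 0; i < nframes; i++) {
#;           current = fmaxf (current, fabsf (buf[i]));
#;       }
#;       return current;
#;   }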
.globl x86_sse_avx_compute_peak
    .def x86_sse_avx_compute_peak; .scl 2; .type 32; .endef
x86_sse_avx_compute_peak:

#; due to Microsoft calling convention
#; %rcx float *buf
#; %rdx unsigned int nframes
#; %xmm2 float current

    #; move current max to %xmm0 for convenience

    #; if nframes == 0, go to end
    #; create the "abs" mask in %xmm3
    #; it will be used to discard the sign bit
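    #; the mask is 0x7fffffff in each 32 bit lane: IEEE 754 floats keep the
    #; sign in the top bit, so e.g. -3.5f (0xc0600000) & 0x7fffffff
    #; = 3.5f (0x40600000)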
    #; Check for alignment

    movq %rcx, %r8 #; buf => %r8
    andq $28, %r8 #; check alignment with mask 11100
    jz .CP_AVX #; if the buffer IS aligned
    #; we iterate 1-7 times, doing a scalar float comparison each time,
    #; until "buf" (%rcx) reaches a 32 byte aligned address
.LP_START:

    #; Load the next value from the buffer
    movss (%rcx), %xmm1
    andps %xmm3, %xmm1 #; mask out the sign bit
    maxss %xmm1, %xmm0 #; update the running maximum

    #; increment buffer, decrement counter
    addq $4, %rcx #; buf++

    decq %rdx #; nframes--
    jz .CP_END #; if we run out of frames, we go to the end

    addq $4, %r8 #; we are 4 bytes closer to 32 byte alignment
    cmp $32, %r8 #; have we reached 32 byte alignment?
    jne .LP_START #; if more non-aligned frames exist, we do a do-over
    #; We have reached the 32 byte aligned "buf" (%rcx) value

    #; Figure out how many loops we should do
    movq %rdx, %rax #; copy remaining nframes to %rax for division

    shr $3, %rax #; unsigned divide by 8

    #; %rax = AVX iterations
    #; the current maximum is in %xmm0, but we need to broadcast it to the whole %ymm0 register
    vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread the single float across the low 128 bits of %ymm0
    vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the low 128 bits of %ymm0 to its high 128 bits

    #; broadcast the sign mask to the whole %ymm3 register
    vshufps $0x00, %ymm3, %ymm3, %ymm3 #; spread the single float across the low 128 bits of %ymm3
    vperm2f128 $0x00, %ymm3, %ymm3, %ymm3 #; copy the low 128 bits of %ymm3 to its high 128 bits
    vmovaps (%rcx), %ymm1 #; buf => %ymm1
    vandps %ymm3, %ymm1, %ymm1 #; mask out the sign bits
    vmaxps %ymm1, %ymm0, %ymm0 #; update the running maximums

    addq $32, %rcx #; buf += 8 floats
    subq $8, %rdx #; nframes -= 8
    #; Calculate the maximum value contained in the 8 floats of %ymm0
    vshufps $0x4e, %ymm0, %ymm0, %ymm1 #; swap the left and right pairs (1234 => 3412) in each 128 bit half
    vmaxps %ymm1, %ymm0, %ymm0 #; pairwise maximums; at most 4 unique values remain
    vshufps $0xb1, %ymm0, %ymm0, %ymm1 #; swap the floats within each pair (1234 => 2143) in each 128 bit half
    vmaxps %ymm1, %ymm0, %ymm0 #; pairwise maximums; at most 2 unique values remain
    vperm2f128 $0x01, %ymm0, %ymm0, %ymm1 #; swap the 128 bit halves
    vmaxps %ymm1, %ymm0, %ymm0 #; now all 8 elements hold the maximum

    #; now every float in %ymm0 is the same value, the current maximum
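    #; A worked example of the reduction, for one 128 bit half holding {1,3,2,4}:
    #;   $0x4e: {1,3,2,4} vs {2,4,1,3} -> vmaxps gives {2,4,2,4}
    #;   $0xb1: {2,4,2,4} vs {4,2,4,2} -> vmaxps gives {4,4,4,4}
    #; the final vperm2f128/vmaxps pair folds the two halves the same way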
    #; Next we need to post-process all remaining frames;
    #; the remaining frame count is in %rdx

    #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
    vzeroupper

    #; if no remaining frames, jump to the end

    movss (%rcx), %xmm1 #; load the next value
    andps %xmm3, %xmm1 #; mask out the sign bit
    maxss %xmm1, %xmm0 #; update the running maximum

    addq $4, %rcx #; buf++

    decq %rdx #; nframes--
    #; return value is in %xmm0