libs/ardour/sse_functions.s

   1 /*
   2  * Copyright (C) 2005-2006 Sampo Savolainen <v2@iki.fi>
   3  * Copyright (C) 2006-2008 Paul Davis <paul@linuxaudiosystems.com>
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18  */
  19
  20
  21 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
  22
  23 .globl x86_sse_mix_buffers_with_gain
  24         .type   x86_sse_mix_buffers_with_gain,@function
  25
  26 x86_sse_mix_buffers_with_gain:
  27 #; 8(%ebp)      = float *dst    = %edi
  28 #; 12(%ebp) = float *src        = %esi
  29 #; 16(%ebp) = long      nframes = %ecx
  30 #; 20(%ebp) = float     gain    = st(0)
  31
  32         pushl %ebp
  33         movl %esp, %ebp
  34
  35         #; save the registers
  36 #;      pushl %eax
  37         pushl %ebx
  38 #;      pushl %ecx
  39         pushl %edi
  40         pushl %esi
  41
  42         #; if nframes == 0, go to end
  43         movl 16(%ebp), %ecx #; nframes
  44         cmp     $0, %ecx
  45         je      .MBWG_END
  46
  47         #; Check for alignment
  48
  49         movl 8(%ebp), %edi  #; dst
  50         movl 12(%ebp), %esi #; src
  51
  52         movl %edi, %eax
  53         andl $12, %eax #; mask alignemnt offset
  54
  55         movl %esi, %ebx
  56         andl $12, %ebx #; mask alignment offset
  57
  58         cmp %eax, %ebx
  59         jne .MBWG_NONALIGN #; if not aligned, calculate manually
  60
  61         #; if we are aligned
  62         cmp $0, %ebx
  63         jz .MBWG_SSE
  64
  65         #; Pre-loop, we need to run 1-3 frames "manually" without
  66         #; SSE instructions
  67
  68         movss 20(%ebp), %xmm1 #; xmm1
  69
  70 .MBWG_PRELOOP:
  71
  72         movss (%esi), %xmm0
  73         mulss %xmm1, %xmm0
  74         addss (%edi), %xmm0
  75         movss %xmm0, (%edi)
  76
  77         addl $4, %edi #; dst++
  78         addl $4, %esi #; src++
  79         decl %ecx         #; nframes--
  80         jz .MBWG_END
  81
  82 #;      cmp $0, %ecx
  83 #;      je .MBWG_END #; if we run out of frames, go to end
  84
  85         addl $4, %ebx
  86
  87         cmp $16, %ebx #; test if we've reached 16 byte alignment
  88         jne .MBWG_PRELOOP
  89
  90
  91 .MBWG_SSE:
  92
  93         cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then
  94         jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
  95
  96         #; copy gain to fill %xmm1
  97         movss   20(%ebp), %xmm1
  98     shufps  $0x00, %xmm1, %xmm1
  99
 100
 101 .MBWG_SSELOOP:
 102
 103         movaps  (%esi), %xmm0 #; source => xmm0
 104         mulps   %xmm1,  %xmm0 #; apply gain to source
 105         addps   (%edi), %xmm0 #; mix with destination
 106         movaps  %xmm0, (%edi) #; copy result to destination
 107
 108         addl $16, %edi #; dst+=4
 109         addl $16, %esi #; src+=4
 110
 111         subl $4, %ecx #; nframes-=4
 112         cmp $4, %ecx
 113         jge .MBWG_SSELOOP
 114
 115         cmp $0, %ecx
 116         je .MBWG_END
 117
 118         #; if there are remaining frames, the nonalign code will do nicely
 119         #; for the rest 1-3 frames.
 120
 121 .MBWG_NONALIGN:
 122         #; not aligned!
 123
 124         movss 20(%ebp), %xmm1 #; gain => xmm1
 125
 126 .MBWG_NONALIGNLOOP:
 127
 128         movss (%esi), %xmm0
 129         mulss %xmm1, %xmm0
 130         addss (%edi), %xmm0
 131         movss %xmm0, (%edi)
 132
 133         addl $4, %edi
 134         addl $4, %esi
 135
 136         decl %ecx
 137         jnz .MBWG_NONALIGNLOOP
 138
 139 .MBWG_END:
 140
 141         popl %esi
 142         popl %edi
 143 #;      popl %ecx
 144         popl %ebx
 145 #;      popl %eax
 146
 147         #; return
 148         leave
 149         ret
 150
 151 .size   x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
 152
 153
 154
 155
 156 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
 157
 158 .globl x86_sse_mix_buffers_no_gain
 159         .type   x86_sse_mix_buffers_no_gain,@function
 160
 161 x86_sse_mix_buffers_no_gain:
 162 #; 8(%ebp)      = float *dst    = %edi
 163 #; 12(%ebp) = float *src        = %esi
 164 #; 16(%ebp) = long      nframes = %ecx
 165
 166         pushl %ebp
 167         movl %esp, %ebp
 168
 169         #; save the registers
 170 #;      pushl %eax
 171         pushl %ebx
 172 #;      pushl %ecx
 173         pushl %edi
 174         pushl %esi
 175
 176         #; the real function
 177
 178         #; if nframes == 0, go to end
 179         movl 16(%ebp), %ecx #; nframes
 180         cmp     $0, %ecx
 181         je      .MBNG_END
 182
 183         #; Check for alignment
 184
 185         movl 8(%ebp), %edi  #; dst
 186         movl 12(%ebp), %esi #; src
 187
 188         movl %edi, %eax
 189         andl $12, %eax #; mask alignemnt offset
 190
 191         movl %esi, %ebx
 192         andl $12, %ebx #; mask alignment offset
 193
 194         cmp %eax, %ebx
 195         jne .MBNG_NONALIGN #; if not aligned, calculate manually
 196
 197         cmp $0, %ebx
 198         je .MBNG_SSE
 199
 200         #; Pre-loop, we need to run 1-3 frames "manually" without
 201         #; SSE instructions
 202
 203 .MBNG_PRELOOP:
 204
 205         movss (%esi), %xmm0
 206         addss (%edi), %xmm0
 207         movss %xmm0, (%edi)
 208
 209         addl $4, %edi #; dst++
 210         addl $4, %esi #; src++
 211         decl %ecx         #; nframes--
 212         jz      .MBNG_END
 213         addl $4, %ebx
 214
 215         cmp $16, %ebx #; test if we've reached 16 byte alignment
 216         jne .MBNG_PRELOOP
 217
 218 .MBNG_SSE:
 219
 220         cmp $4, %ecx #; if there are frames left, but less than 4
 221         jnge .MBNG_NONALIGN #; we can't run SSE
 222
 223 .MBNG_SSELOOP:
 224
 225         movaps  (%esi), %xmm0 #; source => xmm0
 226         addps   (%edi), %xmm0 #; mix with destination
 227         movaps  %xmm0, (%edi) #; copy result to destination
 228
 229         addl $16, %edi #; dst+=4
 230         addl $16, %esi #; src+=4
 231
 232         subl $4, %ecx #; nframes-=4
 233         cmp $4, %ecx
 234         jge .MBNG_SSELOOP
 235
 236         cmp $0, %ecx
 237         je .MBNG_END
 238
 239         #; if there are remaining frames, the nonalign code will do nicely
 240         #; for the rest 1-3 frames.
 241
 242 .MBNG_NONALIGN:
 243         #; not aligned!
 244
 245         movss (%esi), %xmm0 #; src => xmm0
 246         addss (%edi), %xmm0 #; xmm0 += dst
 247         movss %xmm0, (%edi) #; xmm0 => dst
 248
 249         addl $4, %edi
 250         addl $4, %esi
 251
 252         decl %ecx
 253         jnz .MBNG_NONALIGN
 254
 255 .MBNG_END:
 256
 257         popl %esi
 258         popl %edi
 259 #;      popl %ecx
 260         popl %ebx
 261 #;      popl %eax
 262
 263         #; return
 264         leave
 265         ret
 266
 267 .size   x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
 268
 269
 270
 271
 272 #; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
 273
 274 .globl x86_sse_apply_gain_to_buffer
 275         .type   x86_sse_apply_gain_to_buffer,@function
 276
 277 x86_sse_apply_gain_to_buffer:
 278 #; 8(%ebp)      = float *buf    = %edi
 279 #; 12(%ebp) = long      nframes = %ecx
 280 #; 16(%ebp) = float     gain    = st(0)
 281
 282         pushl %ebp
 283         movl %esp, %ebp
 284
 285         #; save %edi
 286         pushl %edi
 287
 288         #; the real function
 289
 290         #; if nframes == 0, go to end
 291         movl 12(%ebp), %ecx #; nframes
 292         cmp     $0, %ecx
 293         je      .AG_END
 294
 295         #; create the gain buffer in %xmm1
 296         movss   16(%ebp), %xmm1
 297         shufps  $0x00, %xmm1, %xmm1
 298
 299         #; Check for alignment
 300
 301         movl 8(%ebp), %edi #; buf
 302         movl %edi, %edx #; buf => %edx
 303         andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 304         jz      .AG_SSE #; if buffer IS aligned
 305
 306         #; PRE-LOOP
 307         #; we iterate 1-3 times, doing normal x87 float comparison
 308         #; so we reach a 16 byte aligned "buf" (=%edi) value
 309
 310 .AGLP_START:
 311
 312         #; Load next value from the buffer
 313         movss (%edi), %xmm0
 314         mulss %xmm1, %xmm0
 315         movss %xmm0, (%edi)
 316
 317         #; increment buffer, decrement counter
 318         addl $4, %edi #; buf++;
 319
 320         decl %ecx   #; nframes--
 321         jz      .AG_END #; if we run out of frames, we go to the end
 322
 323         addl $4, %edx #; one non-aligned byte less
 324         cmp $16, %edx
 325         jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
 326
 327 .AG_SSE:
 328
 329         #; We have reached the 16 byte aligned "buf" ("edi") value
 330
 331         #; Figure out how many loops we should do
 332         movl %ecx, %eax #; copy remaining nframes to %eax for division
 333         movl $0, %edx   #; 0 the edx register
 334
 335
 336         pushl %edi
 337         movl $4, %edi
 338         divl %edi #; %edx = remainder == 0
 339         popl %edi
 340
 341         #; %eax = SSE iterations
 342         cmp $0, %eax
 343         je .AGPOST_START
 344
 345
 346 .AGLP_SSE:
 347
 348         movaps (%edi), %xmm0
 349         mulps %xmm1, %xmm0
 350         movaps %xmm0, (%edi)
 351
 352         addl $16, %edi
 353 #;      subl $4, %ecx   #; nframes-=4
 354
 355         decl %eax
 356         jnz .AGLP_SSE
 357
 358         #; Next we need to post-process all remaining frames
 359         #; the remaining frame count is in %ecx
 360
 361         #; if no remaining frames, jump to the end
 362 #;      cmp $0, %ecx
 363         andl $3, %ecx #; nframes % 4
 364         je .AG_END
 365
 366 .AGPOST_START:
 367
 368         movss (%edi), %xmm0
 369         mulss %xmm1, %xmm0
 370         movss %xmm0, (%edi)
 371
 372         #; increment buffer, decrement counter
 373         addl $4, %edi #; buf++;
 374
 375         decl %ecx   #; nframes--
 376         jnz     .AGPOST_START #; if we run out of frames, we go to the end
 377
 378 .AG_END:
 379
 380
 381         popl %edi
 382
 383         #; return
 384         leave
 385         ret
 386
 387 .size   x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
 388 #; end proc
 389
 390
 391
 392 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
 393
 394 .globl x86_sse_compute_peak
 395         .type   x86_sse_compute_peak,@function
 396
 397 x86_sse_compute_peak:
 398 #; 8(%ebp)      = float *buf    = %edi
 399 #; 12(%ebp) = long      nframes = %ecx
 400 #; 16(%ebp) = float     current = st(0)
 401
 402         pushl %ebp
 403         movl %esp, %ebp
 404
 405         #; save %edi
 406         pushl %edi
 407
 408         #; the real function
 409
 410         #; Load "current" in xmm0
 411         movss 16(%ebp), %xmm0
 412
 413         #; if nframes == 0, go to end
 414         movl 12(%ebp), %ecx #; nframes
 415         cmp     $0, %ecx
 416         je      .CP_END
 417
 418         #; create the "abs" mask in %xmm2
 419         pushl   $2147483647
 420         movss   (%esp), %xmm2
 421         addl    $4, %esp
 422         shufps  $0x00, %xmm2, %xmm2
 423
 424         #; Check for alignment
 425
 426         movl 8(%ebp), %edi #; buf
 427         movl %edi, %edx #; buf => %edx
 428         andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 429         jz      .CP_SSE #; if buffer IS aligned
 430
 431         #; PRE-LOOP
 432         #; we iterate 1-3 times, doing normal x87 float comparison
 433         #; so we reach a 16 byte aligned "buf" (=%edi) value
 434
 435 .LP_START:
 436
 437         #; Load next value from the buffer
 438         movss (%edi), %xmm1
 439         andps %xmm2, %xmm1
 440         maxss %xmm1, %xmm0
 441
 442         #; increment buffer, decrement counter
 443         addl $4, %edi #; buf++;
 444
 445         decl %ecx   #; nframes--
 446         jz      .CP_END #; if we run out of frames, we go to the end
 447
 448         addl $4, %edx #; one non-aligned byte less
 449         cmp $16, %edx
 450         jne .LP_START #; if more non-aligned frames exist, we do a do-over
 451
 452 .CP_SSE:
 453
 454         #; We have reached the 16 byte aligned "buf" ("edi") value
 455
 456         #; Figure out how many loops we should do
 457         movl %ecx, %eax #; copy remaining nframes to %eax for division
 458
 459         shr $2,%eax #; unsigned divide by 4
 460         jz .POST_START
 461
 462         #; %eax = SSE iterations
 463
 464         #; current maximum is at %xmm0, but we need to ..
 465         shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
 466
 467         #;prefetcht0 16(%edi)
 468
 469 .LP_SSE:
 470
 471         movaps (%edi), %xmm1
 472         andps %xmm2, %xmm1
 473         maxps %xmm1, %xmm0
 474
 475         addl $16, %edi
 476
 477         decl %eax
 478         jnz .LP_SSE
 479
 480         #; Calculate the maximum value contained in the 4 FP's in %xmm0
 481         movaps %xmm0, %xmm1
 482         shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
 483         maxps  %xmm1, %xmm0 #; maximums of the two pairs
 484         movaps %xmm0, %xmm1
 485         shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
 486         maxps  %xmm1, %xmm0
 487
 488         #; now every float in %xmm0 is the same value, current maximum value
 489
 490         #; Next we need to post-process all remaining frames
 491         #; the remaining frame count is in %ecx
 492
 493         #; if no remaining frames, jump to the end
 494
 495         andl $3, %ecx #; nframes % 4
 496         jz .CP_END
 497
 498 .POST_START:
 499
 500         movss (%edi), %xmm1
 501         andps %xmm2, %xmm1
 502         maxss %xmm1, %xmm0
 503
 504         addl $4, %edi   #; buf++;
 505
 506         decl %ecx               #; nframes--;
 507         jnz .POST_START
 508
 509 .CP_END:
 510
 511         #; Load the value from xmm0 to the float stack for returning
 512         movss %xmm0, 16(%ebp)
 513         flds 16(%ebp)
 514
 515         popl %edi
 516
 517         #; return
 518         leave
 519         ret
 520
 521 .size   x86_sse_compute_peak, .-x86_sse_compute_peak
 522 #; end proc
 523
 524 #ifdef __ELF__
#; Emit an empty .note.GNU-stack section (no flags, %progbits). By GNU
#; toolchain convention this marks the object as not needing an executable stack.
#; NOTE(review): if this file is assembled without the C preprocessor, the
#; surrounding #ifdef/#endif lines are plain comments and the directive is
#; unconditional - confirm against the build rules.
 525 .section .note.GNU-stack,"",%progbits
 526 #endif
 527
 528