libs/ardour/sse_functions.s

   1 /*
   2     Copyright (C) 2005 Paul Davis
   3
   4     This program is free software; you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation; either version 2 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program; if not, write to the Free Software
  16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17
  18     $Id$
  19 */
  20
  21
  #; ---------------------------------------------------------------------
  #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
  #;
  #; For every i in [0, nframes):  dst[i] += src[i] * gain
  #;
  #; Syntax: GNU as, AT&T.   ABI: 32-bit cdecl (all arguments on the stack).
  #; In:     8(%ebp) = dst, 12(%ebp) = src, 16(%ebp) = nframes, 20(%ebp) = gain
  #; Out:    nothing (dst is updated in place)
  #; Clobb:  %eax, %ecx, %xmm0, %xmm1, flags
  #;         (%ebx, %esi, %edi are saved and restored)
  #; Notes:  * nframes <= 0 is a no-op.
  #;         * movaps is used once both pointers sit on a 16 byte boundary;
  #;           the alignment test only looks at address bits 2-3, i.e. it
  #;           relies on both pointers being at least 4 byte aligned.
  #; ---------------------------------------------------------------------

.globl x86_sse_mix_buffers_with_gain
        .type   x86_sse_mix_buffers_with_gain,@function

x86_sse_mix_buffers_with_gain:

        pushl   %ebp
        movl    %esp, %ebp

        #; callee-saved registers used below
        pushl   %ebx
        pushl   %edi
        pushl   %esi

        #; %ecx = frames left. Zero AND negative counts leave immediately:
        #; a negative count would otherwise fail the signed ">= 4" test and
        #; fall into the scalar loop, whose decl/jnz only stops after
        #; wrapping through the whole 32 bit range.
        movl    16(%ebp), %ecx
        testl   %ecx, %ecx
        jle     .MBWG_END

        movl    8(%ebp), %edi           #; %edi = dst
        movl    12(%ebp), %esi          #; %esi = src

        #; Offset of each pointer inside its 16 byte block (0, 4, 8 or 12)
        movl    %edi, %eax
        andl    $12, %eax
        movl    %esi, %ebx
        andl    $12, %ebx

        #; Different offsets: dst and src can never be 16 byte aligned at
        #; the same time, so everything is done one float at a time.
        cmpl    %eax, %ebx
        jne     .MBWG_NONALIGN

        #; Same offset and it is zero: already aligned
        testl   %ebx, %ebx
        jz      .MBWG_SSE

        #; Pre-loop: 1-3 scalar frames until %ebx (the shared offset)
        #; reaches 16, i.e. both pointers are on a 16 byte boundary.
        movss   20(%ebp), %xmm1         #; xmm1[0] = gain

.MBWG_PRELOOP:

        movss   (%esi), %xmm0
        mulss   %xmm1, %xmm0
        addss   (%edi), %xmm0
        movss   %xmm0, (%edi)

        addl    $4, %edi                #; dst++
        addl    $4, %esi                #; src++
        decl    %ecx                    #; nframes--
        jz      .MBWG_END               #; ran out of frames before aligning

        addl    $4, %ebx
        cmpl    $16, %ebx
        jne     .MBWG_PRELOOP

.MBWG_SSE:

        #; Invariant: %ecx > 0, %edi and %esi are 16 byte aligned.
        cmpl    $4, %ecx
        jl      .MBWG_NONALIGN          #; fewer than 4 frames: scalar tail

        #; broadcast gain to all four lanes of %xmm1
        movss   20(%ebp), %xmm1
        shufps  $0x00, %xmm1, %xmm1

.MBWG_SSELOOP:

        movaps  (%esi), %xmm0           #; 4 source frames
        mulps   %xmm1, %xmm0            #; * gain
        addps   (%edi), %xmm0           #; + destination
        movaps  %xmm0, (%edi)

        addl    $16, %edi               #; dst += 4
        addl    $16, %esi               #; src += 4

        subl    $4, %ecx                #; nframes -= 4
        cmpl    $4, %ecx
        jge     .MBWG_SSELOOP

        testl   %ecx, %ecx
        jz      .MBWG_END

        #; 1-3 frames remain; the scalar loop below finishes them.

.MBWG_NONALIGN:

        #; Scalar path. Invariant: %ecx > 0 on entry.
        movss   20(%ebp), %xmm1         #; xmm1[0] = gain

.MBWG_NONALIGNLOOP:

        movss   (%esi), %xmm0
        mulss   %xmm1, %xmm0
        addss   (%edi), %xmm0
        movss   %xmm0, (%edi)

        addl    $4, %edi
        addl    $4, %esi

        decl    %ecx
        jnz     .MBWG_NONALIGNLOOP

.MBWG_END:

        popl    %esi
        popl    %edi
        popl    %ebx

        leave
        ret

.size   x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
 153
 154
 155
 156
 #; ---------------------------------------------------------------------
 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
 #;
 #; For every i in [0, nframes):  dst[i] += src[i]
 #;
 #; Syntax: GNU as, AT&T.   ABI: 32-bit cdecl (all arguments on the stack).
 #; In:     8(%ebp) = dst, 12(%ebp) = src, 16(%ebp) = nframes
 #; Out:    nothing (dst is updated in place)
 #; Clobb:  %eax, %ecx, %xmm0, flags
 #;         (%ebx, %esi, %edi are saved and restored)
 #; Notes:  * nframes <= 0 is a no-op.
 #;         * movaps is used once both pointers sit on a 16 byte boundary;
 #;           the alignment test only looks at address bits 2-3, i.e. it
 #;           relies on both pointers being at least 4 byte aligned.
 #; ---------------------------------------------------------------------

.globl x86_sse_mix_buffers_no_gain
        .type   x86_sse_mix_buffers_no_gain,@function

x86_sse_mix_buffers_no_gain:

        pushl   %ebp
        movl    %esp, %ebp

        #; callee-saved registers used below
        pushl   %ebx
        pushl   %edi
        pushl   %esi

        #; %ecx = frames left. Zero AND negative counts leave immediately:
        #; a negative count would otherwise fail the signed ">= 4" test and
        #; fall into the scalar loop, whose decl/jnz only stops after
        #; wrapping through the whole 32 bit range.
        movl    16(%ebp), %ecx
        testl   %ecx, %ecx
        jle     .MBNG_END

        movl    8(%ebp), %edi           #; %edi = dst
        movl    12(%ebp), %esi          #; %esi = src

        #; Offset of each pointer inside its 16 byte block (0, 4, 8 or 12)
        movl    %edi, %eax
        andl    $12, %eax
        movl    %esi, %ebx
        andl    $12, %ebx

        #; Different offsets: dst and src can never be 16 byte aligned at
        #; the same time, so everything is done one float at a time.
        cmpl    %eax, %ebx
        jne     .MBNG_NONALIGN

        #; Same offset and it is zero: already aligned
        testl   %ebx, %ebx
        jz      .MBNG_SSE

        #; Pre-loop: 1-3 scalar frames until %ebx (the shared offset)
        #; reaches 16, i.e. both pointers are on a 16 byte boundary.

.MBNG_PRELOOP:

        movss   (%esi), %xmm0
        addss   (%edi), %xmm0
        movss   %xmm0, (%edi)

        addl    $4, %edi                #; dst++
        addl    $4, %esi                #; src++
        decl    %ecx                    #; nframes--
        jz      .MBNG_END               #; ran out of frames before aligning

        addl    $4, %ebx
        cmpl    $16, %ebx
        jne     .MBNG_PRELOOP

.MBNG_SSE:

        #; Invariant: %ecx > 0, %edi and %esi are 16 byte aligned.
        cmpl    $4, %ecx
        jl      .MBNG_NONALIGN          #; fewer than 4 frames: scalar tail

.MBNG_SSELOOP:

        movaps  (%esi), %xmm0           #; 4 source frames
        addps   (%edi), %xmm0           #; + destination
        movaps  %xmm0, (%edi)

        addl    $16, %edi               #; dst += 4
        addl    $16, %esi               #; src += 4

        subl    $4, %ecx                #; nframes -= 4
        cmpl    $4, %ecx
        jge     .MBNG_SSELOOP

        testl   %ecx, %ecx
        jz      .MBNG_END

        #; 1-3 frames remain; the scalar loop below finishes them.

.MBNG_NONALIGN:

        #; Scalar path, one frame per iteration. Invariant: %ecx > 0.
        movss   (%esi), %xmm0           #; src => xmm0
        addss   (%edi), %xmm0           #; xmm0 += dst
        movss   %xmm0, (%edi)           #; xmm0 => dst

        addl    $4, %edi
        addl    $4, %esi

        decl    %ecx
        jnz     .MBNG_NONALIGN

.MBNG_END:

        popl    %esi
        popl    %edi
        popl    %ebx

        leave
        ret

.size   x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
 269
 270
 271
 272
 #; ---------------------------------------------------------------------
 #; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
 #;
 #; For every i in [0, nframes):  buf[i] *= gain
 #;
 #; Syntax: GNU as, AT&T.   ABI: 32-bit cdecl (all arguments on the stack).
 #; In:     8(%ebp) = buf, 12(%ebp) = nframes, 16(%ebp) = gain
 #; Out:    nothing (buf is updated in place)
 #; Clobb:  %eax, %ecx, %edx, %xmm0, %xmm1, flags (%edi is saved/restored)
 #; Notes:  * nframes <= 0 is a no-op.
 #;         * movaps is used once buf sits on a 16 byte boundary; the
 #;           alignment test only looks at address bits 2-3, i.e. it
 #;           relies on buf being at least 4 byte aligned.
 #; ---------------------------------------------------------------------

.globl x86_sse_apply_gain_to_buffer
        .type   x86_sse_apply_gain_to_buffer,@function

x86_sse_apply_gain_to_buffer:

        pushl   %ebp
        movl    %esp, %ebp

        #; save %edi
        pushl   %edi

        #; %ecx = frames left. Zero AND negative counts leave immediately;
        #; a negative count would otherwise be read as a huge unsigned
        #; number of SSE iterations below.
        movl    12(%ebp), %ecx
        testl   %ecx, %ecx
        jle     .AG_END

        #; broadcast gain to all four lanes of %xmm1
        movss   16(%ebp), %xmm1
        shufps  $0x00, %xmm1, %xmm1

        #; Check for alignment
        movl    8(%ebp), %edi           #; %edi = buf
        movl    %edi, %edx
        andl    $12, %edx               #; offset in 16 byte block: 0, 4, 8, 12
        jz      .AG_SSE                 #; buffer IS aligned

        #; PRE-LOOP
        #; 1-3 scalar SSE multiplies, until %edx (the offset) reaches 16
        #; and "buf" (= %edi) is therefore 16 byte aligned.

.AGLP_START:

        movss   (%edi), %xmm0
        mulss   %xmm1, %xmm0
        movss   %xmm0, (%edi)

        addl    $4, %edi                #; buf++

        decl    %ecx                    #; nframes--
        jz      .AG_END                 #; ran out of frames before aligning

        addl    $4, %edx                #; one unaligned frame less
        cmpl    $16, %edx
        jne     .AGLP_START

.AG_SSE:

        #; Invariant: %ecx > 0 and %edi is 16 byte aligned.
        #; %eax = number of whole 4-frame groups (unsigned divide by 4)
        movl    %ecx, %eax
        shrl    $2, %eax
        jz      .AGPOST_START           #; fewer than 4 frames: scalar tail

.AGLP_SSE:

        movaps  (%edi), %xmm0
        mulps   %xmm1, %xmm0
        movaps  %xmm0, (%edi)

        addl    $16, %edi               #; buf += 4

        decl    %eax
        jnz     .AGLP_SSE

        #; Frames left over after the 4-frame groups
        andl    $3, %ecx                #; nframes % 4
        jz      .AG_END

.AGPOST_START:

        #; Scalar tail. Invariant: 1 <= %ecx <= 3.
        movss   (%edi), %xmm0
        mulss   %xmm1, %xmm0
        movss   %xmm0, (%edi)

        addl    $4, %edi                #; buf++

        decl    %ecx                    #; nframes--
        jnz     .AGPOST_START

.AG_END:

        popl    %edi

        leave
        ret

.size   x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
#; end proc
 390
 391
 392
 #; ---------------------------------------------------------------------
 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
 #;
 #; Returns max(current, |buf[0]|, ..., |buf[nframes-1]|).
 #;
 #; Syntax: GNU as, AT&T.   ABI: 32-bit cdecl (all arguments on the stack).
 #; In:     8(%ebp) = buf, 12(%ebp) = nframes, 16(%ebp) = current
 #; Out:    st(0) = peak value
 #; Clobb:  %eax, %ecx, %edx, %xmm0, %xmm1, %xmm2, flags
 #;         (%edi is saved/restored); the "current" argument slot at
 #;         16(%ebp) is overwritten with the result.
 #; Notes:  * nframes <= 0 returns "current" unchanged.
 #;         * movaps is used once buf sits on a 16 byte boundary; the
 #;           alignment test only looks at address bits 2-3, i.e. it
 #;           relies on buf being at least 4 byte aligned.
 #;         * The sign-clearing mask is built from an immediate on the
 #;           stack, so the function contains no absolute data reference.
 #; ---------------------------------------------------------------------

.globl x86_sse_compute_peak
        .type   x86_sse_compute_peak,@function

x86_sse_compute_peak:

        pushl   %ebp
        movl    %esp, %ebp

        #; save %edi
        pushl   %edi

        #; Load "current" in xmm0; this is the running maximum
        movss   16(%ebp), %xmm0

        #; %ecx = frames left. Zero AND negative counts return "current".
        movl    12(%ebp), %ecx
        testl   %ecx, %ecx
        jle     .CP_END

        #; create the "abs" mask (all bits set except the sign bit) in
        #; every lane of %xmm2, going through a temporary stack slot
        pushl   $0x7fffffff
        movss   (%esp), %xmm2
        addl    $4, %esp
        shufps  $0x00, %xmm2, %xmm2

        #; Check for alignment
        movl    8(%ebp), %edi           #; %edi = buf
        movl    %edi, %edx
        andl    $12, %edx               #; offset in 16 byte block: 0, 4, 8, 12
        jz      .CP_SSE                 #; buffer IS aligned

        #; PRE-LOOP
        #; 1-3 scalar frames, until %edx (the offset) reaches 16 and
        #; "buf" (= %edi) is therefore 16 byte aligned.

.LP_START:

        movss   (%edi), %xmm1           #; next sample
        andps   %xmm2, %xmm1            #; clear sign bit => fabs
        maxss   %xmm1, %xmm0

        addl    $4, %edi                #; buf++

        decl    %ecx                    #; nframes--
        jz      .CP_END                 #; ran out of frames before aligning

        addl    $4, %edx                #; one unaligned frame less
        cmpl    $16, %edx
        jne     .LP_START

.CP_SSE:

        #; Invariant: %ecx > 0 and %edi is 16 byte aligned.
        #; %eax = number of whole 4-frame groups (unsigned divide by 4)
        movl    %ecx, %eax
        shrl    $2, %eax
        jz      .POST_START             #; fewer than 4 frames: scalar tail

        #; the running maximum is in lane 0 only; copy it to all 4 lanes
        shufps  $0x00, %xmm0, %xmm0

.LP_SSE:

        movaps  (%edi), %xmm1
        andps   %xmm2, %xmm1            #; fabs of 4 samples
        maxps   %xmm1, %xmm0            #; per-lane running maximum

        addl    $16, %edi               #; buf += 4

        decl    %eax
        jnz     .LP_SSE

        #; Reduce the 4 per-lane maxima to a single value
        movaps  %xmm0, %xmm1
        shufps  $0x4e, %xmm1, %xmm1     #; swap the two pairs (1234 => 3412)
        maxps   %xmm1, %xmm0            #; maximums of the two pairs
        movaps  %xmm0, %xmm1
        shufps  $0xb1, %xmm1, %xmm1     #; swap inside the pairs (1234 => 2143)
        maxps   %xmm1, %xmm0

        #; now every lane of %xmm0 holds the current maximum

        #; Frames left over after the 4-frame groups
        andl    $3, %ecx                #; nframes % 4
        jz      .CP_END

.POST_START:

        #; Scalar tail. Invariant: 1 <= %ecx <= 3.
        movss   (%edi), %xmm1
        andps   %xmm2, %xmm1
        maxss   %xmm1, %xmm0

        addl    $4, %edi                #; buf++

        decl    %ecx                    #; nframes--
        jnz     .POST_START

.CP_END:

        #; cdecl returns floats in st(0): bounce xmm0 through memory,
        #; reusing the "current" argument slot as scratch space
        movss   %xmm0, 16(%ebp)
        flds    16(%ebp)

        popl    %edi

        leave
        ret

.size   x86_sse_compute_peak, .-x86_sse_compute_peak
#; end proc
 526
 527
 528
 529