libs/ardour/sse_functions_64bit_win.s

   1 /*
   2  *
   3  * This program is free software; you can redistribute it and/or modify
   4  * it under the terms of the GNU General Public License as published by
   5  * the Free Software Foundation; either version 2 of the License, or
   6  * (at your option) any later version.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11  * GNU General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public License along
  14  * with this program; if not, write to the Free Software Foundation, Inc.,
  15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  16  */
  17
  18 #; Microsoft version of SSE sample processing functions
  19
  20 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
  21
  22 .globl x86_sse_mix_buffers_with_gain
  23         .def    x86_sse_mix_buffers_with_gain; .scl    2;      .type   32;
  24 .endef
  25
  26 x86_sse_mix_buffers_with_gain:
  27
  28 #; due to Microsoft calling convention
  29 #; %rcx float *dst
  30 #; %rdx float *src
  31 #; %r8 unsigned int nframes
  32 #; %xmm3 float  gain
  33
  34 #; due to System V AMD64 (Linux) calling convention
  35 #; %rdi float   *dst
  36 #; %rsi float   *src
  37 #; %rdx unsigned int nframes
  38 #; %xmm0 float  gain
  39
  40         pushq %rbp
  41         movq %rsp, %rbp
  42
  43         #; save the registers
  44         pushq %rbx #; must be preserved
  45         pushq %rcx
  46         pushq %rdx
  47         pushq %rdi #; must be preserved
  48         pushq %rsi #; must be preserved
  49
  50         #; to keep algorithms universal - move input params into Linux specific registers
  51         movq %rcx, %rdi
  52         movq %rdx, %rsi
  53         movq %r8, %rdx
  54         movss %xmm3, %xmm0
  55
  56         #; if nframes == 0, go to end
  57         cmp     $0, %rdx
  58         je      .MBWG_END
  59
  60         #; Check for alignment
  61
  62         movq %rdi, %rax
  63         andq $12, %rax #; mask alignment offset
  64
  65         movq %rsi, %rbx
  66         andq $12, %rbx #; mask alignment offset
  67
  68         cmp %rax, %rbx
  69         jne .MBWG_NONALIGN #; if not aligned, calculate manually
  70
  71         #; if we are aligned
  72         cmp $0, %rbx
  73         jz .MBWG_SSE
  74
  75         #; Pre-loop, we need to run 1-3 frames "manually" without
  76         #; SSE instructions
  77
  78 .MBWG_PRELOOP:
  79
  80         #; gain is already in %xmm0
  81         movss (%rsi), %xmm1
  82         mulss %xmm0, %xmm1
  83         addss (%rdi), %xmm1
  84         movss %xmm1, (%rdi)
  85
  86         addq $4, %rdi #; dst++
  87         addq $4, %rsi #; src++
  88         decq %rdx         #; nframes--
  89         jz .MBWG_END
  90
  91         addq $4, %rbx
  92
  93         cmp $16, %rbx #; test if we've reached 16 byte alignment
  94         jne .MBWG_PRELOOP
  95
  96
  97 .MBWG_SSE:
  98
  99         cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
 100         jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
 101
 102         #; gain is already in %xmm0
 103         shufps  $0x00, %xmm0, %xmm0
 104
 105
 106 .MBWG_SSELOOP:
 107
 108         movaps  (%rsi), %xmm1 #; source => xmm0
 109         mulps   %xmm0,  %xmm1 #; apply gain to source
 110         addps   (%rdi), %xmm1 #; mix with destination
 111         movaps  %xmm1, (%rdi) #; copy result to destination
 112
 113         addq $16, %rdi #; dst+=4
 114         addq $16, %rsi #; src+=4
 115
 116         subq $4, %rdx #; nframes-=4
 117         cmp $4, %rdx
 118         jge .MBWG_SSELOOP
 119
 120         cmp $0, %rdx
 121         je .MBWG_END
 122
 123         #; if there are remaining frames, the nonalign code will do nicely
 124         #; for the rest 1-3 frames.
 125
 126 .MBWG_NONALIGN:
 127         #; not aligned!
 128
 129         #; gain is already in %xmm0
 130
 131 .MBWG_NONALIGNLOOP:
 132
 133         movss (%rsi), %xmm1
 134         mulss %xmm0, %xmm1
 135         addss (%rdi), %xmm1
 136         movss %xmm1, (%rdi)
 137
 138         addq $4, %rdi
 139         addq $4, %rsi
 140
 141         decq %rdx
 142         jnz .MBWG_NONALIGNLOOP
 143
 144 .MBWG_END:
 145
 146         popq %rsi
 147         popq %rdi
 148         popq %rdx
 149         popq %rcx
 150         popq %rbx
 151
 152         #; return
 153         leave
 154         ret
 155
 156
 157 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
 158
 159 .globl x86_sse_mix_buffers_no_gain
 160         .def    x86_sse_mix_buffers_no_gain; .scl    2;   .type   32;
 161 .endef
 162
 163 x86_sse_mix_buffers_no_gain:
 164
 165 #; due to Microsoft calling convention
 166 #; %rcx float *dst
 167 #; %rdx float *src
 168 #; %r8 unsigned int nframes
 169
 170 #; due to System V AMD64 (Linux) calling convention
 171 #; %rdi float *dst
 172 #; %rsi float *src
 173 #; %rdx unsigned int nframes
 174
 175         pushq %rbp
 176         movq %rsp, %rbp
 177
 178         #; save the registers
 179         pushq %rbx #; must be preserved
 180         pushq %rcx
 181         pushq %rdx
 182         pushq %rdi #; must be preserved
 183         pushq %rsi #; must be preserved
 184
 185         #; to keep algorithms universal - move input params into Linux specific registers
 186         movq %rcx, %rdi
 187         movq %rdx, %rsi
 188         movq %r8, %rdx
 189
 190         #; the real function
 191
 192         #; if nframes == 0, go to end
 193         cmp     $0, %r8
 194         je      .MBNG_END
 195
 196         #; Check for alignment
 197
 198         movq %rdi, %rax
 199         andq $12, %rax #; mask alignment offset
 200
 201         movq %rsi, %rbx
 202         andq $12, %rbx #; mask alignment offset
 203
 204         cmp %rax, %rbx
 205         jne .MBNG_NONALIGN #; if not aligned, calculate manually
 206
 207         cmp $0, %rbx
 208         je .MBNG_SSE
 209
 210         #; Pre-loop, we need to run 1-3 frames "manually" without
 211         #; SSE instructions
 212
 213 .MBNG_PRELOOP:
 214
 215         movss (%rsi), %xmm0
 216         addss (%rdi), %xmm0
 217         movss %xmm0, (%rdi)
 218
 219         addq $4, %rdi #; dst++
 220         addq $4, %rsi #; src++
 221         decq %rdx         #; nframes--
 222         jz      .MBNG_END
 223         addq $4, %rbx
 224
 225         cmp $16, %rbx #; test if we've reached 16 byte alignment
 226         jne .MBNG_PRELOOP
 227
 228 .MBNG_SSE:
 229
 230         cmp $4, %rdx #; if there are frames left, but less than 4
 231         jnge .MBNG_NONALIGN #; we can't run SSE
 232
 233 .MBNG_SSELOOP:
 234
 235         movaps  (%rsi), %xmm0 #; source => xmm0
 236         addps   (%rdi), %xmm0 #; mix with destination
 237         movaps  %xmm0, (%rdi) #; copy result to destination
 238
 239         addq $16, %rdi #; dst+=4
 240         addq $16, %rsi #; src+=4
 241
 242         subq $4, %rdx #; nframes-=4
 243         cmp $4, %rdx
 244         jge .MBNG_SSELOOP
 245
 246         cmp $0, %rdx
 247         je .MBNG_END
 248
 249         #; if there are remaining frames, the nonalign code will do nicely
 250         #; for the rest 1-3 frames.
 251
 252 .MBNG_NONALIGN:
 253         #; not aligned!
 254
 255         movss (%rsi), %xmm0 #; src => xmm0
 256         addss (%rdi), %xmm0 #; xmm0 += dst
 257         movss %xmm0, (%rdi) #; xmm0 => dst
 258
 259         addq $4, %rdi
 260         addq $4, %rsi
 261
 262         decq %rdx
 263         jnz .MBNG_NONALIGN
 264
 265 .MBNG_END:
 266
 267         popq %rsi
 268         popq %rdi
 269         popq %rdx
 270         popq %rcx
 271         popq %rbx
 272
 273         #; return
 274         leave
 275         ret
 276
 277
 278 #; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
 279
 280 .globl x86_sse_apply_gain_to_buffer
 281         .def    x86_sse_apply_gain_to_buffer; .scl    2;   .type   32;
 282 .endef
 283
 284 x86_sse_apply_gain_to_buffer:
 285
 286 #; due to Microsoft calling convention
 287 #; %rcx float                   *buf    32(%rbp)
 288 #; %rdx unsigned int    nframes
 289 #; %xmm2 float                  gain
 290 #; %xmm1 float                  buf[0]
 291
 292 #; due to System V AMD64 (Linux) calling convention
 293 #; %rdi  float                  *buf    32(%rbp)
 294 #; %rsi  unsigned int   nframes
 295 #; %xmm0 float                  gain
 296 #; %xmm1 float                  buf[0]
 297
 298         pushq %rbp
 299         movq %rsp, %rbp
 300
 301         #; save the registers
 302         pushq %rcx
 303         pushq %rdi #; must be preserved
 304         pushq %rsi #; must be preserved
 305
 306         #; to keep algorithms universal - move input params into Linux specific registers
 307         movq %rcx, %rdi
 308         movq %rdx, %rsi
 309         movss %xmm2, %xmm0
 310
 311         #; the real function
 312
 313         #; if nframes == 0, go to end
 314         movq %rsi, %rcx #; nframes
 315         cmp     $0, %rcx
 316         je      .AG_END
 317
 318         #; set up the gain buffer (gain is already in %xmm0)
 319         shufps  $0x00, %xmm0, %xmm0
 320
 321         #; Check for alignment
 322
 323         movq %rdi, %rdx #; buf => %rdx
 324         andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 325         jz      .AG_SSE #; if buffer IS aligned
 326
 327         #; PRE-LOOP
 328         #; we iterate 1-3 times, doing normal x87 float comparison
 329         #; so we reach a 16 byte aligned "buf" (=%rdi) value
 330
 331 .AGLP_START:
 332
 333         #; Load next value from the buffer into %xmm1
 334         movss (%rdi), %xmm1
 335         mulss %xmm0, %xmm1
 336         movss %xmm1, (%rdi)
 337
 338         #; increment buffer, decrement counter
 339         addq $4, %rdi #; buf++;
 340
 341         decq %rcx   #; nframes--
 342         jz      .AG_END #; if we run out of frames, we go to the end
 343
 344         addq $4, %rdx #; one non-aligned byte less
 345         cmp $16, %rdx
 346         jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
 347
 348 .AG_SSE:
 349
 350         #; We have reached the 16 byte aligned "buf" ("rdi") value
 351
 352         #; Figure out how many loops we should do
 353         movq %rcx, %rax #; copy remaining nframes to %rax for division
 354
 355         shr $2,%rax #; unsigned divide by 4
 356
 357         #; %rax = SSE iterations
 358         cmp $0, %rax
 359         je .AGPOST_START
 360
 361 .AGLP_SSE:
 362
 363         movaps (%rdi), %xmm1
 364         mulps %xmm0, %xmm1
 365         movaps %xmm1, (%rdi)
 366
 367         addq $16, %rdi  #; buf + 4
 368         subq $4, %rcx   #; nframes-=4
 369
 370         decq %rax
 371         jnz .AGLP_SSE
 372
 373         #; Next we need to post-process all remaining frames
 374         #; the remaining frame count is in %rcx
 375
 376         andq $3, %rcx #; nframes % 4
 377         jz .AG_END
 378
 379 .AGPOST_START:
 380
 381         movss (%rdi), %xmm1
 382         mulss %xmm0, %xmm1
 383         movss %xmm1, (%rdi)
 384
 385         #; increment buffer, decrement counter
 386         addq $4, %rdi #; buf++;
 387
 388         decq %rcx   #; nframes--
 389         jnz     .AGPOST_START #; if we run out of frames, we go to the end
 390
 391 .AG_END:
 392
 393         popq %rsi
 394         popq %rdi
 395         popq %rcx
 396
 397         #; return
 398         leave
 399         ret
 400
 401 #; end proc
 402
 403
 404 #; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
 405
 406 .globl x86_sse_apply_gain_vector
 407         .def    x86_sse_apply_gain_vector; .scl    2;   .type   32;
 408 .endef
 409
 410
 411 x86_sse_apply_gain_vector:
 412
 413 #; due to Microsoft calling convention
 414 #; %rcx float *buf
 415 #; %rdx float *gain_vector
 416 #; %r8  unsigned int nframes
 417
 418 #; due to System V AMD64 (Linux) calling convention
 419 #; %rdi float *buf
 420 #; %rsi float *gain_vector
 421 #; %rdx unsigned int nframes
 422
 423         pushq %rbp
 424         movq %rsp, %rbp
 425
 426         #; save the registers
 427         pushq %rbx #; must be preserved
 428         pushq %rcx
 429         pushq %rdx
 430         pushq %rdi #; must be preserved
 431         pushq %rsi #; must be preserved
 432
 433         #; to keep algorithms universal - move input params into Linux specific registers
 434         movq %rcx, %rdi
 435         movq %rdx, %rsi
 436         movq %r8, %rdx
 437
 438         #; if nframes == 0 go to end
 439         cmp $0, %rdx
 440         je .AGA_END
 441
 442         #; Check alignment
 443         movq %rdi, %rax
 444         andq $12, %rax
 445
 446         movq %rsi, %rbx
 447         andq $12, %rbx
 448
 449         cmp %rax,%rbx
 450         jne .AGA_ENDLOOP
 451
 452         cmp $0, %rax
 453         jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
 454
 455 #; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
 456 .AGA_ALIGNLOOP:
 457
 458         movss (%rdi), %xmm0 #; buf => xmm0
 459         movss (%rsi), %xmm1 #; gain value => xmm1
 460         mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
 461         movss %xmm0, (%rdi) #; signal with gain => buf
 462
 463         decq %rdx
 464         jz .AGA_END
 465
 466         addq $4, %rdi #; buf++
 467         addq $4, %rsi #; gab++
 468
 469         addq $4, %rax
 470         cmp $16, %rax
 471         jne .AGA_ALIGNLOOP
 472
 473 #; There are frames left for sure, as that is checked in the beginning
 474 #; and within the previous loop. BUT, there might be less than 4 frames
 475 #; to process
 476
 477 .AGA_SSE:
 478         movq %rdx, %rax #; nframes => %rax
 479         shr $2, %rax #; unsigned divide by 4
 480
 481         cmp $0, %rax
 482         je .AGA_ENDLOOP
 483
 484 .AGA_SSELOOP:
 485         movaps (%rdi), %xmm0
 486         movaps (%rsi), %xmm1
 487         mulps %xmm1, %xmm0
 488         movaps %xmm0, (%rdi)
 489
 490         addq $16, %rdi
 491         addq $16, %rsi
 492
 493         decq %rax
 494         jnz .AGA_SSELOOP
 495
 496         andq $3, %rdx #; Remaining frames are nframes & 3
 497         jz .AGA_END
 498
 499
 500 #; Inside this loop, we know there are frames left to process
 501 #; but because either there are < 4 frames left, or the buffers
 502 #; are not aligned, we can't use the parallel SSE ops
 503 .AGA_ENDLOOP:
 504         movss (%rdi), %xmm0 #; buf => xmm0
 505         movss (%rsi), %xmm1 #; gain value => xmm1
 506         mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
 507         movss %xmm0, (%rdi) #; signal with gain => buf
 508
 509         addq $4,%rdi
 510         addq $4,%rsi
 511         decq %rdx #; nframes--
 512         jnz .AGA_ENDLOOP
 513
 514 .AGA_END:
 515
 516         popq %rsi
 517         popq %rdi
 518         popq %rdx
 519         popq %rcx
 520         popq %rbx
 521
 522         leave
 523         ret
 524
 525 #; end proc
 526
 527
 528 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
 529
 530 .globl x86_sse_compute_peak
 531         .def    x86_sse_compute_peak; .scl    2;   .type   32;
 532 .endef
 533
 534
 535 x86_sse_compute_peak:
 536
 537 #; due to Microsoft calling convention
 538 #; %rcx float*          buf     32(%rbp)
 539 #; %rdx unsigned int    nframes
 540 #; %xmm2 float                  current
 541 #; %xmm1 float                  buf[0]
 542
 543 #; due to System V AMD64 (Linux) calling convention
 544 #; %rdi  float*         buf     32(%rbp)
 545 #; %rsi  unsigned int   nframes
 546 #; %xmm0 float                  current
 547 #; %xmm1 float                  buf[0]
 548
 549         pushq %rbp
 550         movq %rsp, %rbp
 551
 552         #; save registers
 553         pushq %rcx
 554         pushq %rdi #; must be preserved
 555         pushq %rsi #; must be preserved
 556
 557         #; to keep algorithms universal - move input params into Linux specific registers
 558         movq %rcx, %rdi
 559         movq %rdx, %rsi
 560         movss %xmm2, %xmm0
 561
 562         #; if nframes == 0, go to end
 563         movq %rsi, %rcx #; nframes
 564         cmp     $0, %rcx
 565         je      .CP_END
 566
 567         #; create the "abs" mask in %xmm2
 568         pushq   $2147483647
 569         movss   (%rsp), %xmm2
 570         addq    $8, %rsp
 571         shufps  $0x00, %xmm2, %xmm2
 572
 573         #; Check for alignment
 574
 575         #;movq 8(%rbp), %rdi #; buf
 576         movq %rdi, %rdx #; buf => %rdx
 577         andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 578         jz      .CP_SSE #; if buffer IS aligned
 579
 580         #; PRE-LOOP
 581         #; we iterate 1-3 times, doing normal x87 float comparison
 582         #; so we reach a 16 byte aligned "buf" (=%rdi) value
 583
 584 .LP_START:
 585
 586         #; Load next value from the buffer
 587         movss (%rdi), %xmm1
 588         andps %xmm2, %xmm1
 589         maxss %xmm1, %xmm0
 590
 591         #; increment buffer, decrement counter
 592         addq $4, %rdi #; buf++;
 593
 594         decq %rcx   #; nframes--
 595         jz      .CP_END #; if we run out of frames, we go to the end
 596
 597         addq $4, %rdx #; one non-aligned byte less
 598         cmp $16, %rdx
 599         jne .LP_START #; if more non-aligned frames exist, we do a do-over
 600
 601 .CP_SSE:
 602
 603         #; We have reached the 16 byte aligned "buf" ("rdi") value
 604
 605         #; Figure out how many loops we should do
 606         movq %rcx, %rax #; copy remaining nframes to %rax for division
 607
 608         shr $2,%rax #; unsigned divide by 4
 609         jz .POST_START
 610
 611         #; %rax = SSE iterations
 612
 613         #; current maximum is at %xmm0, but we need to ..
 614         shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
 615
 616         #;prefetcht0 16(%rdi)
 617
 618 .LP_SSE:
 619
 620         movaps (%rdi), %xmm1
 621         andps %xmm2, %xmm1
 622         maxps %xmm1, %xmm0
 623
 624         addq $16, %rdi
 625
 626         subq $4, %rcx #; nframes-=4
 627
 628         decq %rax
 629         jnz .LP_SSE
 630
 631         #; Calculate the maximum value contained in the 4 FP's in %xmm0
 632         movaps %xmm0, %xmm1
 633         shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
 634         maxps  %xmm1, %xmm0 #; maximums of the two pairs
 635         movaps %xmm0, %xmm1
 636         shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
 637         maxps  %xmm1, %xmm0
 638
 639         #; now every float in %xmm0 is the same value, current maximum value
 640
 641         #; Next we need to post-process all remaining frames
 642         #; the remaining frame count is in %rcx
 643
 644         #; if no remaining frames, jump to the end
 645
 646         andq $3, %rcx #; nframes % 4
 647         jz .CP_END
 648
 649 .POST_START:
 650
 651         movss (%rdi), %xmm1
 652         andps %xmm2, %xmm1
 653         maxss %xmm1, %xmm0
 654
 655         addq $4, %rdi   #; buf++;
 656
 657         decq %rcx               #; nframes--;
 658         jnz .POST_START
 659
 660 .CP_END:
 661
 662         #; restore registers
 663         popq %rsi
 664         popq %rdi
 665         popq %rcx
 666
 667         #; return value is in xmm0
 668
 669         #; return
 670         leave
 671         ret
 672
 673 #; end proc