libs/ardour/sse_functions_64bit_win.s

   1 /*
   2     Copyright (C) 2005-2006 Paul Davis, John Rigg
   3
   4     This program is free software; you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation; either version 2 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program; if not, write to the Free Software
  16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17
  18         Author: Sampo Savolainen
  19         64-bit conversion: John Rigg
  20
  21     $Id$
  22 */
  23
  24 #; Microsoft version of SSE sample processing functions
  25
  26 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
  27
  28 .globl x86_sse_mix_buffers_with_gain
  29         .def    x86_sse_mix_buffers_with_gain; .scl    2;      .type   32;
  30 .endef
  31
  32 x86_sse_mix_buffers_with_gain:
  33
  34 #; due to Microsoft calling convention
  35 #; %rcx float *dst
  36 #; %rdx float *src
  37 #; %r8 unsigned int nframes
  38 #; %xmm3 float  gain
  39
  40 #; due to System V AMD64 (Linux) calling convention
  41 #; %rdi float   *dst
  42 #; %rsi float   *src
  43 #; %rdx unsigned int nframes
  44 #; %xmm0 float  gain
  45
  46         pushq %rbp
  47         movq %rsp, %rbp
  48
  49         #; save the registers
  50         pushq %rbx #; must be preserved
  51         pushq %rcx
  52         pushq %rdx
  53         pushq %rdi #; must be preserved
  54         pushq %rsi #; must be preserved
  55
  56         #; to keep algorithms universal - move input params into Linux specific registers
  57         movq %rcx, %rdi
  58         movq %rdx, %rsi
  59         movq %r8, %rdx
  60         movss %xmm3, %xmm0
  61
  62         #; if nframes == 0, go to end
  63         cmp     $0, %rdx
  64         je      .MBWG_END
  65
  66         #; Check for alignment
  67
  68         movq %rdi, %rax
  69         andq $12, %rax #; mask alignment offset
  70
  71         movq %rsi, %rbx
  72         andq $12, %rbx #; mask alignment offset
  73
  74         cmp %rax, %rbx
  75         jne .MBWG_NONALIGN #; if not aligned, calculate manually
  76
  77         #; if we are aligned
  78         cmp $0, %rbx
  79         jz .MBWG_SSE
  80
  81         #; Pre-loop, we need to run 1-3 frames "manually" without
  82         #; SSE instructions
  83
  84 .MBWG_PRELOOP:
  85
  86         #; gain is already in %xmm0
  87         movss (%rsi), %xmm1
  88         mulss %xmm0, %xmm1
  89         addss (%rdi), %xmm1
  90         movss %xmm1, (%rdi)
  91
  92         addq $4, %rdi #; dst++
  93         addq $4, %rsi #; src++
  94         decq %rdx         #; nframes--
  95         jz .MBWG_END
  96
  97         addq $4, %rbx
  98
  99         cmp $16, %rbx #; test if we've reached 16 byte alignment
 100         jne .MBWG_PRELOOP
 101
 102
 103 .MBWG_SSE:
 104
 105         cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
 106         jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
 107
 108         #; gain is already in %xmm0
 109         shufps  $0x00, %xmm0, %xmm0
 110
 111
 112 .MBWG_SSELOOP:
 113
 114         movaps  (%rsi), %xmm1 #; source => xmm0
 115         mulps   %xmm0,  %xmm1 #; apply gain to source
 116         addps   (%rdi), %xmm1 #; mix with destination
 117         movaps  %xmm1, (%rdi) #; copy result to destination
 118
 119         addq $16, %rdi #; dst+=4
 120         addq $16, %rsi #; src+=4
 121
 122         subq $4, %rdx #; nframes-=4
 123         cmp $4, %rdx
 124         jge .MBWG_SSELOOP
 125
 126         cmp $0, %rdx
 127         je .MBWG_END
 128
 129         #; if there are remaining frames, the nonalign code will do nicely
 130         #; for the rest 1-3 frames.
 131
 132 .MBWG_NONALIGN:
 133         #; not aligned!
 134
 135         #; gain is already in %xmm0
 136
 137 .MBWG_NONALIGNLOOP:
 138
 139         movss (%rsi), %xmm1
 140         mulss %xmm0, %xmm1
 141         addss (%rdi), %xmm1
 142         movss %xmm1, (%rdi)
 143
 144         addq $4, %rdi
 145         addq $4, %rsi
 146
 147         decq %rdx
 148         jnz .MBWG_NONALIGNLOOP
 149
 150 .MBWG_END:
 151
 152         popq %rsi
 153         popq %rdi
 154         popq %rdx
 155         popq %rcx
 156         popq %rbx
 157
 158         #; return
 159         leave
 160         ret
 161
 162
 163 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
 164
 165 .globl x86_sse_mix_buffers_no_gain
 166         .def    x86_sse_mix_buffers_no_gain; .scl    2;   .type   32;
 167 .endef
 168
 169 x86_sse_mix_buffers_no_gain:
 170
 171 #; due to Microsoft calling convention
 172 #; %rcx float *dst
 173 #; %rdx float *src
 174 #; %r8 unsigned int nframes
 175
 176 #; due to System V AMD64 (Linux) calling convention
 177 #; %rdi float *dst
 178 #; %rsi float *src
 179 #; %rdx unsigned int nframes
 180
 181         pushq %rbp
 182         movq %rsp, %rbp
 183
 184         #; save the registers
 185         pushq %rbx #; must be preserved
 186         pushq %rcx
 187         pushq %rdx
 188         pushq %rdi #; must be preserved
 189         pushq %rsi #; must be preserved
 190
 191         #; to keep algorithms universal - move input params into Linux specific registers
 192         movq %rcx, %rdi
 193         movq %rdx, %rsi
 194         movq %r8, %rdx
 195
 196         #; the real function
 197
 198         #; if nframes == 0, go to end
 199         cmp     $0, %r8
 200         je      .MBNG_END
 201
 202         #; Check for alignment
 203
 204         movq %rdi, %rax
 205         andq $12, %rax #; mask alignment offset
 206
 207         movq %rsi, %rbx
 208         andq $12, %rbx #; mask alignment offset
 209
 210         cmp %rax, %rbx
 211         jne .MBNG_NONALIGN #; if not aligned, calculate manually
 212
 213         cmp $0, %rbx
 214         je .MBNG_SSE
 215
 216         #; Pre-loop, we need to run 1-3 frames "manually" without
 217         #; SSE instructions
 218
 219 .MBNG_PRELOOP:
 220
 221         movss (%rsi), %xmm0
 222         addss (%rdi), %xmm0
 223         movss %xmm0, (%rdi)
 224
 225         addq $4, %rdi #; dst++
 226         addq $4, %rsi #; src++
 227         decq %rdx         #; nframes--
 228         jz      .MBNG_END
 229         addq $4, %rbx
 230
 231         cmp $16, %rbx #; test if we've reached 16 byte alignment
 232         jne .MBNG_PRELOOP
 233
 234 .MBNG_SSE:
 235
 236         cmp $4, %rdx #; if there are frames left, but less than 4
 237         jnge .MBNG_NONALIGN #; we can't run SSE
 238
 239 .MBNG_SSELOOP:
 240
 241         movaps  (%rsi), %xmm0 #; source => xmm0
 242         addps   (%rdi), %xmm0 #; mix with destination
 243         movaps  %xmm0, (%rdi) #; copy result to destination
 244
 245         addq $16, %rdi #; dst+=4
 246         addq $16, %rsi #; src+=4
 247
 248         subq $4, %rdx #; nframes-=4
 249         cmp $4, %rdx
 250         jge .MBNG_SSELOOP
 251
 252         cmp $0, %rdx
 253         je .MBNG_END
 254
 255         #; if there are remaining frames, the nonalign code will do nicely
 256         #; for the rest 1-3 frames.
 257
 258 .MBNG_NONALIGN:
 259         #; not aligned!
 260
 261         movss (%rsi), %xmm0 #; src => xmm0
 262         addss (%rdi), %xmm0 #; xmm0 += dst
 263         movss %xmm0, (%rdi) #; xmm0 => dst
 264
 265         addq $4, %rdi
 266         addq $4, %rsi
 267
 268         decq %rdx
 269         jnz .MBNG_NONALIGN
 270
 271 .MBNG_END:
 272
 273         popq %rsi
 274         popq %rdi
 275         popq %rdx
 276         popq %rcx
 277         popq %rbx
 278
 279         #; return
 280         leave
 281         ret
 282
 283
 284 #; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
 285
 286 .globl x86_sse_apply_gain_to_buffer
 287         .def    x86_sse_apply_gain_to_buffer; .scl    2;   .type   32;
 288 .endef
 289
 290 x86_sse_apply_gain_to_buffer:
 291
 292 #; due to Microsoft calling convention
 293 #; %rcx float                   *buf    32(%rbp)
 294 #; %rdx unsigned int    nframes
 295 #; %xmm2 float                  gain
 296 #; %xmm1 float                  buf[0]
 297
 298 #; due to System V AMD64 (Linux) calling convention
 299 #; %rdi  float                  *buf    32(%rbp)
 300 #; %rsi  unsigned int   nframes
 301 #; %xmm0 float                  gain
 302 #; %xmm1 float                  buf[0]
 303
 304         pushq %rbp
 305         movq %rsp, %rbp
 306
 307         #; save the registers
 308         pushq %rcx
 309         pushq %rdi #; must be preserved
 310         pushq %rsi #; must be preserved
 311
 312         #; to keep algorithms universal - move input params into Linux specific registers
 313         movq %rcx, %rdi
 314         movq %rdx, %rsi
 315         movss %xmm2, %xmm0
 316
 317         #; the real function
 318
 319         #; if nframes == 0, go to end
 320         movq %rsi, %rcx #; nframes
 321         cmp     $0, %rcx
 322         je      .AG_END
 323
 324         #; set up the gain buffer (gain is already in %xmm0)
 325         shufps  $0x00, %xmm0, %xmm0
 326
 327         #; Check for alignment
 328
 329         movq %rdi, %rdx #; buf => %rdx
 330         andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 331         jz      .AG_SSE #; if buffer IS aligned
 332
 333         #; PRE-LOOP
 334         #; we iterate 1-3 times, doing normal x87 float comparison
 335         #; so we reach a 16 byte aligned "buf" (=%rdi) value
 336
 337 .AGLP_START:
 338
 339         #; Load next value from the buffer into %xmm1
 340         movss (%rdi), %xmm1
 341         mulss %xmm0, %xmm1
 342         movss %xmm1, (%rdi)
 343
 344         #; increment buffer, decrement counter
 345         addq $4, %rdi #; buf++;
 346
 347         decq %rcx   #; nframes--
 348         jz      .AG_END #; if we run out of frames, we go to the end
 349
 350         addq $4, %rdx #; one non-aligned byte less
 351         cmp $16, %rdx
 352         jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
 353
 354 .AG_SSE:
 355
 356         #; We have reached the 16 byte aligned "buf" ("rdi") value
 357
 358         #; Figure out how many loops we should do
 359         movq %rcx, %rax #; copy remaining nframes to %rax for division
 360
 361         shr $2,%rax #; unsigned divide by 4
 362
 363         #; %rax = SSE iterations
 364         cmp $0, %rax
 365         je .AGPOST_START
 366
 367 .AGLP_SSE:
 368
 369         movaps (%rdi), %xmm1
 370         mulps %xmm0, %xmm1
 371         movaps %xmm1, (%rdi)
 372
 373         addq $16, %rdi  #; buf + 4
 374         subq $4, %rcx   #; nframes-=4
 375
 376         decq %rax
 377         jnz .AGLP_SSE
 378
 379         #; Next we need to post-process all remaining frames
 380         #; the remaining frame count is in %rcx
 381
 382         andq $3, %rcx #; nframes % 4
 383         jz .AG_END
 384
 385 .AGPOST_START:
 386
 387         movss (%rdi), %xmm1
 388         mulss %xmm0, %xmm1
 389         movss %xmm1, (%rdi)
 390
 391         #; increment buffer, decrement counter
 392         addq $4, %rdi #; buf++;
 393
 394         decq %rcx   #; nframes--
 395         jnz     .AGPOST_START #; if we run out of frames, we go to the end
 396
 397 .AG_END:
 398
 399         popq %rsi
 400         popq %rdi
 401         popq %rcx
 402
 403         #; return
 404         leave
 405         ret
 406
 407 #; end proc
 408
 409
 410 #; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
 411
 412 .globl x86_sse_apply_gain_vector
 413         .def    x86_sse_apply_gain_vector; .scl    2;   .type   32;
 414 .endef
 415
 416
 417 x86_sse_apply_gain_vector:
 418
 419 #; due to Microsoft calling convention
 420 #; %rcx float *buf
 421 #; %rdx float *gain_vector
 422 #; %r8  unsigned int nframes
 423
 424 #; due to System V AMD64 (Linux) calling convention
 425 #; %rdi float *buf
 426 #; %rsi float *gain_vector
 427 #; %rdx unsigned int nframes
 428
 429         pushq %rbp
 430         movq %rsp, %rbp
 431
 432         #; save the registers
 433         pushq %rbx #; must be preserved
 434         pushq %rcx
 435         pushq %rdx
 436         pushq %rdi #; must be preserved
 437         pushq %rsi #; must be preserved
 438
 439         #; to keep algorithms universal - move input params into Linux specific registers
 440         movq %rcx, %rdi
 441         movq %rdx, %rsi
 442         movq %r8, %rdx
 443
 444         #; if nframes == 0 go to end
 445         cmp $0, %rdx
 446         je .AGA_END
 447
 448         #; Check alignment
 449         movq %rdi, %rax
 450         andq $12, %rax
 451
 452         movq %rsi, %rbx
 453         andq $12, %rbx
 454
 455         cmp %rax,%rbx
 456         jne .AGA_ENDLOOP
 457
 458         cmp $0, %rax
 459         jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
 460
 461 #; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
 462 .AGA_ALIGNLOOP:
 463
 464         movss (%rdi), %xmm0 #; buf => xmm0
 465         movss (%rsi), %xmm1 #; gain value => xmm1
 466         mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
 467         movss %xmm0, (%rdi) #; signal with gain => buf
 468
 469         decq %rdx
 470         jz .AGA_END
 471
 472         addq $4, %rdi #; buf++
 473         addq $4, %rsi #; gab++
 474
 475         addq $4, %rax
 476         cmp $16, %rax
 477         jne .AGA_ALIGNLOOP
 478
 479 #; There are frames left for sure, as that is checked in the beginning
 480 #; and within the previous loop. BUT, there might be less than 4 frames
 481 #; to process
 482
 483 .AGA_SSE:
 484         movq %rdx, %rax #; nframes => %rax
 485         shr $2, %rax #; unsigned divide by 4
 486
 487         cmp $0, %rax
 488         je .AGA_ENDLOOP
 489
 490 .AGA_SSELOOP:
 491         movaps (%rdi), %xmm0
 492         movaps (%rsi), %xmm1
 493         mulps %xmm1, %xmm0
 494         movaps %xmm0, (%rdi)
 495
 496         addq $16, %rdi
 497         addq $16, %rsi
 498
 499         decq %rax
 500         jnz .AGA_SSELOOP
 501
 502         andq $3, %rdx #; Remaining frames are nframes & 3
 503         jz .AGA_END
 504
 505
 506 #; Inside this loop, we know there are frames left to process
 507 #; but because either there are < 4 frames left, or the buffers
 508 #; are not aligned, we can't use the parallel SSE ops
 509 .AGA_ENDLOOP:
 510         movss (%rdi), %xmm0 #; buf => xmm0
 511         movss (%rsi), %xmm1 #; gain value => xmm1
 512         mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
 513         movss %xmm0, (%rdi) #; signal with gain => buf
 514
 515         addq $4,%rdi
 516         addq $4,%rsi
 517         decq %rdx #; nframes--
 518         jnz .AGA_ENDLOOP
 519
 520 .AGA_END:
 521
 522         popq %rsi
 523         popq %rdi
 524         popq %rdx
 525         popq %rcx
 526         popq %rbx
 527
 528         leave
 529         ret
 530
 531 #; end proc
 532
 533
 534 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
 535
 536 .globl x86_sse_compute_peak
 537         .def    x86_sse_compute_peak; .scl    2;   .type   32;
 538 .endef
 539
 540
 541 x86_sse_compute_peak:
 542
 543 #; due to Microsoft calling convention
 544 #; %rcx float*          buf     32(%rbp)
 545 #; %rdx unsigned int    nframes
 546 #; %xmm2 float                  current
 547 #; %xmm1 float                  buf[0]
 548
 549 #; due to System V AMD64 (Linux) calling convention
 550 #; %rdi  float*         buf     32(%rbp)
 551 #; %rsi  unsigned int   nframes
 552 #; %xmm0 float                  current
 553 #; %xmm1 float                  buf[0]
 554
 555         pushq %rbp
 556         movq %rsp, %rbp
 557
 558         #; save registers
 559         pushq %rcx
 560         pushq %rdi #; must be preserved
 561         pushq %rsi #; must be preserved
 562
 563         #; to keep algorithms universal - move input params into Linux specific registers
 564         movq %rcx, %rdi
 565         movq %rdx, %rsi
 566         movss %xmm2, %xmm0
 567
 568         #; if nframes == 0, go to end
 569         movq %rsi, %rcx #; nframes
 570         cmp     $0, %rcx
 571         je      .CP_END
 572
 573         #; create the "abs" mask in %xmm2
 574         pushq   $2147483647
 575         movss   (%rsp), %xmm2
 576         addq    $8, %rsp
 577         shufps  $0x00, %xmm2, %xmm2
 578
 579         #; Check for alignment
 580
 581         #;movq 8(%rbp), %rdi #; buf
 582         movq %rdi, %rdx #; buf => %rdx
 583         andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 584         jz      .CP_SSE #; if buffer IS aligned
 585
 586         #; PRE-LOOP
 587         #; we iterate 1-3 times, doing normal x87 float comparison
 588         #; so we reach a 16 byte aligned "buf" (=%rdi) value
 589
 590 .LP_START:
 591
 592         #; Load next value from the buffer
 593         movss (%rdi), %xmm1
 594         andps %xmm2, %xmm1
 595         maxss %xmm1, %xmm0
 596
 597         #; increment buffer, decrement counter
 598         addq $4, %rdi #; buf++;
 599
 600         decq %rcx   #; nframes--
 601         jz      .CP_END #; if we run out of frames, we go to the end
 602
 603         addq $4, %rdx #; one non-aligned byte less
 604         cmp $16, %rdx
 605         jne .LP_START #; if more non-aligned frames exist, we do a do-over
 606
 607 .CP_SSE:
 608
 609         #; We have reached the 16 byte aligned "buf" ("rdi") value
 610
 611         #; Figure out how many loops we should do
 612         movq %rcx, %rax #; copy remaining nframes to %rax for division
 613
 614         shr $2,%rax #; unsigned divide by 4
 615         jz .POST_START
 616
 617         #; %rax = SSE iterations
 618
 619         #; current maximum is at %xmm0, but we need to ..
 620         shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
 621
 622         #;prefetcht0 16(%rdi)
 623
 624 .LP_SSE:
 625
 626         movaps (%rdi), %xmm1
 627         andps %xmm2, %xmm1
 628         maxps %xmm1, %xmm0
 629
 630         addq $16, %rdi
 631
 632         subq $4, %rcx #; nframes-=4
 633
 634         decq %rax
 635         jnz .LP_SSE
 636
 637         #; Calculate the maximum value contained in the 4 FP's in %xmm0
 638         movaps %xmm0, %xmm1
 639         shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
 640         maxps  %xmm1, %xmm0 #; maximums of the two pairs
 641         movaps %xmm0, %xmm1
 642         shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
 643         maxps  %xmm1, %xmm0
 644
 645         #; now every float in %xmm0 is the same value, current maximum value
 646
 647         #; Next we need to post-process all remaining frames
 648         #; the remaining frame count is in %rcx
 649
 650         #; if no remaining frames, jump to the end
 651
 652         andq $3, %rcx #; nframes % 4
 653         jz .CP_END
 654
 655 .POST_START:
 656
 657         movss (%rdi), %xmm1
 658         andps %xmm2, %xmm1
 659         maxss %xmm1, %xmm0
 660
 661         addq $4, %rdi   #; buf++;
 662
 663         decq %rcx               #; nframes--;
 664         jnz .POST_START
 665
 666 .CP_END:
 667
 668         #; restore registers
 669         popq %rsi
 670         popq %rdi
 671         popq %rcx
 672
 673         #; return value is in xmm0
 674
 675         #; return
 676         leave
 677         ret
 678
 679 #; end proc