libs/ardour/sse_avx_functions_64bit_win.s

   1 /*
   2  * Copyright (C) 2015 Paul Davis <paul@linuxaudiosystems.com>
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License along
  15  * with this program; if not, write to the Free Software Foundation, Inc.,
  16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  17  */
  18
  19 #; Microsoft version of AVX sample processing functions
  20
  21 #; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
  22
  23 .globl x86_sse_avx_mix_buffers_with_gain
  24         .def    x86_sse_avx_mix_buffers_with_gain; .scl    2;      .type   32;
  25 .endef
  26
  27 x86_sse_avx_mix_buffers_with_gain:
  28
  29 #; due to Microsoft calling convention
  30 #; %rcx float *dst
  31 #; %rdx float *src
  32 #; %r8 unsigned int nframes
  33 #; %xmm3 float  gain
  34
  35         pushq %rbp
  36         movq %rsp, %rbp
  37
  38         #; save the registers
  39         pushq %rbx #; must be preserved
  40
  41         #; move current max to %xmm0 for convenience
  42         movss %xmm3, %xmm0
  43
  44         #; if nframes == 0, go to end
  45         cmp     $0, %r8
  46         je      .MBWG_END
  47
  48         #; Check for alignment
  49
  50         movq %rcx, %rax
  51         andq $28, %rax #; mask alignment offset
  52
  53         movq %rdx, %rbx
  54         andq $28, %rbx #; mask alignment offset
  55
  56         cmp %rax, %rbx
  57         jne .MBWG_NONALIGN #; if buffer are not aligned between each other, calculate manually
  58
  59         #; if we are aligned
  60         cmp $0, %rbx
  61         jz .MBWG_AVX
  62
  63         #; Pre-loop, we need to run 1-7 frames "manually" without
  64         #; SSE instructions
  65
  66 .MBWG_PRELOOP:
  67
  68         #; gain is already in %xmm0
  69         movss (%rdx), %xmm1
  70         mulss %xmm0, %xmm1
  71         addss (%rcx), %xmm1
  72         movss %xmm1, (%rcx)
  73
  74         addq $4, %rcx #; dst++
  75         addq $4, %rdx #; src++
  76         decq %r8          #; nframes--
  77         jz .MBWG_END
  78
  79         addq $4, %rbx
  80
  81         cmp $32, %rbx #; test if we've reached 32 byte alignment
  82         jne .MBWG_PRELOOP
  83
  84 .MBWG_AVX:
  85
  86         cmp $8, %r8 #; we know it's not zero, but if it's not >=4, then
  87         jl .MBWG_NONALIGN #; we jump straight to the "normal" code
  88
  89         #; set up the gain buffer (gain is already in %xmm0)
  90         vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread single float value to the first 128 bits of ymm0 register
  91         vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits
  92
  93 .MBWG_AVXLOOP:
  94
  95         vmovaps (%rdx), %ymm1        #; source => xmm0
  96         vmulps  %ymm0,  %ymm1, %ymm2 #; apply gain to source
  97         vaddps  (%rcx), %ymm2, %ymm1 #; mix with destination
  98         vmovaps  %ymm1, (%rcx)        #; copy result to destination
  99
 100         addq $32, %rcx #; dst+=8
 101         addq $32, %rdx #; src+=8
 102
 103         subq $8, %r8 #; nframes-=8
 104         cmp $8, %r8
 105         jge .MBWG_AVXLOOP
 106
 107         #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
 108         vzeroupper
 109
 110         cmp $0, %r8
 111         je .MBWG_END
 112
 113         #; if there are remaining frames, the nonalign code will do nicely
 114         #; for the rest 1-7 frames.
 115
 116 .MBWG_NONALIGN:
 117         #; not aligned!
 118
 119         #; gain is already in %xmm0
 120
 121 .MBWG_NONALIGNLOOP:
 122
 123         movss (%rdx), %xmm1
 124         mulss %xmm0, %xmm1
 125         addss (%rcx), %xmm1
 126         movss %xmm1, (%rcx)
 127
 128         addq $4, %rcx
 129         addq $4, %rdx
 130
 131         decq %r8
 132         jnz .MBWG_NONALIGNLOOP
 133
 134 .MBWG_END:
 135
 136         popq %rbx
 137
 138         #; return
 139         leave
 140         ret
 141
 142
 143 #; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
 144
 145 .globl x86_sse_avx_mix_buffers_no_gain
 146         .def    x86_sse_avx_mix_buffers_no_gain; .scl    2;   .type   32;
 147 .endef
 148
 149 x86_sse_avx_mix_buffers_no_gain:
 150
 151 #; due to Microsoft calling convention
 152 #; %rcx float *dst
 153 #; %rdx float *src
 154 #; %r8 unsigned int nframes
 155
 156         pushq %rbp
 157         movq %rsp, %rbp
 158
 159         #; save the registers
 160         pushq %rbx #; must be preserved
 161
 162         #; the real function
 163
 164         #; if nframes == 0, go to end
 165         cmp     $0, %r8
 166         je      .MBNG_END
 167
 168         #; Check for alignment
 169
 170         movq %rcx, %rax
 171         andq $28, %rax #; mask alignment offset
 172
 173         movq %rdx, %rbx
 174         andq $28, %rbx #; mask alignment offset
 175
 176         cmp %rax, %rbx
 177         jne .MBNG_NONALIGN #; if not buffers are not aligned btween each other, calculate manually
 178
 179         cmp $0, %rbx
 180         je .MBNG_AVX #; aligned at 32, rpoceed to AVX
 181
 182         #; Pre-loop, we need to run 1-7 frames "manually" without
 183         #; AVX instructions
 184
 185 .MBNG_PRELOOP:
 186
 187         movss (%rdx), %xmm0
 188         addss (%rcx), %xmm0
 189         movss %xmm0, (%rcx)
 190
 191         addq $4, %rcx #; dst++
 192         addq $4, %rdx #; src++
 193
 194         decq %r8          #; nframes--
 195         jz      .MBNG_END
 196
 197         addq $4, %rbx #; one non-aligned byte less
 198
 199         cmp $32, %rbx #; test if we've reached 32 byte alignment
 200         jne .MBNG_PRELOOP
 201
 202 .MBNG_AVX:
 203
 204         cmp $8, %r8 #; if there are frames left, but less than 8
 205         jl .MBNG_NONALIGN #; we can't run AVX
 206
 207 .MBNG_AVXLOOP:
 208
 209         vmovaps (%rdx), %ymm0        #; source => xmm0
 210         vaddps  (%rcx), %ymm0, %ymm1 #; mix with destination
 211         vmovaps  %ymm1, (%rcx)       #; copy result to destination
 212
 213         addq $32, %rcx #; dst+=8
 214         addq $32, %rdx #; src+=8
 215
 216         subq $8, %r8 #; nframes-=8
 217         cmp $8, %r8
 218         jge .MBNG_AVXLOOP
 219
 220         #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
 221         vzeroupper
 222
 223         cmp $0, %r8
 224         je .MBNG_END
 225
 226         #; if there are remaining frames, the nonalign code will do nicely
 227         #; for the rest 1-7 frames.
 228
 229 .MBNG_NONALIGN:
 230         #; not aligned!
 231         #;
 232
 233         movss (%rdx), %xmm0 #; src => xmm0
 234         addss (%rcx), %xmm0 #; xmm0 += dst
 235         movss %xmm0, (%rcx) #; xmm0 => dst
 236
 237         addq $4, %rcx
 238         addq $4, %rdx
 239
 240         decq %r8
 241         jnz .MBNG_NONALIGN
 242
 243 .MBNG_END:
 244
 245         popq %rbx
 246
 247         #; return
 248         leave
 249         ret
 250
 251
 252 #; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes);
 253
 254 .globl x86_sse_avx_copy_vector
 255         .def    x86_sse_avx_copy_vector; .scl    2;   .type   32;
 256 .endef
 257
 258 x86_sse_avx_copy_vector:
 259
 260 #; due to Microsoft calling convention
 261 #; %rcx float *dst
 262 #; %rdx float *src
 263 #; %r8 unsigned int nframes
 264
 265         pushq %rbp
 266         movq %rsp, %rbp
 267
 268         #; save the registers
 269         pushq %rbx #; must be preserved
 270
 271         #; the real function
 272
 273         #; if nframes == 0, go to end
 274         cmp     $0, %r8
 275         je      .CB_END
 276
 277         #; Check for alignment
 278
 279         movq %rcx, %rax
 280         andq $28, %rax #; mask alignment offset
 281
 282         movq %rdx, %rbx
 283         andq $28, %rbx #; mask alignment offset
 284
 285         cmp %rax, %rbx
 286         jne .CB_NONALIGN #; if not buffers are not aligned btween each other, calculate manually
 287
 288         cmp $0, %rbx
 289         je .CB_AVX #; aligned at 32, rpoceed to AVX
 290
 291         #; Pre-loop, we need to run 1-7 frames "manually" without
 292         #; AVX instructions
 293
 294 .CB_PRELOOP:
 295
 296         movss (%rdx), %xmm0
 297         movss %xmm0, (%rcx)
 298
 299         addq $4, %rcx #; dst++
 300         addq $4, %rdx #; src++
 301
 302         decq %r8          #; nframes--
 303         jz      .CB_END
 304
 305         addq $4, %rbx #; one non-aligned byte less
 306
 307         cmp $32, %rbx #; test if we've reached 32 byte alignment
 308         jne .CB_PRELOOP
 309
 310 .CB_AVX:
 311
 312         cmp $8, %r8 #; if there are frames left, but less than 8
 313         jl .CB_NONALIGN #; we can't run AVX
 314
 315 .CB_AVXLOOP:
 316
 317         vmovaps (%rdx), %ymm0        #; source => xmm0
 318         vmovaps  %ymm0, (%rcx)       #; copy result to destination
 319
 320         addq $32, %rcx #; dst+=8
 321         addq $32, %rdx #; src+=8
 322
 323         subq $8, %r8 #; nframes-=8
 324         cmp $8, %r8
 325         jge .CB_AVXLOOP
 326
 327         #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
 328         vzeroupper
 329
 330         cmp $0, %r8
 331         je .CB_END
 332
 333         #; if there are remaining frames, the nonalign code will do nicely
 334         #; for the rest 1-7 frames.
 335
 336 .CB_NONALIGN:
 337         #; not aligned!
 338         #;
 339
 340         movss (%rdx), %xmm0 #; src => xmm0
 341         movss %xmm0, (%rcx) #; xmm0 => dst
 342
 343         addq $4, %rcx
 344         addq $4, %rdx
 345
 346         decq %r8
 347         jnz .CB_NONALIGN
 348
 349 .CB_END:
 350
 351         popq %rbx
 352
 353         #; return
 354         leave
 355         ret
 356
 357
 358 #; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
 359
 360 .globl x86_sse_avx_apply_gain_to_buffer
 361         .def    x86_sse_avx_apply_gain_to_buffer; .scl    2;   .type   32;
 362 .endef
 363
 364 x86_sse_avx_apply_gain_to_buffer:
 365
 366 #; due to Microsoft calling convention
 367 #; %rcx float                   *buf    32(%rbp)
 368 #; %rdx unsigned int    nframes
 369 #; %xmm2 float                  gain                    avx specific register
 370
 371         pushq %rbp
 372         movq %rsp, %rbp
 373
 374         #; move current max to %xmm0 for convenience
 375         movss %xmm2, %xmm0
 376
 377         #; the real function
 378
 379         #; if nframes == 0, go to end
 380         cmp     $0, %rdx
 381         je      .AG_END
 382
 383         #; Check for alignment
 384
 385         movq %rcx, %r8 #; buf => %rdx
 386         andq $28, %r8 #; check alignment with mask 11100
 387         jz      .AG_AVX #; if buffer IS aligned
 388
 389         #; PRE-LOOP
 390         #; we iterate 1-7 times, doing normal x87 float comparison
 391         #; so we reach a 32 byte aligned "buf" (=%rdi) value
 392
 393 .AGLP_START:
 394
 395         #; Load next value from the buffer into %xmm1
 396         movss (%rcx), %xmm1
 397         mulss %xmm0, %xmm1
 398         movss %xmm1, (%rcx)
 399
 400         #; increment buffer, decrement counter
 401         addq $4, %rcx #; buf++;
 402
 403         decq %rdx   #; nframes--
 404         jz      .AG_END #; if we run out of frames, we go to the end
 405
 406         addq $4, %r8 #; one non-aligned byte less
 407         cmp $16, %r8
 408         jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
 409
 410 .AG_AVX:
 411
 412         #; We have reached the 32 byte aligned "buf" ("rcx") value
 413         #; use AVX instructions
 414
 415         #; Figure out how many loops we should do
 416         movq %rdx, %rax #; copy remaining nframes to %rax for division
 417
 418         shr $3, %rax #; unsigned divide by 8
 419
 420         #; %rax = AVX iterations
 421         cmp $0, %rax
 422         je .AGPOST_START
 423
 424         #; set up the gain buffer (gain is already in %xmm0)
 425         vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread single float value to the first 128 bits of ymm0 register
 426         vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits
 427
 428 .AGLP_AVX:
 429
 430         vmovaps (%rcx), %ymm1
 431         vmulps %ymm0, %ymm1, %ymm2
 432         vmovaps %ymm2, (%rcx)
 433
 434         addq $32, %rcx  #; buf + 8
 435         subq $8, %rdx   #; nframes-=8
 436
 437         decq %rax
 438         jnz .AGLP_AVX
 439
 440         #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
 441         vzeroupper
 442
 443         #; Next we need to post-process all remaining frames
 444         #; the remaining frame count is in %rcx
 445         cmpq $0, %rdx #;
 446         jz .AG_END
 447
 448 .AGPOST_START:
 449
 450         movss (%rcx), %xmm1
 451         mulss %xmm0, %xmm1
 452         movss %xmm1, (%rcx)
 453
 454         #; increment buffer, decrement counter
 455         addq $4, %rcx #; buf++;
 456
 457         decq %rdx   #; nframes--
 458         jnz     .AGPOST_START #; if we run out of frames, we go to the end
 459
 460 .AG_END:
 461
 462         #; return
 463         leave
 464         ret
 465
 466 #; end proc
 467
 468
 469 #; float x86_sse_avx_compute_peak(float *buf, long nframes, float current);
 470
 471 .globl x86_sse_avx_compute_peak
 472         .def    x86_sse_avx_compute_peak; .scl    2;   .type   32;
 473 .endef
 474
 475 x86_sse_avx_compute_peak:
 476
 477 #; due to Microsoft calling convention
 478 #; %rcx float*          buf     32(%rbp)
 479 #; %rdx unsigned int    nframes
 480 #; %xmm2 float                  current
 481
 482         pushq %rbp
 483         movq %rsp, %rbp
 484
 485         #; move current max to %xmm0 for convenience
 486         movss %xmm2, %xmm0
 487
 488         #; if nframes == 0, go to end
 489         cmp     $0, %rdx
 490         je      .CP_END
 491
 492         #; create the "abs" mask in %xmm3
 493         #; if will be used to discard sign bit
 494         pushq   $2147483647
 495         movss   (%rsp), %xmm3
 496         addq    $8, %rsp
 497
 498         #; Check for alignment
 499         movq %rcx, %r8 #; buf => %rdx
 500         andq $28, %r8 #; mask bits 1 & 2
 501         jz      .CP_AVX #; if buffer IS aligned
 502
 503         #; PRE-LOOP
 504         #; we iterate 1-7 times, doing normal x87 float comparison
 505         #; so we reach a 32 byte aligned "buf" (=%rcx) value
 506
 507 .LP_START:
 508
 509         #; Load next value from the buffer
 510         movss (%rcx), %xmm1
 511         andps %xmm3, %xmm1      #; mask out sign bit
 512         maxss %xmm1, %xmm0
 513
 514         #; increment buffer, decrement counter
 515         addq $4, %rcx #; buf++;
 516
 517         decq %rdx   #; nframes--
 518         jz      .CP_END #; if we run out of frames, we go to the end
 519
 520         addq $4, %r8 #; one non-aligned byte less
 521         cmp $32, %r8
 522         jne .LP_START #; if more non-aligned frames exist, we do a do-over
 523
 524 .CP_AVX:
 525
 526         #; We have reached the 32 byte aligned "buf" ("rdi") value
 527
 528         #; Figure out how many loops we should do
 529         movq %rdx, %rax #; copy remaining nframes to %rax for division
 530
 531         shr $3, %rax #; unsigned divide by 8
 532         jz .POST_START
 533
 534         #; %rax = AVX iterations
 535
 536         #; current maximum is at %xmm0, but we need to broadcast it to the whole ymm0 register..
 537         vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread single float value to the all 128 bits of xmm0 register
 538         vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits
 539
 540         #; broadcast sign mask to the whole ymm3 register
 541         vshufps $0x00, %ymm3, %ymm3, %ymm3 #; spread single float value to the all 128 bits of xmm3 register
 542         vperm2f128 $0x00, %ymm3, %ymm3, %ymm3 #; extend the first 128 bits of ymm3 register to higher 128 bits
 543
 544 .LP_AVX:
 545
 546         vmovaps (%rcx), %ymm1
 547         vandps %ymm3, %ymm1, %ymm1      #; mask out sign bit
 548         vmaxps %ymm1, %ymm0, %ymm0
 549
 550         addq $32, %rcx #; buf+=8
 551         subq $8, %rdx #; nframes-=8
 552
 553         decq %rax
 554         jnz .LP_AVX
 555
 556         #; Calculate the maximum value contained in the 4 FP's in %ymm0
 557         vshufps $0x4e, %ymm0, %ymm0, %ymm1     #; shuffle left & right pairs (1234 => 3412) in each 128 bit half
 558         vmaxps  %ymm1, %ymm0, %ymm0            #; maximums of the four pairs, if each of 8 elements was unique, 4 unique elements left now
 559         vshufps $0xb1, %ymm0, %ymm0, %ymm1     #; shuffle the floats inside pairs (1234 => 2143) in each 128 bit half
 560         vmaxps  %ymm1, %ymm0, %ymm0                        #; maximums of the four pairs, we had up to 4 unique elements was unique, 2 unique elements left now
 561         vperm2f128 $0x01, %ymm0, %ymm0, %ymm1  #; swap 128 bit halfs
 562         vmaxps  %ymm1, %ymm0, %ymm0                        #; the result will be - all 8 elemens are maximums
 563
 564         #; now every float in %ymm0 is the same value, current maximum value
 565
 566         #; Next we need to post-process all remaining frames
 567         #; the remaining frame count is in %rcx
 568
 569         #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
 570         vzeroupper
 571
 572         #; if no remaining frames, jump to the end
 573         cmp $0, %rdx
 574         je .CP_END
 575
 576 .POST_START:
 577
 578         movss (%rcx), %xmm1
 579         andps %xmm3, %xmm1      #; mask out sign bit
 580         maxss %xmm1, %xmm0
 581
 582         addq $4, %rcx   #; buf++;
 583
 584         decq %rdx               #; nframes--;
 585         jnz .POST_START
 586
 587 .CP_END:
 588
 589         #; return value is in xmm0
 590
 591         #; return
 592         leave
 593         ret
 594
 595 #; end proc