libs/ardour/sse_functions_64bit.s

   1 /*
   2     Copyright (C) 2005-2006 Sampo Savolainen, John Rigg
   3
   4     This program is free software; you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation; either version 2 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program; if not, write to the Free Software
  16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17
  18     $Id$
  19 */
  20
  21
  22 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
  23
  24 .globl x86_sse_mix_buffers_with_gain
  25         .type   x86_sse_mix_buffers_with_gain,@function
  26
  27 x86_sse_mix_buffers_with_gain:
  28
  29 #; %rdi float   *dst
  30 #; %rsi float   *src
  31 #; %rdx unsigned int nframes
  32 #; %xmm0 float  gain
  33
  34         pushq %rbp
  35         movq %rsp, %rbp
  36
  37         #; save the registers
  38         pushq %rbx
  39         pushq %rdi
  40         pushq %rsi
  41
  42         #; if nframes == 0, go to end
  43         cmp     $0, %rdx
  44         je      .MBWG_END
  45
  46         #; Check for alignment
  47
  48         movq %rdi, %rax
  49         andq $12, %rax #; mask alignment offset
  50
  51         movq %rsi, %rbx
  52         andq $12, %rbx #; mask alignment offset
  53
  54         cmp %rax, %rbx
  55         jne .MBWG_NONALIGN #; if not aligned, calculate manually
  56
  57         #; if we are aligned
  58         cmp $0, %rbx
  59         jz .MBWG_SSE
  60
  61         #; Pre-loop, we need to run 1-3 frames "manually" without
  62         #; SSE instructions
  63
  64 .MBWG_PRELOOP:
  65
  66         #; gain is already in %xmm0
  67         movss (%rsi), %xmm1
  68         mulss %xmm0, %xmm1
  69         addss (%rdi), %xmm1
  70         movss %xmm1, (%rdi)
  71
  72         addq $4, %rdi #; dst++
  73         addq $4, %rsi #; src++
  74         decq %rdx         #; nframes--
  75         jz .MBWG_END
  76
  77         addq $4, %rbx
  78
  79         cmp $16, %rbx #; test if we've reached 16 byte alignment
  80         jne .MBWG_PRELOOP
  81
  82
  83 .MBWG_SSE:
  84
  85         cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
  86         jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
  87
  88         #; gain is already in %xmm0
  89         shufps  $0x00, %xmm0, %xmm0
  90
  91
  92 .MBWG_SSELOOP:
  93
  94         movaps  (%rsi), %xmm1 #; source => xmm0
  95         mulps   %xmm0,  %xmm1 #; apply gain to source
  96         addps   (%rdi), %xmm1 #; mix with destination
  97         movaps  %xmm1, (%rdi) #; copy result to destination
  98
  99         addq $16, %rdi #; dst+=4
 100         addq $16, %rsi #; src+=4
 101
 102         subq $4, %rdx #; nframes-=4
 103         cmp $4, %rdx
 104         jge .MBWG_SSELOOP
 105
 106         cmp $0, %rdx
 107         je .MBWG_END
 108
 109         #; if there are remaining frames, the nonalign code will do nicely
 110         #; for the rest 1-3 frames.
 111
 112 .MBWG_NONALIGN:
 113         #; not aligned!
 114
 115         #; gain is already in %xmm0
 116
 117 .MBWG_NONALIGNLOOP:
 118
 119         movss (%rsi), %xmm1
 120         mulss %xmm0, %xmm1
 121         addss (%rdi), %xmm1
 122         movss %xmm1, (%rdi)
 123
 124         addq $4, %rdi
 125         addq $4, %rsi
 126
 127         decq %rdx
 128         jnz .MBWG_NONALIGNLOOP
 129
 130 .MBWG_END:
 131
 132         popq %rsi
 133         popq %rdi
 134         popq %rbx
 135
 136         #; return
 137         leave
 138         ret
 139
 140 .size   x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
 141
 142
 143 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
 144
 145 .globl x86_sse_mix_buffers_no_gain
 146         .type   x86_sse_mix_buffers_no_gain,@function
 147
 148 x86_sse_mix_buffers_no_gain:
 149
 150 #; %rdi float *dst
 151 #; %rsi float *src
 152 #; %rdx unsigned int nframes
 153
 154         pushq %rbp
 155         movq %rsp, %rbp
 156
 157         #; save the registers
 158         pushq %rbx
 159         pushq %rdi
 160         pushq %rsi
 161
 162         #; the real function
 163
 164         #; if nframes == 0, go to end
 165         cmp     $0, %rdx
 166         je      .MBNG_END
 167
 168         #; Check for alignment
 169
 170         movq %rdi, %rax
 171         andq $12, %rax #; mask alignment offset
 172
 173         movq %rsi, %rbx
 174         andq $12, %rbx #; mask alignment offset
 175
 176         cmp %rax, %rbx
 177         jne .MBNG_NONALIGN #; if not aligned, calculate manually
 178
 179         cmp $0, %rbx
 180         je .MBNG_SSE
 181
 182         #; Pre-loop, we need to run 1-3 frames "manually" without
 183         #; SSE instructions
 184
 185 .MBNG_PRELOOP:
 186
 187         movss (%rsi), %xmm0
 188         addss (%rdi), %xmm0
 189         movss %xmm0, (%rdi)
 190
 191         addq $4, %rdi #; dst++
 192         addq $4, %rsi #; src++
 193         decq %rdx         #; nframes--
 194         jz      .MBNG_END
 195         addq $4, %rbx
 196
 197         cmp $16, %rbx #; test if we've reached 16 byte alignment
 198         jne .MBNG_PRELOOP
 199
 200 .MBNG_SSE:
 201
 202         cmp $4, %rdx #; if there are frames left, but less than 4
 203         jnge .MBNG_NONALIGN #; we can't run SSE
 204
 205 .MBNG_SSELOOP:
 206
 207         movaps  (%rsi), %xmm0 #; source => xmm0
 208         addps   (%rdi), %xmm0 #; mix with destination
 209         movaps  %xmm0, (%rdi) #; copy result to destination
 210
 211         addq $16, %rdi #; dst+=4
 212         addq $16, %rsi #; src+=4
 213
 214         subq $4, %rdx #; nframes-=4
 215         cmp $4, %rdx
 216         jge .MBNG_SSELOOP
 217
 218         cmp $0, %rdx
 219         je .MBNG_END
 220
 221         #; if there are remaining frames, the nonalign code will do nicely
 222         #; for the rest 1-3 frames.
 223
 224 .MBNG_NONALIGN:
 225         #; not aligned!
 226
 227         movss (%rsi), %xmm0 #; src => xmm0
 228         addss (%rdi), %xmm0 #; xmm0 += dst
 229         movss %xmm0, (%rdi) #; xmm0 => dst
 230
 231         addq $4, %rdi
 232         addq $4, %rsi
 233
 234         decq %rdx
 235         jnz .MBNG_NONALIGN
 236
 237 .MBNG_END:
 238
 239         popq %rsi
 240         popq %rdi
 241         popq %rbx
 242
 243         #; return
 244         leave
 245         ret
 246
 247 .size   x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
 248
 249
 250 #; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
 251
 252 .globl x86_sse_apply_gain_to_buffer
 253         .type   x86_sse_apply_gain_to_buffer,@function
 254
 255 x86_sse_apply_gain_to_buffer:
 256
 257 #; %rdi  float          *buf    32(%rbp)
 258 #; %rsi  unsigned int   nframes
 259 #; %xmm0 float          gain
 260 #; %xmm1 float          buf[0]
 261
 262         pushq %rbp
 263         movq %rsp, %rbp
 264
 265         #; save %rdi
 266         pushq %rdi
 267
 268         #; the real function
 269
 270         #; if nframes == 0, go to end
 271         movq %rsi, %rcx #; nframes
 272         cmp     $0, %rcx
 273         je      .AG_END
 274
 275         #; set up the gain buffer (gain is already in %xmm0)
 276         shufps  $0x00, %xmm0, %xmm0
 277
 278         #; Check for alignment
 279
 280         movq %rdi, %rdx #; buf => %rdx
 281         andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 282         jz      .AG_SSE #; if buffer IS aligned
 283
 284         #; PRE-LOOP
 285         #; we iterate 1-3 times, doing normal x87 float comparison
 286         #; so we reach a 16 byte aligned "buf" (=%rdi) value
 287
 288 .AGLP_START:
 289
 290         #; Load next value from the buffer into %xmm1
 291         movss (%rdi), %xmm1
 292         mulss %xmm0, %xmm1
 293         movss %xmm1, (%rdi)
 294
 295         #; increment buffer, decrement counter
 296         addq $4, %rdi #; buf++;
 297
 298         decq %rcx   #; nframes--
 299         jz      .AG_END #; if we run out of frames, we go to the end
 300
 301         addq $4, %rdx #; one non-aligned byte less
 302         cmp $16, %rdx
 303         jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
 304
 305 .AG_SSE:
 306
 307         #; We have reached the 16 byte aligned "buf" ("rdi") value
 308
 309         #; Figure out how many loops we should do
 310         movq %rcx, %rax #; copy remaining nframes to %rax for division
 311         movq $0, %rdx   #; 0 the edx register
 312
 313
 314         pushq %rdi
 315         movq $4, %rdi
 316         divq %rdi #; %rdx = remainder == 0
 317         popq %rdi
 318
 319         #; %rax = SSE iterations
 320         cmp $0, %rax
 321         je .AGPOST_START
 322
 323
 324 .AGLP_SSE:
 325
 326         movaps (%rdi), %xmm1
 327         mulps %xmm0, %xmm1
 328         movaps %xmm1, (%rdi)
 329
 330         addq $16, %rdi
 331         subq $4, %rcx   #; nframes-=4
 332
 333         decq %rax
 334         jnz .AGLP_SSE
 335
 336         #; Next we need to post-process all remaining frames
 337         #; the remaining frame count is in %rcx
 338
 339         #; if no remaining frames, jump to the end
 340         cmp $0, %rcx
 341         andq $3, %rcx #; nframes % 4
 342         je .AG_END
 343
 344 .AGPOST_START:
 345
 346         movss (%rdi), %xmm1
 347         mulss %xmm0, %xmm1
 348         movss %xmm1, (%rdi)
 349
 350         #; increment buffer, decrement counter
 351         addq $4, %rdi #; buf++;
 352
 353         decq %rcx   #; nframes--
 354         jnz     .AGPOST_START #; if we run out of frames, we go to the end
 355
 356 .AG_END:
 357
 358
 359         popq %rdi
 360
 361         #; return
 362         leave
 363         ret
 364
 365 .size   x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
 366 #; end proc
 367
 368
 369 #; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
 370
 371 .globl x86_sse_apply_gain_vector
 372         .type   x86_sse_apply_gain_vector,@function
 373
 374 x86_sse_apply_gain_vector:
 375
 376 #; %rdi float *buf
 377 #; %rsi float *gain_vector
 378 #; %rdx unsigned int nframes
 379
 380         pushq %rbp
 381         movq %rsp, %rbp
 382
 383         #; Save registers
 384         pushq %rdi
 385         pushq %rsi
 386         pushq %rbx
 387
 388         #; if nframes == 0 go to end
 389         cmp $0, %rdx
 390         je .AGA_END
 391
 392         #; Check alignment
 393         movq %rdi, %rax
 394         andq $12, %rax
 395
 396         movq %rsi, %rbx
 397         andq $12, %rbx
 398
 399         cmp %rax,%rbx
 400         jne .AGA_ENDLOOP
 401
 402         cmp $0, %rax
 403         jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
 404
 405 #; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
 406 .AGA_ALIGNLOOP:
 407
 408         movss (%rdi), %xmm0 #; buf => xmm0
 409         movss (%rsi), %xmm1 #; gain value => xmm1
 410         mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
 411         movss %xmm0, (%rdi) #; signal with gain => buf
 412
 413         decq %rdx
 414         jz .AGA_END
 415
 416         addq $4, %rdi #; buf++
 417         addq $4, %rsi #; gab++
 418
 419         addq $4, %rax
 420         cmp $16, %rax
 421         jne .AGA_ALIGNLOOP
 422
 423 #; There are frames left for sure, as that is checked in the beginning
 424 #; and within the previous loop. BUT, there might be less than 4 frames
 425 #; to process
 426
 427 .AGA_SSE:
 428         movq %rdx, %rax #; nframes => %rax
 429         shr $2, %rax #; unsigned divide by 4
 430
 431         cmp $0, %rax  #; Jos toimii ilman t�t�, niin kiva
 432         je .AGA_ENDLOOP
 433
 434 .AGA_SSELOOP:
 435         movaps (%rdi), %xmm0
 436         movaps (%rsi), %xmm1
 437         mulps %xmm1, %xmm0
 438         movaps %xmm0, (%rdi)
 439
 440         addq $16, %rdi
 441         addq $16, %rsi
 442
 443         decq %rax
 444         jnz .AGA_SSELOOP
 445
 446         andq $3, %rdx #; Remaining frames are nframes & 3
 447         jz .AGA_END
 448
 449
 450 #; Inside this loop, we know there are frames left to process
 451 #; but because either there are < 4 frames left, or the buffers
 452 #; are not aligned, we can't use the parallel SSE ops
 453 .AGA_ENDLOOP:
 454         movss (%rdi), %xmm0 #; buf => xmm0
 455         movss (%rsi), %xmm1 #; gain value => xmm1
 456         mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
 457         movss %xmm0, (%rdi) #; signal with gain => buf
 458
 459         addq $4,%rdi
 460         addq $4,%rsi
 461         decq %rdx #; nframes--
 462         jnz .AGA_ENDLOOP
 463
 464 .AGA_END:
 465
 466         popq %rbx
 467         popq %rsi
 468         popq %rdi
 469
 470         leave
 471         ret
 472
 473 .size   x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
 474 #; end proc
 475
 476
 477 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
 478
 479 .globl x86_sse_compute_peak
 480         .type   x86_sse_compute_peak,@function
 481
 482 abs_mask:
 483         .long   2147483647
 484
 485
 486 x86_sse_compute_peak:
 487
 488 #; %rdi  float          *buf    32(%rbp)
 489 #; %rsi  unsigned int   nframes
 490 #; %xmm0 float          current
 491 #; %xmm1 float          buf[0]
 492
 493         pushq %rbp
 494         movq %rsp, %rbp
 495
 496         #; save %rdi
 497         pushq %rdi
 498
 499         #; if nframes == 0, go to end
 500         movq %rsi, %rcx #; nframes
 501         cmp     $0, %rcx
 502         je      .CP_END
 503
 504         #; create the "abs" mask in %xmm2
 505         movss   abs_mask, %xmm2
 506         shufps  $0x00, %xmm2, %xmm2
 507
 508         #; Check for alignment
 509
 510         #;movq 8(%rbp), %rdi #; buf
 511         movq %rdi, %rdx #; buf => %rdx
 512         andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 513         jz      .CP_SSE #; if buffer IS aligned
 514
 515         #; PRE-LOOP
 516         #; we iterate 1-3 times, doing normal x87 float comparison
 517         #; so we reach a 16 byte aligned "buf" (=%rdi) value
 518
 519 .LP_START:
 520
 521         #; Load next value from the buffer
 522         movss (%rdi), %xmm1
 523         andps %xmm2, %xmm1
 524         maxss %xmm1, %xmm0
 525
 526         #; increment buffer, decrement counter
 527         addq $4, %rdi #; buf++;
 528
 529         decq %rcx   #; nframes--
 530         jz      .CP_END #; if we run out of frames, we go to the end
 531
 532         addq $4, %rdx #; one non-aligned byte less
 533         cmp $16, %rdx
 534         jne .LP_START #; if more non-aligned frames exist, we do a do-over
 535
 536 .CP_SSE:
 537
 538         #; We have reached the 16 byte aligned "buf" ("rdi") value
 539
 540         #; Figure out how many loops we should do
 541         movq %rcx, %rax #; copy remaining nframes to %rax for division
 542
 543         shr $2,%rax #; unsigned divide by 4
 544         jz .POST_START
 545
 546         #; %rax = SSE iterations
 547
 548         #; current maximum is at %xmm0, but we need to ..
 549         shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
 550
 551         #;prefetcht0 16(%rdi)
 552
 553 .LP_SSE:
 554
 555         movaps (%rdi), %xmm1
 556         andps %xmm2, %xmm1
 557         maxps %xmm1, %xmm0
 558
 559         addq $16, %rdi
 560
 561         decq %rax
 562         jnz .LP_SSE
 563
 564         #; Calculate the maximum value contained in the 4 FP's in %xmm0
 565         movaps %xmm0, %xmm1
 566         shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
 567         maxps  %xmm1, %xmm0 #; maximums of the two pairs
 568         movaps %xmm0, %xmm1
 569         shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
 570         maxps  %xmm1, %xmm0
 571
 572         #; now every float in %xmm0 is the same value, current maximum value
 573
 574         #; Next we need to post-process all remaining frames
 575         #; the remaining frame count is in %rcx
 576
 577         #; if no remaining frames, jump to the end
 578
 579         andq $3, %rcx #; nframes % 4
 580         jz .CP_END
 581
 582 .POST_START:
 583
 584         movss (%rdi), %xmm1
 585         andps %xmm2, %xmm1
 586         maxss %xmm1, %xmm0
 587
 588         addq $4, %rdi   #; buf++;
 589
 590         decq %rcx               #; nframes--;
 591         jnz .POST_START
 592
 593 .CP_END:
 594
 595         popq %rdi
 596
 597         #; return
 598         leave
 599         ret
 600
 601 .size   x86_sse_compute_peak, .-x86_sse_compute_peak
 602 #; end proc