X-Git-Url: https://main.carlh.net/gitweb/?a=blobdiff_plain;f=libs%2Fardour%2Fsse_functions_xmm.cc;h=48212ea8e1b839363cc08e45e2ce2e86222cf72b;hb=751cc84dd2010a2c669a0155c0e2a0ce47d16592;hp=7b5ea143ecdd6e62bc48575d6e248c1731428897;hpb=75d2f51193f6fd25881a9c766db9078f3b68d80e;p=ardour.git diff --git a/libs/ardour/sse_functions_xmm.cc b/libs/ardour/sse_functions_xmm.cc index 7b5ea143ec..48212ea8e1 100644 --- a/libs/ardour/sse_functions_xmm.cc +++ b/libs/ardour/sse_functions_xmm.cc @@ -1,6 +1,6 @@ /* - Copyright (C) 2007 Paul Davis - Written by Sampo Savolainen + Copyright (C) 2007 Paul sDavis + Written by Sampo Savolainen This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,10 +19,10 @@ */ #include <xmmintrin.h> -#include <ardour/types.h> +#include "ardour/types.h" void -x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max) +x86_sse_find_peaks(const ARDOUR::Sample* buf, ARDOUR::pframes_t nframes, float *min, float *max) { __m128 current_max, current_min, work; @@ -31,18 +31,41 @@ x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max) current_max = _mm_set1_ps(*max); // Work input until "buf" reaches 16 byte alignment - while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) { + while ( ((intptr_t)buf) % 16 != 0 && nframes > 0) { // Load the next float into the work buffer work = _mm_set1_ps(*buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); - + buf++; nframes--; } + // use 64 byte prefetch for quadruple quads + while (nframes >= 16) { + __builtin_prefetch(buf+64,0,0); + + work = _mm_load_ps(buf); + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + buf+=4; + work = _mm_load_ps(buf); + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + buf+=4; + work = _mm_load_ps(buf); + current_min = _mm_min_ps(current_min, work); + current_max = 
_mm_max_ps(current_max, work); + buf+=4; + work = _mm_load_ps(buf); + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + buf+=4; + nframes-=16; + } + // work through aligned buffers while (nframes >= 4) { @@ -54,7 +77,7 @@ x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max) buf+=4; nframes-=4; } - + // work through the rest < 4 samples while ( nframes > 0) { @@ -63,7 +86,7 @@ x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max) current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); - + buf++; nframes--; }