Sort various things to reduce merge hell. No functional changes.
[ardour.git] / libs / ardour / sse_functions_xmm.cc
index d4330eb37ff2260935342e0555c7031caca2e565..9b37c37912c08ad0965e4e3bf3d2694f48abdcf1 100644 (file)
@@ -22,7 +22,7 @@
 #include <ardour/types.h>
 
 void
-x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
+x86_sse_find_peaks(const ARDOUR::Sample* buf, nframes_t nframes, float *min, float *max)
 {
        __m128 current_max, current_min, work;
 
@@ -43,6 +43,29 @@ x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
                nframes--;
        }
 
+        // unrolled loop: handle four quads (16 samples) per iteration, prefetching 64 samples ahead of buf
+        while (nframes >= 16) {
+                __builtin_prefetch(buf+64,0,0);
+
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                nframes-=16;
+        }
+
        // work through aligned buffers
        while (nframes >= 4) {