Sort various things to reduce merge hell. No functional changes.

[ardour.git] / libs / ardour / sse_functions_xmm.cc
diff --git a/libs/ardour/sse_functions_xmm.cc b/libs/ardour/sse_functions_xmm.cc

index d4330eb37ff2260935342e0555c7031caca2e565..9b37c37912c08ad0965e4e3bf3d2694f48abdcf1 100644 (file)
--- a/libs/ardour/sse_functions_xmm.cc
+++ b/libs/ardour/sse_functions_xmm.cc
@@ -22,7 +22,7 @@
  #include <ardour/types.h>
  
  void
-x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
+x86_sse_find_peaks(const ARDOUR::Sample* buf, nframes_t nframes, float *min, float *max)
  {
         __m128 current_max, current_min, work;
  
@@ -43,6 +43,29 @@ x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
                 nframes--;
         }
  
+        // unrolled: four quads (16 samples, 64 bytes) per pass, prefetching 64 samples ahead
+        while (nframes >= 16) {
+                __builtin_prefetch(buf+64,0,0);
+
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                nframes-=16;
+        }
+
         // work through aligned buffers
         while (nframes >= 4) {