add some assert for reloading saved plugin pin connections
[ardour.git] / libs / ardour / sse_functions_xmm.cc
index 7b5ea143ecdd6e62bc48575d6e248c1731428897..6eac488a253a3af89df5f7044a1e60d07ed126b2 100644 (file)
@@ -1,6 +1,6 @@
 /*
-    Copyright (C) 2007 Paul Davis
-       Written by Sampo Savolainen
+    Copyright (C) 2007 Paul Davis
+    Written by Sampo Savolainen
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
 */
 
 #include <xmmintrin.h>
-#include <ardour/types.h>
+#include "ardour/types.h"
 
 void
-x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
+x86_sse_find_peaks(const ARDOUR::Sample* buf, ARDOUR::pframes_t nframes, float *min, float *max)
 {
        __m128 current_max, current_min, work;
 
@@ -31,18 +31,44 @@ x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
        current_max = _mm_set1_ps(*max);
 
        // Work input until "buf" reaches 16 byte alignment
-       while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) {
+       while ( ((intptr_t)buf) % 16 != 0 && nframes > 0) {
 
                // Load the next float into the work buffer
                work = _mm_set1_ps(*buf);
 
                current_min = _mm_min_ps(current_min, work);
                current_max = _mm_max_ps(current_max, work);
-               
+
                buf++;
                nframes--;
        }
 
+        // use 64 byte prefetch for quadruple quads
+        while (nframes >= 16) {
+#ifdef COMPILER_MSVC
                               _mm_prefetch(((char*)buf+64), 0);  // A total guess! Assumed to be equivalent to
+#else                                              // the line below but waiting to be tested !!
+                __builtin_prefetch(buf+64,0,0);
+#endif
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                work = _mm_load_ps(buf);
+                current_min = _mm_min_ps(current_min, work);
+                current_max = _mm_max_ps(current_max, work);
+                buf+=4;
+                nframes-=16;
+        }
+
        // work through aligned buffers
        while (nframes >= 4) {
 
@@ -54,7 +80,7 @@ x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
                buf+=4;
                nframes-=4;
        }
-       
+
        // work through the rest < 4 samples
        while ( nframes > 0) {
 
@@ -63,7 +89,7 @@ x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
 
                current_min = _mm_min_ps(current_min, work);
                current_max = _mm_max_ps(current_max, work);
-               
+
                buf++;
                nframes--;
        }