Added optimized AVX function for sample processing
author Paul Davis <paul@linuxaudiosystems.com>
Wed, 13 May 2015 01:07:09 +0000 (21:07 -0400)
committer Paul Davis <paul@linuxaudiosystems.com>
Mon, 29 Jun 2015 18:18:13 +0000 (14:18 -0400)
Added AVX versions of the 5 existing SSE functions. Added a 6th AVX function to copy vectors, which is 1.5 times faster than memcpy.
Data consistency and validity were fully tested after processing with the new AVX functions on aligned and non-aligned buffers.

libs/ardour/ardour/mix.h
libs/ardour/ardour/runtime_functions.h
libs/ardour/globals.cc
libs/ardour/mix.cc
libs/ardour/wscript
libs/backends/wavesaudio/waves_audiobackend.cc
libs/backends/wavesaudio/waves_audioport.cc
libs/pbd/pbd/fpu.h
libs/pbd/wscript
wscript

index 3cd9a3e60f24e76693055f765a0ee4da40a5b29d..2db444d02b6a81f59cad21802668c1426488f575 100644 (file)
@@ -33,7 +33,17 @@ extern "C" {
        LIBARDOUR_API void  x86_sse_mix_buffers_no_gain  (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
 }
 
+extern "C" {
+/* AVX functions */
+       LIBARDOUR_API float x86_sse_avx_compute_peak         (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
+       LIBARDOUR_API void  x86_sse_avx_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
+       LIBARDOUR_API void  x86_sse_avx_mix_buffers_with_gain(ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
+       LIBARDOUR_API void  x86_sse_avx_mix_buffers_no_gain  (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+       LIBARDOUR_API void  x86_sse_avx_copy_vector          (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+}
+
 LIBARDOUR_API void  x86_sse_find_peaks               (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
+LIBARDOUR_API void  x86_sse_avx_find_peaks               (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
 
 /* debug wrappers for SSE functions */
 
@@ -41,6 +51,7 @@ LIBARDOUR_API float debug_compute_peak               (const ARDOUR::Sample * buf
 LIBARDOUR_API void  debug_apply_gain_to_buffer       (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  debug_mix_buffers_with_gain      (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  debug_mix_buffers_no_gain        (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+LIBARDOUR_API void  debug_copy_vector                (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
 
 #endif
 
@@ -61,5 +72,6 @@ LIBARDOUR_API void  default_find_peaks                (const ARDOUR::Sample * bu
 LIBARDOUR_API void  default_apply_gain_to_buffer      (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  default_mix_buffers_with_gain     (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  default_mix_buffers_no_gain       (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+LIBARDOUR_API void  default_copy_vector                                  (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
 
 #endif /* __ardour_mix_h__ */
index e1d6b99f6150bb09b2dad308f7abedbbb87afeb8..45d6ec7015c3891cff46ba501c67a7e513df58f8 100644 (file)
 
 namespace ARDOUR {
 
-       typedef float (*compute_peak_t)                 (const ARDOUR::Sample *, pframes_t, float);
-       typedef void  (*find_peaks_t)                   (const ARDOUR::Sample *, pframes_t, float *, float*);
+       typedef float (*compute_peak_t)                     (const ARDOUR::Sample *, pframes_t, float);
+       typedef void  (*find_peaks_t)               (const ARDOUR::Sample *, pframes_t, float *, float*);
        typedef void  (*apply_gain_to_buffer_t)         (ARDOUR::Sample *, pframes_t, float);
        typedef void  (*mix_buffers_with_gain_t)        (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t, float);
        typedef void  (*mix_buffers_no_gain_t)          (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
+       typedef void  (*copy_vector_t)                      (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
 
        LIBARDOUR_API extern compute_peak_t             compute_peak;
        LIBARDOUR_API extern find_peaks_t               find_peaks;
        LIBARDOUR_API extern apply_gain_to_buffer_t     apply_gain_to_buffer;
        LIBARDOUR_API extern mix_buffers_with_gain_t    mix_buffers_with_gain;
        LIBARDOUR_API extern mix_buffers_no_gain_t      mix_buffers_no_gain;
+       LIBARDOUR_API extern copy_vector_t                      copy_vector;
 }
 
 #endif /* __ardour_runtime_functions_h__ */
index 288e69dc9e55161c221f93b6a7ca8d4cfcd4ccff..fa6f833d94eb29fe64980321e4f4a7228efdf9db 100644 (file)
@@ -131,6 +131,7 @@ find_peaks_t            ARDOUR::find_peaks = 0;
 apply_gain_to_buffer_t  ARDOUR::apply_gain_to_buffer = 0;
 mix_buffers_with_gain_t ARDOUR::mix_buffers_with_gain = 0;
 mix_buffers_no_gain_t   ARDOUR::mix_buffers_no_gain = 0;
+copy_vector_t                  ARDOUR::copy_vector = 0;
 
 PBD::Signal1<void,std::string> ARDOUR::BootMessage;
 PBD::Signal3<void,std::string,std::string,bool> ARDOUR::PluginScanMessage;
@@ -160,7 +161,21 @@ setup_hardware_optimization (bool try_optimization)
 
 #if defined (ARCH_X86) && defined (BUILD_SSE_OPTIMIZATIONS)
 
-               if (fpu.has_sse()) {
+               if (fpu.has_avx()) {
+
+                       info << "Using AVX optimized routines" << endmsg;
+
+                       // AVX SET
+                       compute_peak          = x86_sse_avx_compute_peak;
+                       find_peaks            = x86_sse_avx_find_peaks;
+                       apply_gain_to_buffer  = x86_sse_avx_apply_gain_to_buffer;
+                       mix_buffers_with_gain = x86_sse_avx_mix_buffers_with_gain;
+                       mix_buffers_no_gain   = x86_sse_avx_mix_buffers_no_gain;
+                       copy_vector           = x86_sse_avx_copy_vector;
+
+                       generic_mix_functions = false;
+
+               } else if (fpu.has_sse()) {
 
                        info << "Using SSE optimized routines" << endmsg;
 
@@ -170,6 +185,7 @@ setup_hardware_optimization (bool try_optimization)
                        apply_gain_to_buffer  = x86_sse_apply_gain_to_buffer;
                        mix_buffers_with_gain = x86_sse_mix_buffers_with_gain;
                        mix_buffers_no_gain   = x86_sse_mix_buffers_no_gain;
+                       copy_vector           = default_copy_vector;
 
                        generic_mix_functions = false;
 
@@ -187,6 +203,7 @@ setup_hardware_optimization (bool try_optimization)
                        apply_gain_to_buffer   = veclib_apply_gain_to_buffer;
                        mix_buffers_with_gain  = veclib_mix_buffers_with_gain;
                        mix_buffers_no_gain    = veclib_mix_buffers_no_gain;
+                       copy_vector            = default_copy_vector;
 
                        generic_mix_functions = false;
 
@@ -206,6 +223,7 @@ setup_hardware_optimization (bool try_optimization)
                apply_gain_to_buffer  = default_apply_gain_to_buffer;
                mix_buffers_with_gain = default_mix_buffers_with_gain;
                mix_buffers_no_gain   = default_mix_buffers_no_gain;
+               copy_vector           = default_copy_vector;
 
                info << "No H/W specific optimizations in use" << endmsg;
        }
index adae68ae7f0305e69544231d958b02dce3dc316b..96ae62448703a9d0406cc8e7eee91939a353782a 100644 (file)
@@ -136,6 +136,12 @@ default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, p
        }
 }
 
+void
+default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pframes_t nframes)
+{
+       memcpy(dst, src, nframes*sizeof(ARDOUR::Sample));
+}
+
 #if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
 #include <Accelerate/Accelerate.h>
 
index 115e12cbecff34649a9ec04aa02b0c879b607a7a..04b99785e569b6253a502d5d8a7cc8333c09aafc 100644 (file)
@@ -417,8 +417,12 @@ def build(bld):
                # not the build host, which in turn can only be inferred from the name
                # of the compiler. 
                if re.search ('/^x86_64/', str(bld.env['CC'])):
-                       obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit_win.s' ]
-
+                       obj.source += [ 'sse_functions_xmm.cc',
+                                       'sse_functions_avx.cc',
+                                       'sse_functions_64bit_win.s',
+                                       'sse_avx_functions_64bit_win.s',
+                                     ]
+        
     # i18n
     if bld.is_defined('ENABLE_NLS'):
         mo_files = bld.path.ant_glob('po/*.mo')
index 5a8fac0a6acb0d2eb5ab2390b7474d8a58d3f994..7fd6da2f39a3e99a2633ec075c6dd513c075f660 100644 (file)
@@ -21,6 +21,8 @@
 #include "waves_audioport.h"
 #include "waves_midiport.h"
 
+#include "ardour/runtime_functions.h"
+
 using namespace ARDOUR;
 
 #if defined __MINGW64__ || defined __MINGW32__
@@ -1170,13 +1172,12 @@ WavesAudioBackend::_read_audio_data_from_device (const float* input_buffer, pfra
 {
 #if defined(PLATFORM_WINDOWS)
     const float **buffer = (const float**)input_buffer;
-    size_t copied_bytes = nframes*sizeof(float);
 
     for(std::vector<WavesAudioPort*>::iterator it = _physical_audio_inputs.begin ();
         it != _physical_audio_inputs.end();
         ++it)
     {
-        memcpy((*it)->buffer(), *buffer, copied_bytes);
+               ARDOUR::copy_vector ((*it)->buffer(), *buffer, nframes);
         ++buffer;
     }
 #else
index 4ded37d906990826919cf1c5c7497979c83b44fc..1249f4d31e39b2b922288633d2a53240245f1396 100644 (file)
@@ -35,20 +35,24 @@ void* WavesAudioPort::get_buffer (pframes_t nframes)
         std::vector<WavesDataPort*>::const_iterator it = get_connections ().begin ();
         
         if (it != get_connections ().end ()) {
-            /* In fact, the static casting to (const WavesAudioPort*) is not that safe.
-             * However, mixing the buffers is assumed in the time critical conditions.
-             * Base class WavesDataPort takes is supposed to provide enough consistentcy
-             * of the connections.
-             */
-            for (memcpy (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes * sizeof (Sample)), ++it;
-                                it != get_connections ().end ();
-                                ++it) {
-                Sample* tgt = buffer ();
-                const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
-                for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src)    {
-                    *tgt += *src;
-                }
-            }
+               /* In fact, the static casting to (const WavesAudioPort*) is not that safe.
+                * However, mixing the buffers is assumed in the time critical conditions.
+                * Base class WavesDataPort takes is supposed to provide enough consistentcy
+                * of the connections.
+                */
+               // get first buffer data
+               // use optimized function to fill the buffer intialy
+               ARDOUR::copy_vector (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes);
+               ++it;
+               
+               // mix the rest
+               for (; it != get_connections ().end (); ++it) {
+                       Sample* tgt = buffer ();
+                       const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
+                       for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src)    {
+                               *tgt += *src;
+                       }
+               }
         }
     }
     return _buffer;
@@ -59,4 +63,4 @@ void
 WavesAudioPort::_wipe_buffer()
 {
        memset (_buffer, 0, sizeof (_buffer));
-}
\ No newline at end of file
+}
index 6627951e9f32514b27972ec87743db276b69b848..260cf4db85937950386aaa7efee98547b0fc1113 100644 (file)
@@ -30,7 +30,8 @@ class LIBPBD_API FPU {
                HasFlushToZero = 0x1,
                HasDenormalsAreZero = 0x2,
                HasSSE = 0x4,
-               HasSSE2 = 0x8
+               HasSSE2 = 0x8,
+               HasAVX = 0x10
        };
 
   public:
@@ -41,6 +42,7 @@ class LIBPBD_API FPU {
        bool has_denormals_are_zero () const { return _flags & HasDenormalsAreZero; }
        bool has_sse () const { return _flags & HasSSE; }
        bool has_sse2 () const { return _flags & HasSSE2; }
+       bool has_avx () const { return _flags & HasAVX; }
        
   private:
        Flags _flags;
index 8f947fbb2658d94887f9ac7d5ec9fddf5c1f77f1..27617adfa9a6ec04855af4716ef681d84fbd76f8 100644 (file)
@@ -145,6 +145,7 @@ def build(bld):
     if bld.env['build_target'] == 'x86_64':
         obj.defines += [ 'USE_X86_64_ASM' ]
     if bld.env['build_target'] == 'mingw':
+        obj.defines += [ 'NO_POSIX_MEMALIGN' ]
         obj.source += [ 'windows_special_dirs.cc' ]
         obj.uselib += ' OLE'
 
diff --git a/wscript b/wscript
index 84a8fb3e6c4a6db357cf7922718f1d2e7daa53bc..e9c1da8f06006bf7fb56d9c233348d890ed2a730 100644 (file)
--- a/wscript
+++ b/wscript
@@ -417,12 +417,12 @@ int main() { return 0; }''',
             if (re.search ("(x86_64|AMD64)", cpu) != None):
                 # on Windows sse is supported by 64 bit platforms only
                 build_host_supports_sse = True
-                
+
                 # mingw GCC compiler to uses at&t (Unix specific) assembler dialect by default
                 # compiler_flags.append (["--mmnemonic=att", "msyntax=att")
                 
                 compiler_flags.extend ([ flags_dict['sse'], flags_dict['fpmath-sse'], flags_dict['xmmintrinsics'], flags_dict['attasm'] ])
-                
+
     # end of processor-specific section
 
     # optimization section