libs/ardour/sse_functions_xmm.cc

   1 /*
   2  * Copyright (C) 2007-2009 David Robillard <d@drobilla.net>
   3  * Copyright (C) 2007-2015 Paul Davis <paul@linuxaudiosystems.com>
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18  */
  19
  20 #include <xmmintrin.h>
  21 #include "ardour/types.h"
  22
  23 void
  24 x86_sse_find_peaks(const ARDOUR::Sample* buf, ARDOUR::pframes_t nframes, float *min, float *max)
  25 {
  26         __m128 current_max, current_min, work;
  27
  28         // Load max and min values into all four slots of the XMM registers
  29         current_min = _mm_set1_ps(*min);
  30         current_max = _mm_set1_ps(*max);
  31
  32         // Work input until "buf" reaches 16 byte alignment
  33         while ( ((intptr_t)buf) % 16 != 0 && nframes > 0) {
  34
  35                 // Load the next float into the work buffer
  36                 work = _mm_set1_ps(*buf);
  37
  38                 current_min = _mm_min_ps(current_min, work);
  39                 current_max = _mm_max_ps(current_max, work);
  40
  41                 buf++;
  42                 nframes--;
  43         }
  44
  45         // use 64 byte prefetch for quadruple quads
  46         while (nframes >= 16) {
  47 #ifdef COMPILER_MSVC
  48                                 _mm_prefetch(((char*)buf+64), 0);  // A total guess! Assumed to be eqivalent to
  49 #else                                              // the line below but waiting to be tested !!
  50                 __builtin_prefetch(buf+64,0,0);
  51 #endif
  52                 work = _mm_load_ps(buf);
  53                 current_min = _mm_min_ps(current_min, work);
  54                 current_max = _mm_max_ps(current_max, work);
  55                 buf+=4;
  56                 work = _mm_load_ps(buf);
  57                 current_min = _mm_min_ps(current_min, work);
  58                 current_max = _mm_max_ps(current_max, work);
  59                 buf+=4;
  60                 work = _mm_load_ps(buf);
  61                 current_min = _mm_min_ps(current_min, work);
  62                 current_max = _mm_max_ps(current_max, work);
  63                 buf+=4;
  64                 work = _mm_load_ps(buf);
  65                 current_min = _mm_min_ps(current_min, work);
  66                 current_max = _mm_max_ps(current_max, work);
  67                 buf+=4;
  68                 nframes-=16;
  69         }
  70
  71         // work through aligned buffers
  72         while (nframes >= 4) {
  73
  74                 work = _mm_load_ps(buf);
  75
  76                 current_min = _mm_min_ps(current_min, work);
  77                 current_max = _mm_max_ps(current_max, work);
  78
  79                 buf+=4;
  80                 nframes-=4;
  81         }
  82
  83         // work through the rest < 4 samples
  84         while ( nframes > 0) {
  85
  86                 // Load the next float into the work buffer
  87                 work = _mm_set1_ps(*buf);
  88
  89                 current_min = _mm_min_ps(current_min, work);
  90                 current_max = _mm_max_ps(current_max, work);
  91
  92                 buf++;
  93                 nframes--;
  94         }
  95
  96         // Find min & max value in current_max through shuffle tricks
  97
  98         work = current_min;
  99         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
 100         work = _mm_min_ps (work, current_min);
 101         current_min = work;
 102         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
 103         work = _mm_min_ps (work, current_min);
 104
 105         _mm_store_ss(min, work);
 106
 107         work = current_max;
 108         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
 109         work = _mm_max_ps (work, current_max);
 110         current_max = work;
 111         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
 112         work = _mm_max_ps (work, current_max);
 113
 114         _mm_store_ss(max, work);
 115 }
 116
 117
 118