/*
 * Copyright (C) 2007-2009 David Robillard <d@drobilla.net>
 * Copyright (C) 2007-2015 Paul Davis <paul@linuxaudiosystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#include <stdint.h>
#include <xmmintrin.h>

#include "ardour/types.h"
24 x86_sse_find_peaks(const ARDOUR::Sample* buf, ARDOUR::pframes_t nframes, float *min, float *max)
26 __m128 current_max, current_min, work;
28 // Load max and min values into all four slots of the XMM registers
29 current_min = _mm_set1_ps(*min);
30 current_max = _mm_set1_ps(*max);
32 // Work input until "buf" reaches 16 byte alignment
33 while ( ((intptr_t)buf) % 16 != 0 && nframes > 0) {
35 // Load the next float into the work buffer
36 work = _mm_set1_ps(*buf);
38 current_min = _mm_min_ps(current_min, work);
39 current_max = _mm_max_ps(current_max, work);
45 // use 64 byte prefetch for quadruple quads
46 while (nframes >= 16) {
48 _mm_prefetch(((char*)buf+64), 0); // A total guess! Assumed to be eqivalent to
49 #else // the line below but waiting to be tested !!
50 __builtin_prefetch(buf+64,0,0);
52 work = _mm_load_ps(buf);
53 current_min = _mm_min_ps(current_min, work);
54 current_max = _mm_max_ps(current_max, work);
56 work = _mm_load_ps(buf);
57 current_min = _mm_min_ps(current_min, work);
58 current_max = _mm_max_ps(current_max, work);
60 work = _mm_load_ps(buf);
61 current_min = _mm_min_ps(current_min, work);
62 current_max = _mm_max_ps(current_max, work);
64 work = _mm_load_ps(buf);
65 current_min = _mm_min_ps(current_min, work);
66 current_max = _mm_max_ps(current_max, work);
71 // work through aligned buffers
72 while (nframes >= 4) {
74 work = _mm_load_ps(buf);
76 current_min = _mm_min_ps(current_min, work);
77 current_max = _mm_max_ps(current_max, work);
83 // work through the rest < 4 samples
84 while ( nframes > 0) {
86 // Load the next float into the work buffer
87 work = _mm_set1_ps(*buf);
89 current_min = _mm_min_ps(current_min, work);
90 current_max = _mm_max_ps(current_max, work);
96 // Find min & max value in current_max through shuffle tricks
99 work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
100 work = _mm_min_ps (work, current_min);
102 work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
103 work = _mm_min_ps (work, current_min);
105 _mm_store_ss(min, work);
108 work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
109 work = _mm_max_ps (work, current_max);
111 work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
112 work = _mm_max_ps (work, current_max);
114 _mm_store_ss(max, work);