add new debug bit for backend callbacks
[ardour.git] / libs / ardour / sse_functions_xmm.cc
1 /*
2  * Copyright (C) 2007-2009 David Robillard <d@drobilla.net>
3  * Copyright (C) 2007-2015 Paul Davis <paul@linuxaudiosystems.com>
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18  */
19
20 #include <xmmintrin.h>
21 #include "ardour/types.h"
22
23 void
24 x86_sse_find_peaks(const ARDOUR::Sample* buf, ARDOUR::pframes_t nframes, float *min, float *max)
25 {
26         __m128 current_max, current_min, work;
27
28         // Load max and min values into all four slots of the XMM registers
29         current_min = _mm_set1_ps(*min);
30         current_max = _mm_set1_ps(*max);
31
32         // Work input until "buf" reaches 16 byte alignment
33         while ( ((intptr_t)buf) % 16 != 0 && nframes > 0) {
34
35                 // Load the next float into the work buffer
36                 work = _mm_set1_ps(*buf);
37
38                 current_min = _mm_min_ps(current_min, work);
39                 current_max = _mm_max_ps(current_max, work);
40
41                 buf++;
42                 nframes--;
43         }
44
45         // use 64 byte prefetch for quadruple quads
46         while (nframes >= 16) {
47 #ifdef COMPILER_MSVC
48                                 _mm_prefetch(((char*)buf+64), 0);  // A total guess! Assumed to be eqivalent to
49 #else                                              // the line below but waiting to be tested !!
50                 __builtin_prefetch(buf+64,0,0);
51 #endif
52                 work = _mm_load_ps(buf);
53                 current_min = _mm_min_ps(current_min, work);
54                 current_max = _mm_max_ps(current_max, work);
55                 buf+=4;
56                 work = _mm_load_ps(buf);
57                 current_min = _mm_min_ps(current_min, work);
58                 current_max = _mm_max_ps(current_max, work);
59                 buf+=4;
60                 work = _mm_load_ps(buf);
61                 current_min = _mm_min_ps(current_min, work);
62                 current_max = _mm_max_ps(current_max, work);
63                 buf+=4;
64                 work = _mm_load_ps(buf);
65                 current_min = _mm_min_ps(current_min, work);
66                 current_max = _mm_max_ps(current_max, work);
67                 buf+=4;
68                 nframes-=16;
69         }
70
71         // work through aligned buffers
72         while (nframes >= 4) {
73
74                 work = _mm_load_ps(buf);
75
76                 current_min = _mm_min_ps(current_min, work);
77                 current_max = _mm_max_ps(current_max, work);
78
79                 buf+=4;
80                 nframes-=4;
81         }
82
83         // work through the rest < 4 samples
84         while ( nframes > 0) {
85
86                 // Load the next float into the work buffer
87                 work = _mm_set1_ps(*buf);
88
89                 current_min = _mm_min_ps(current_min, work);
90                 current_max = _mm_max_ps(current_max, work);
91
92                 buf++;
93                 nframes--;
94         }
95
96         // Find min & max value in current_max through shuffle tricks
97
98         work = current_min;
99         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
100         work = _mm_min_ps (work, current_min);
101         current_min = work;
102         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
103         work = _mm_min_ps (work, current_min);
104
105         _mm_store_ss(min, work);
106
107         work = current_max;
108         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
109         work = _mm_max_ps (work, current_max);
110         current_max = work;
111         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
112         work = _mm_max_ps (work, current_max);
113
114         _mm_store_ss(max, work);
115 }
116
117
118