edit groups tab gets headers (carl); use sampo's SSE find_peaks code; fix build for...
[ardour.git] / libs / ardour / sse_functions_xmm.cc
1 /*
2     Copyright (C) 2007 Paul Davis
3         Written by Sampo Savolainen
4
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14
15     You should have received a copy of the GNU General Public License
16     along with this program; if not, write to the Free Software
17     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
19 */
20
#include <stdint.h>
#include <xmmintrin.h>
#include <ardour/types.h>
23
24 void
25 x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
26 {
27         __m128 current_max, current_min, work;
28
29         // Load max and min values into all four slots of the XMM registers
30         current_min = _mm_set1_ps(*min);
31         current_max = _mm_set1_ps(*max);
32
33         // Work input until "buf" reaches 16 byte alignment
34         while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) {
35
36                 // Load the next float into the work buffer
37                 work = _mm_set1_ps(*buf);
38
39                 current_min = _mm_min_ps(current_min, work);
40                 current_max = _mm_max_ps(current_max, work);
41                 
42                 buf++;
43                 nframes--;
44         }
45
46         // work through aligned buffers
47         while (nframes >= 4) {
48
49                 work = _mm_load_ps(buf);
50
51                 current_min = _mm_min_ps(current_min, work);
52                 current_max = _mm_max_ps(current_max, work);
53
54                 buf+=4;
55                 nframes-=4;
56         }
57         
58         // work through the rest < 4 samples
59         while ( nframes > 0) {
60
61                 // Load the next float into the work buffer
62                 work = _mm_set1_ps(*buf);
63
64                 current_min = _mm_min_ps(current_min, work);
65                 current_max = _mm_max_ps(current_max, work);
66                 
67                 buf++;
68                 nframes--;
69         }
70
71         // Find min & max value in current_max through shuffle tricks
72
73         work = current_min;
74         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
75         work = _mm_min_ps (work, current_min);
76         current_min = work;
77         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
78         work = _mm_min_ps (work, current_min);
79
80         _mm_store_ss(min, work);
81
82         work = current_max;
83         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
84         work = _mm_max_ps (work, current_max);
85         current_max = work;
86         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
87         work = _mm_max_ps (work, current_max);
88
89         _mm_store_ss(max, work);
90 }
91
92
93