Strip trailing whitespace and fix other whitespace errors (e.g. space/tab mixing...
[ardour.git] / libs / ardour / sse_functions_xmm.cc
1 /*
2     Copyright (C) 2007 Paul Davis
3     Written by Sampo Savolainen
4
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14
15     You should have received a copy of the GNU General Public License
16     along with this program; if not, write to the Free Software
17     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
19 */
20
#include <stdint.h>
#include <xmmintrin.h>

#include "ardour/types.h"
23
24 void
25 x86_sse_find_peaks(const ARDOUR::Sample* buf, nframes_t nframes, float *min, float *max)
26 {
27         __m128 current_max, current_min, work;
28
29         // Load max and min values into all four slots of the XMM registers
30         current_min = _mm_set1_ps(*min);
31         current_max = _mm_set1_ps(*max);
32
33         // Work input until "buf" reaches 16 byte alignment
34         while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) {
35
36                 // Load the next float into the work buffer
37                 work = _mm_set1_ps(*buf);
38
39                 current_min = _mm_min_ps(current_min, work);
40                 current_max = _mm_max_ps(current_max, work);
41
42                 buf++;
43                 nframes--;
44         }
45
46         // use 64 byte prefetch for quadruple quads
47         while (nframes >= 16) {
48                 __builtin_prefetch(buf+64,0,0);
49
50                 work = _mm_load_ps(buf);
51                 current_min = _mm_min_ps(current_min, work);
52                 current_max = _mm_max_ps(current_max, work);
53                 buf+=4;
54                 work = _mm_load_ps(buf);
55                 current_min = _mm_min_ps(current_min, work);
56                 current_max = _mm_max_ps(current_max, work);
57                 buf+=4;
58                 work = _mm_load_ps(buf);
59                 current_min = _mm_min_ps(current_min, work);
60                 current_max = _mm_max_ps(current_max, work);
61                 buf+=4;
62                 work = _mm_load_ps(buf);
63                 current_min = _mm_min_ps(current_min, work);
64                 current_max = _mm_max_ps(current_max, work);
65                 buf+=4;
66                 nframes-=16;
67         }
68
69         // work through aligned buffers
70         while (nframes >= 4) {
71
72                 work = _mm_load_ps(buf);
73
74                 current_min = _mm_min_ps(current_min, work);
75                 current_max = _mm_max_ps(current_max, work);
76
77                 buf+=4;
78                 nframes-=4;
79         }
80
81         // work through the rest < 4 samples
82         while ( nframes > 0) {
83
84                 // Load the next float into the work buffer
85                 work = _mm_set1_ps(*buf);
86
87                 current_min = _mm_min_ps(current_min, work);
88                 current_max = _mm_max_ps(current_max, work);
89
90                 buf++;
91                 nframes--;
92         }
93
94         // Find min & max value in current_max through shuffle tricks
95
96         work = current_min;
97         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
98         work = _mm_min_ps (work, current_min);
99         current_min = work;
100         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
101         work = _mm_min_ps (work, current_min);
102
103         _mm_store_ss(min, work);
104
105         work = current_max;
106         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
107         work = _mm_max_ps (work, current_max);
108         current_max = work;
109         work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
110         work = _mm_max_ps (work, current_max);
111
112         _mm_store_ss(max, work);
113 }
114
115
116