remove unused member
[ardour.git] / libs / vamp-pyin / LocalCandidatePYIN.cpp
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
2
3 /*
4     pYIN - A fundamental frequency estimator for monophonic audio
5     Centre for Digital Music, Queen Mary, University of London.
6
7     This program is free software; you can redistribute it and/or
8     modify it under the terms of the GNU General Public License as
9     published by the Free Software Foundation; either version 2 of the
10     License, or (at your option) any later version.  See the file
11     COPYING included with this distribution for more information.
12 */
13
14 #include "LocalCandidatePYIN.h"
15 #include "MonoPitch.h"
16 #include "YinUtil.h"
17
18 #include "vamp-sdk/FFT.h"
19
20 #include <vector>
21 #include <algorithm>
22
23 #include <cstdio>
24 #include <sstream>
25 // #include <iostream>
26 #include <cmath>
27 #include <complex>
28 #include <map>
29
30 #include <boost/math/distributions.hpp>
31
32 using std::string;
33 using std::vector;
34 using std::map;
35 using Vamp::RealTime;
36
37
38 LocalCandidatePYIN::LocalCandidatePYIN(float inputSampleRate) :
39     Plugin(inputSampleRate),
40     m_channels(0),
41     m_stepSize(256),
42     m_blockSize(2048),
43     m_fmin(40),
44     m_fmax(700),
45     m_oPitchTrackCandidates(0),
46     m_threshDistr(2.0f),
47     m_outputUnvoiced(0.0f),
48     m_preciseTime(0.0f),
49     m_pitchProb(0),
50     m_timestamp(0),
51     m_nCandidate(13)
52 {
53 }
54
55 LocalCandidatePYIN::~LocalCandidatePYIN()
56 {
57 }
58
59 string
60 LocalCandidatePYIN::getIdentifier() const
61 {
62     return "localcandidatepyin";
63 }
64
65 string
66 LocalCandidatePYIN::getName() const
67 {
68     return "Local Candidate PYIN";
69 }
70
71 string
72 LocalCandidatePYIN::getDescription() const
73 {
74     return "Monophonic pitch and note tracking based on a probabilistic Yin extension.";
75 }
76
77 string
78 LocalCandidatePYIN::getMaker() const
79 {
80     return "Matthias Mauch";
81 }
82
83 int
84 LocalCandidatePYIN::getPluginVersion() const
85 {
86     // Increment this each time you release a version that behaves
87     // differently from the previous one
88     return 2;
89 }
90
91 string
92 LocalCandidatePYIN::getCopyright() const
93 {
94     return "GPL";
95 }
96
97 LocalCandidatePYIN::InputDomain
98 LocalCandidatePYIN::getInputDomain() const
99 {
100     return TimeDomain;
101 }
102
103 size_t
104 LocalCandidatePYIN::getPreferredBlockSize() const
105 {
106     return 2048;
107 }
108
109 size_t
110 LocalCandidatePYIN::getPreferredStepSize() const
111 {
112     return 256;
113 }
114
115 size_t
116 LocalCandidatePYIN::getMinChannelCount() const
117 {
118     return 1;
119 }
120
121 size_t
122 LocalCandidatePYIN::getMaxChannelCount() const
123 {
124     return 1;
125 }
126
127 LocalCandidatePYIN::ParameterList
128 LocalCandidatePYIN::getParameterDescriptors() const
129 {
130     ParameterList list;
131
132     ParameterDescriptor d;
133
134     d.identifier = "threshdistr";
135     d.name = "Yin threshold distribution";
136     d.description = ".";
137     d.unit = "";
138     d.minValue = 0.0f;
139     d.maxValue = 7.0f;
140     d.defaultValue = 2.0f;
141     d.isQuantized = true;
142     d.quantizeStep = 1.0f;
143     d.valueNames.push_back("Uniform");
144     d.valueNames.push_back("Beta (mean 0.10)");
145     d.valueNames.push_back("Beta (mean 0.15)");
146     d.valueNames.push_back("Beta (mean 0.20)");
147     d.valueNames.push_back("Beta (mean 0.30)");
148     d.valueNames.push_back("Single Value 0.10");
149     d.valueNames.push_back("Single Value 0.15");
150     d.valueNames.push_back("Single Value 0.20");
151     list.push_back(d);
152
153     d.identifier = "outputunvoiced";
154     d.valueNames.clear();
155     d.name = "Output estimates classified as unvoiced?";
156     d.description = ".";
157     d.unit = "";
158     d.minValue = 0.0f;
159     d.maxValue = 2.0f;
160     d.defaultValue = 0.0f;
161     d.isQuantized = true;
162     d.quantizeStep = 1.0f;
163     d.valueNames.push_back("No");
164     d.valueNames.push_back("Yes");
165     d.valueNames.push_back("Yes, as negative frequencies");
166     list.push_back(d);
167
168     d.identifier = "precisetime";
169     d.valueNames.clear();
170     d.name = "Use non-standard precise YIN timing (slow).";
171     d.description = ".";
172     d.unit = "";
173     d.minValue = 0.0f;
174     d.maxValue = 1.0f;
175     d.defaultValue = 0.0f;
176     d.isQuantized = true;
177     d.quantizeStep = 1.0f;
178     list.push_back(d);
179
180     return list;
181 }
182
183 float
184 LocalCandidatePYIN::getParameter(string identifier) const
185 {
186     if (identifier == "threshdistr") {
187             return m_threshDistr;
188     }
189     if (identifier == "outputunvoiced") {
190             return m_outputUnvoiced;
191     }
192     if (identifier == "precisetime") {
193             return m_preciseTime;
194     }
195     return 0.f;
196 }
197
198 void
199 LocalCandidatePYIN::setParameter(string identifier, float value)
200 {
201     if (identifier == "threshdistr")
202     {
203         m_threshDistr = value;
204     }
205     if (identifier == "outputunvoiced")
206     {
207         m_outputUnvoiced = value;
208     }
209     if (identifier == "precisetime")
210     {
211         m_preciseTime = value;
212     }
213 }
214
215 LocalCandidatePYIN::ProgramList
216 LocalCandidatePYIN::getPrograms() const
217 {
218     ProgramList list;
219     return list;
220 }
221
222 string
223 LocalCandidatePYIN::getCurrentProgram() const
224 {
225     return ""; // no programs
226 }
227
228 void
229 LocalCandidatePYIN::selectProgram(string name)
230 {
231 }
232
233 LocalCandidatePYIN::OutputList
234 LocalCandidatePYIN::getOutputDescriptors() const
235 {
236     OutputList outputs;
237
238     OutputDescriptor d;
239
240     d.identifier = "pitchtrackcandidates";
241     d.name = "Pitch track candidates";
242     d.description = "Multiple candidate pitch tracks.";
243     d.unit = "Hz";
244     d.hasFixedBinCount = false;
245     d.hasKnownExtents = true;
246     d.minValue = m_fmin;
247     d.maxValue = 500; //!!!???
248     d.isQuantized = false;
249     d.sampleType = OutputDescriptor::FixedSampleRate;
250     d.sampleRate = (m_inputSampleRate / m_stepSize);
251     d.hasDuration = false;
252     outputs.push_back(d);
253
254     return outputs;
255 }
256
257 bool
258 LocalCandidatePYIN::initialise(size_t channels, size_t stepSize, size_t blockSize)
259 {
260     if (channels < getMinChannelCount() ||
261         channels > getMaxChannelCount()) return false;
262
263 /*
264     std::cerr << "LocalCandidatePYIN::initialise: channels = " << channels
265           << ", stepSize = " << stepSize << ", blockSize = " << blockSize
266           << std::endl;
267 */
268     m_channels = channels;
269     m_stepSize = stepSize;
270     m_blockSize = blockSize;
271
272     reset();
273
274     return true;
275 }
276
277 void
278 LocalCandidatePYIN::reset()
279 {
280     m_pitchProb.clear();
281     m_timestamp.clear();
282 /*
283     std::cerr << "LocalCandidatePYIN::reset"
284           << ", blockSize = " << m_blockSize
285           << std::endl;
286 */
287 }
288
289 LocalCandidatePYIN::FeatureSet
290 LocalCandidatePYIN::process(const float *const *inputBuffers, RealTime timestamp)
291 {
292     int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
293     timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate));
294
295     double *dInputBuffers = new double[m_blockSize];
296     for (size_t i = 0; i < m_blockSize; ++i) dInputBuffers[i] = inputBuffers[0][i];
297
298     size_t yinBufferSize = m_blockSize/2;
299     double* yinBuffer = new double[yinBufferSize];
300     if (!m_preciseTime) YinUtil::fastDifference(dInputBuffers, yinBuffer, yinBufferSize);
301     else YinUtil::slowDifference(dInputBuffers, yinBuffer, yinBufferSize);
302
303     delete [] dInputBuffers;
304
305     YinUtil::cumulativeDifference(yinBuffer, yinBufferSize);
306
307     float minFrequency = 60;
308     float maxFrequency = 900;
309     vector<double> peakProbability = YinUtil::yinProb(yinBuffer,
310                                                       m_threshDistr,
311                                                       yinBufferSize,
312                                                       m_inputSampleRate/maxFrequency,
313                                                       m_inputSampleRate/minFrequency);
314
315     vector<pair<double, double> > tempPitchProb;
316     for (size_t iBuf = 0; iBuf < yinBufferSize; ++iBuf)
317     {
318         if (peakProbability[iBuf] > 0)
319         {
320             double currentF0 =
321                 m_inputSampleRate * (1.0 /
322                 YinUtil::parabolicInterpolation(yinBuffer, iBuf, yinBufferSize));
323             double tempPitch = 12 * std::log(currentF0/440)/std::log(2.) + 69;
324             tempPitchProb.push_back(pair<double, double>(tempPitch, peakProbability[iBuf]));
325         }
326     }
327     m_pitchProb.push_back(tempPitchProb);
328     m_timestamp.push_back(timestamp);
329
330     delete[] yinBuffer;
331
332     return FeatureSet();
333 }
334
335 LocalCandidatePYIN::FeatureSet
336 LocalCandidatePYIN::getRemainingFeatures()
337 {
338     // timestamp -> candidate number -> value
339     map<RealTime, map<int, float> > featureValues;
340
341     // std::cerr << "in remaining features" << std::endl;
342
343     if (m_pitchProb.empty()) {
344         return FeatureSet();
345     }
346
347     // MONO-PITCH STUFF
348     MonoPitch mp;
349     size_t nFrame = m_timestamp.size();
350     vector<vector<float> > pitchTracks;
351     vector<float> freqSum = vector<float>(m_nCandidate);
352     vector<float> freqNumber = vector<float>(m_nCandidate);
353     vector<float> freqMean = vector<float>(m_nCandidate);
354
355     boost::math::normal normalDist(0, 8); // semitones sd
356     float maxNormalDist = boost::math::pdf(normalDist, 0);
357
358     // Viterbi-decode multiple times with different frequencies emphasised
359     for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
360     {
361         pitchTracks.push_back(vector<float>(nFrame));
362         vector<vector<pair<double,double> > > tempPitchProb;
363         float centrePitch = 45 + 3 * iCandidate;
364
365         for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) {
366             tempPitchProb.push_back(vector<pair<double,double> >());
367             float sumProb = 0;
368             float pitch = 0;
369             float prob = 0;
370             for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb)
371             {
372                 pitch = m_pitchProb[iFrame][iProb].first;
373                 prob  = m_pitchProb[iFrame][iProb].second *
374                     boost::math::pdf(normalDist, pitch-centrePitch) /
375                     maxNormalDist * 2;
376                 sumProb += prob;
377                 tempPitchProb[iFrame].push_back(
378                     pair<double,double>(pitch,prob));
379             }
380             for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb)
381             {
382                 tempPitchProb[iFrame][iProb].second /= sumProb;
383             }
384         }
385
386         vector<float> mpOut = mp.process(tempPitchProb);
387         //float prevFreq = 0;
388         for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
389         {
390             if (mpOut[iFrame] > 0) {
391
392                 pitchTracks[iCandidate][iFrame] = mpOut[iFrame];
393                 freqSum[iCandidate] += mpOut[iFrame];
394                 freqNumber[iCandidate]++;
395                 //prevFreq = mpOut[iFrame];
396
397             }
398         }
399         freqMean[iCandidate] = freqSum[iCandidate]*1.0/freqNumber[iCandidate];
400     }
401
402     // find near duplicate pitch tracks
403     vector<size_t> duplicates;
404     for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) {
405         for (size_t jCandidate = iCandidate+1; jCandidate < m_nCandidate; ++jCandidate) {
406             size_t countEqual = 0;
407             for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
408             {
409                 if ((pitchTracks[jCandidate][iFrame] == 0 && pitchTracks[iCandidate][iFrame] == 0) ||
410                 fabs(pitchTracks[iCandidate][iFrame]/pitchTracks[jCandidate][iFrame]-1)<0.01)
411                 countEqual++;
412             }
413             // std::cerr << "proportion equal: " << (countEqual * 1.0 / nFrame) << std::endl;
414             if (countEqual * 1.0 / nFrame > 0.8) {
415                 if (freqNumber[iCandidate] > freqNumber[jCandidate]) {
416                     duplicates.push_back(jCandidate);
417                 } else if (iCandidate < jCandidate) {
418                     duplicates.push_back(iCandidate);
419                 }
420             }
421         }
422     }
423
424     // now find non-duplicate pitch tracks
425     map<int, int> candidateActuals;
426     map<int, std::string> candidateLabels;
427
428     vector<vector<float> > outputFrequencies;
429     for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) outputFrequencies.push_back(vector<float>());
430
431     int actualCandidateNumber = 0;
432     for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
433     {
434         bool isDuplicate = false;
435         for (size_t i = 0; i < duplicates.size(); ++i) {
436
437             if (duplicates[i] == iCandidate) {
438                 isDuplicate = true;
439                 break;
440             }
441         }
442         if (!isDuplicate && freqNumber[iCandidate] > 0.5*nFrame)
443         {
444             std::ostringstream convert;
445             convert << actualCandidateNumber++;
446             candidateLabels[iCandidate] = convert.str();
447             candidateActuals[iCandidate] = actualCandidateNumber;
448             // std::cerr << iCandidate << " " << actualCandidateNumber << " " << freqNumber[iCandidate] << " " << freqMean[iCandidate] << std::endl;
449             for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
450             {
451                 if (pitchTracks[iCandidate][iFrame] > 0)
452                 {
453                     // featureValues[m_timestamp[iFrame]][iCandidate] =
454                     //     pitchTracks[iCandidate][iFrame];
455                     outputFrequencies[iFrame].push_back(pitchTracks[iCandidate][iFrame]);
456                 } else {
457                     outputFrequencies[iFrame].push_back(0);
458                 }
459             }
460         }
461         // fs[m_oPitchTrackCandidates].push_back(f);
462     }
463
464     // adapt our features so as to return a stack of candidate values
465     // per frame
466
467     FeatureSet fs;
468
469     for (size_t iFrame = 0; iFrame < nFrame; ++iFrame){
470         Feature f;
471         f.hasTimestamp = true;
472         f.timestamp = m_timestamp[iFrame];
473         f.values = outputFrequencies[iFrame];
474         fs[0].push_back(f);
475     }
476
477     // I stopped using Chris's map stuff below because I couldn't get my head around it
478     //
479     // for (map<RealTime, map<int, float> >::const_iterator i =
480     //          featureValues.begin(); i != featureValues.end(); ++i) {
481     //     Feature f;
482     //     f.hasTimestamp = true;
483     //     f.timestamp = i->first;
484     //     int nextCandidate = candidateActuals.begin()->second;
485     //     for (map<int, float>::const_iterator j =
486     //              i->second.begin(); j != i->second.end(); ++j) {
487     //         while (candidateActuals[j->first] > nextCandidate) {
488     //             f.values.push_back(0);
489     //             ++nextCandidate;
490     //         }
491     //         f.values.push_back(j->second);
492     //         nextCandidate = j->first + 1;
493     //     }
494     //     //!!! can't use labels?
495     //     fs[0].push_back(f);
496     // }
497
498     return fs;
499 }