1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
4 pYIN - A fundamental frequency estimator for monophonic audio
5 Centre for Digital Music, Queen Mary, University of London.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version. See the file
11 COPYING included with this distribution for more information.
14 #include "LocalCandidatePYIN.h"
15 #include "MonoPitch.h"
18 #include "vamp-sdk/FFT.h"
25 // #include <iostream>
30 #include <boost/math/distributions.hpp>
// Constructs the plugin; the input sample rate is passed on to the Plugin base.
38 LocalCandidatePYIN::LocalCandidatePYIN(float inputSampleRate) :
39 Plugin(inputSampleRate),
// NOTE(review): presumably the FeatureSet index of the "pitchtrackcandidates"
// output -- confirm against the header.
45 m_oPitchTrackCandidates(0),
// Same starting value as the "outputunvoiced" default declared in
// getParameterDescriptors().
47 m_outputUnvoiced(0.0f),
// Destructor.
55 LocalCandidatePYIN::~LocalCandidatePYIN()
// Returns the plugin's identifier string.
60 LocalCandidatePYIN::getIdentifier() const
62 return "localcandidatepyin";
// Returns the plugin's display name.
66 LocalCandidatePYIN::getName() const
68 return "Local Candidate PYIN";
// Returns a one-sentence description of what the plugin does.
72 LocalCandidatePYIN::getDescription() const
74 return "Monophonic pitch and note tracking based on a probabilistic Yin extension.";
// Returns the name of the plugin's maker.
78 LocalCandidatePYIN::getMaker() const
80 return "Matthias Mauch";
// Reports the plugin's version number.
84 LocalCandidatePYIN::getPluginVersion() const
86 // Increment this each time you release a version that behaves
87 // differently from the previous one
// Accessor for the plugin's copyright information.
92 LocalCandidatePYIN::getCopyright() const
// Reports the input domain this plugin expects, as an InputDomain value.
97 LocalCandidatePYIN::InputDomain
98 LocalCandidatePYIN::getInputDomain() const
// Reports the block size the plugin would prefer to be run with.
104 LocalCandidatePYIN::getPreferredBlockSize() const
// Reports the step size the plugin would prefer to be run with.
110 LocalCandidatePYIN::getPreferredStepSize() const
// Reports the minimum number of input channels; initialise() rejects any
// channel count below this.
116 LocalCandidatePYIN::getMinChannelCount() const
// Reports the maximum number of input channels; initialise() rejects any
// channel count above this.
122 LocalCandidatePYIN::getMaxChannelCount() const
// Describes the plugin's three parameters -- "threshdistr", "outputunvoiced"
// and "precisetime" -- reusing a single ParameterDescriptor, d, for each in
// turn. These identifiers are the ones getParameter()/setParameter() accept.
127 LocalCandidatePYIN::ParameterList
128 LocalCandidatePYIN::getParameterDescriptors() const
132 ParameterDescriptor d;
// Parameter 1: choice of Yin threshold distribution. Quantized in steps of 1,
// with one label per choice. The default of 2 selects the third label if
// values start at 0 -- TODO confirm against the (not shown) minValue.
134 d.identifier = "threshdistr";
135 d.name = "Yin threshold distribution";
140 d.defaultValue = 2.0f;
141 d.isQuantized = true;
142 d.quantizeStep = 1.0f;
143 d.valueNames.push_back("Uniform");
144 d.valueNames.push_back("Beta (mean 0.10)");
145 d.valueNames.push_back("Beta (mean 0.15)");
146 d.valueNames.push_back("Beta (mean 0.20)");
147 d.valueNames.push_back("Beta (mean 0.30)");
148 d.valueNames.push_back("Single Value 0.10");
149 d.valueNames.push_back("Single Value 0.15");
150 d.valueNames.push_back("Single Value 0.20");
// Parameter 2: whether estimates classified as unvoiced are output.
// valueNames is cleared first because d still holds the labels of the
// previous parameter.
153 d.identifier = "outputunvoiced";
154 d.valueNames.clear();
155 d.name = "Output estimates classified as unvoiced?";
160 d.defaultValue = 0.0f;
161 d.isQuantized = true;
162 d.quantizeStep = 1.0f;
163 d.valueNames.push_back("No");
164 d.valueNames.push_back("Yes");
165 d.valueNames.push_back("Yes, as negative frequencies");
// Parameter 3: switch for the slower, precise YIN timing mode; process()
// picks the slow difference function and a different timestamp offset when
// it is set. Labels from parameter 2 are cleared again.
168 d.identifier = "precisetime";
169 d.valueNames.clear();
170 d.name = "Use non-standard precise YIN timing (slow).";
175 d.defaultValue = 0.0f;
176 d.isQuantized = true;
177 d.quantizeStep = 1.0f;
// Looks up the current value of a parameter by identifier. The identifiers
// handled are the three declared in getParameterDescriptors(); each maps
// directly onto one member variable.
184 LocalCandidatePYIN::getParameter(string identifier) const
186 if (identifier == "threshdistr") {
187 return m_threshDistr;
189 if (identifier == "outputunvoiced") {
190 return m_outputUnvoiced;
192 if (identifier == "precisetime") {
193 return m_preciseTime;
// Stores a new value for the parameter named by identifier. The value is
// assigned to the matching member as given; no range check or rounding is
// applied in the assignments below.
199 LocalCandidatePYIN::setParameter(string identifier, float value)
201 if (identifier == "threshdistr")
203 m_threshDistr = value;
205 if (identifier == "outputunvoiced")
207 m_outputUnvoiced = value;
209 if (identifier == "precisetime")
211 m_preciseTime = value;
// Returns the plugin's programs as a ProgramList. getCurrentProgram() notes
// that this plugin has no programs.
215 LocalCandidatePYIN::ProgramList
216 LocalCandidatePYIN::getPrograms() const
// Returns the name of the current program: always the empty string, since
// the plugin defines no programs.
223 LocalCandidatePYIN::getCurrentProgram() const
225 return ""; // no programs
// Program selection entry point, taking the program name by value.
229 LocalCandidatePYIN::selectProgram(string name)
// Describes the plugin's output "pitchtrackcandidates": a variable-width
// output (no fixed bin count) at a fixed sample rate, carrying multiple
// candidate pitch tracks.
233 LocalCandidatePYIN::OutputList
234 LocalCandidatePYIN::getOutputDescriptors() const
240 d.identifier = "pitchtrackcandidates";
241 d.name = "Pitch track candidates";
242 d.description = "Multiple candidate pitch tracks.";
// The number of values per feature varies from frame to frame.
244 d.hasFixedBinCount = false;
245 d.hasKnownExtents = true;
// NOTE(review): the original author flagged this upper extent as uncertain.
// process() searches for pitches up to 900 Hz, so 500 may be too low if the
// emitted values are frequencies in Hz -- confirm.
247 d.maxValue = 500; //!!!???
248 d.isQuantized = false;
249 d.sampleType = OutputDescriptor::FixedSampleRate;
// One output sample per processing step.
// NOTE(review): depends on m_stepSize, which is assigned in initialise();
// confirm what it holds if this is called before initialise().
250 d.sampleRate = (m_inputSampleRate / m_stepSize);
251 d.hasDuration = false;
252 outputs.push_back(d);
// Prepares the plugin for processing with the given channel count, step size
// and block size. Returns false if the channel count lies outside the range
// [getMinChannelCount(), getMaxChannelCount()]; otherwise the three values
// are stored in the corresponding members.
258 LocalCandidatePYIN::initialise(size_t channels, size_t stepSize, size_t blockSize)
260 if (channels < getMinChannelCount() ||
261 channels > getMaxChannelCount()) return false;
// Diagnostic message on stderr listing the requested configuration.
264 std::cerr << "LocalCandidatePYIN::initialise: channels = " << channels
265 << ", stepSize = " << stepSize << ", blockSize = " << blockSize
// Remember the configuration for process() and getOutputDescriptors().
268 m_channels = channels;
269 m_stepSize = stepSize;
270 m_blockSize = blockSize;
// Resets the plugin. Emits a diagnostic line on stderr that includes the
// current block size.
278 LocalCandidatePYIN::reset()
283 std::cerr << "LocalCandidatePYIN::reset"
284 << ", blockSize = " << m_blockSize
// Analyses one block of audio. Computes the Yin difference function on the
// first input channel, converts the lags that receive a positive probability
// into (pitch, probability) pairs, and appends them -- together with the
// adjusted frame timestamp -- to m_pitchProb / m_timestamp, which
// getRemainingFeatures() consumes later.
//
// inputBuffers: per-channel sample arrays; only inputBuffers[0] is read, and
//               m_blockSize samples are read from it.
// timestamp:    time of the block, shifted below before being stored.
289 LocalCandidatePYIN::FeatureSet
290 LocalCandidatePYIN::process(const float *const *inputBuffers, RealTime timestamp)
// Shift the timestamp by half a block in precise-time mode, otherwise by a
// quarter block.
// NOTE(review): this tests m_preciseTime == 1.0 whereas the difference
// function choice below tests !m_preciseTime; the two disagree for values
// other than 0 and 1 -- confirm that is acceptable.
292 int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
293 timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate));
// Copy the first channel into a double-precision working buffer.
295 double *dInputBuffers = new double[m_blockSize];
296 for (size_t i = 0; i < m_blockSize; ++i) dInputBuffers[i] = inputBuffers[0][i];
// The difference function covers lags up to half the block size; the fast
// variant is used unless precise timing was requested.
298 size_t yinBufferSize = m_blockSize/2;
299 double* yinBuffer = new double[yinBufferSize];
300 if (!m_preciseTime) YinUtil::fastDifference(dInputBuffers, yinBuffer, yinBufferSize);
301 else YinUtil::slowDifference(dInputBuffers, yinBuffer, yinBufferSize);
// The input copy is no longer needed once the difference function exists.
303 delete [] dInputBuffers;
305 YinUtil::cumulativeDifference(yinBuffer, yinBufferSize);
// Pitch search range in Hz; passed to yinProb as periods in samples
// (sample rate / frequency), so the maximum frequency gives the shortest lag.
307 float minFrequency = 60;
308 float maxFrequency = 900;
309 vector<double> peakProbability = YinUtil::yinProb(yinBuffer,
312 m_inputSampleRate/maxFrequency,
313 m_inputSampleRate/minFrequency);
// Keep every lag with a positive probability. The lag is refined by
// parabolic interpolation, converted to a frequency, and then mapped to a
// semitone scale on which 440 Hz corresponds to 69.
315 vector<pair<double, double> > tempPitchProb;
316 for (size_t iBuf = 0; iBuf < yinBufferSize; ++iBuf)
318 if (peakProbability[iBuf] > 0)
321 m_inputSampleRate * (1.0 /
322 YinUtil::parabolicInterpolation(yinBuffer, iBuf, yinBufferSize));
323 double tempPitch = 12 * std::log(currentF0/440)/std::log(2.) + 69;
324 tempPitchProb.push_back(pair<double, double>(tempPitch, peakProbability[iBuf]));
// Store this frame's candidates and timestamp for getRemainingFeatures().
// NOTE(review): yinBuffer is allocated with new[] above; make sure a matching
// delete[] runs before this function returns.
327 m_pitchProb.push_back(tempPitchProb);
328 m_timestamp.push_back(timestamp);
// Produces the output features once all blocks have been processed.
//
// For each of m_nCandidate centre pitches, the per-frame pitch probabilities
// collected by process() are re-weighted with a normal distribution around
// that centre and decoded into one pitch track via mp.process(). Tracks that
// nearly duplicate another track, or that are voiced in no more than half of
// the frames, are dropped. The remaining tracks are stacked into one feature
// per frame, timestamped with the stored frame time.
335 LocalCandidatePYIN::FeatureSet
336 LocalCandidatePYIN::getRemainingFeatures()
338 // timestamp -> candidate number -> value
// NOTE(review): featureValues is only referenced from the commented-out code
// at the end of this function -- confirm whether it is still needed.
339 map<RealTime, map<int, float> > featureValues;
341 // std::cerr << "in remaining features" << std::endl;
// Special case: process() stored no frames.
343 if (m_pitchProb.empty()) {
// Per-candidate bookkeeping: the decoded track, plus the sum, count and mean
// of its voiced (positive) values.
349 size_t nFrame = m_timestamp.size();
350 vector<vector<float> > pitchTracks;
351 vector<float> freqSum = vector<float>(m_nCandidate);
352 vector<float> freqNumber = vector<float>(m_nCandidate);
353 vector<float> freqMean = vector<float>(m_nCandidate);
// Weighting distribution over the pitch distance from a candidate's centre;
// maxNormalDist is its value at distance zero.
355 boost::math::normal normalDist(0, 8); // semitones sd
356 float maxNormalDist = boost::math::pdf(normalDist, 0);
358 // Viterbi-decode multiple times with different frequencies emphasised
359 for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
361 pitchTracks.push_back(vector<float>(nFrame));
362 vector<vector<pair<double,double> > > tempPitchProb;
// Centre pitches start at 45 and rise in steps of 3 on the pitch scale used
// by process().
363 float centrePitch = 45 + 3 * iCandidate;
// Build a re-weighted copy of every frame's candidate list.
365 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) {
366 tempPitchProb.push_back(vector<pair<double,double> >());
// Scale each stored probability by the normal density of its distance from
// the centre pitch.
370 for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb)
372 pitch = m_pitchProb[iFrame][iProb].first;
373 prob = m_pitchProb[iFrame][iProb].second *
374 boost::math::pdf(normalDist, pitch-centrePitch) /
377 tempPitchProb[iFrame].push_back(
378 pair<double,double>(pitch,prob));
// Renormalise the frame's weighted probabilities by sumProb.
380 for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb)
382 tempPitchProb[iFrame][iProb].second /= sumProb;
// Decode one track from the re-weighted probabilities.
386 vector<float> mpOut = mp.process(tempPitchProb);
387 //float prevFreq = 0;
// Only positive outputs are copied into the track and counted; all other
// frames keep the initial value 0.
388 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
390 if (mpOut[iFrame] > 0) {
392 pitchTracks[iCandidate][iFrame] = mpOut[iFrame];
393 freqSum[iCandidate] += mpOut[iFrame];
394 freqNumber[iCandidate]++;
395 //prevFreq = mpOut[iFrame];
// NOTE(review): freqNumber[iCandidate] is 0 when no frame was positive, which
// makes this a division by zero; freqMean is only read by the commented-out
// debug output further down.
399 freqMean[iCandidate] = freqSum[iCandidate]*1.0/freqNumber[iCandidate];
402 // find near duplicate pitch tracks
403 vector<size_t> duplicates;
// Compare every pair of candidates (iCandidate < jCandidate).
404 for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) {
405 for (size_t jCandidate = iCandidate+1; jCandidate < m_nCandidate; ++jCandidate) {
406 size_t countEqual = 0;
407 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
// The equality test for one frame: both tracks are 0, or their ratio is
// within 1% of unity.
409 if ((pitchTracks[jCandidate][iFrame] == 0 && pitchTracks[iCandidate][iFrame] == 0) ||
410 fabs(pitchTracks[iCandidate][iFrame]/pitchTracks[jCandidate][iFrame]-1)<0.01)
413 // std::cerr << "proportion equal: " << (countEqual * 1.0 / nFrame) << std::endl;
// More than 80% equal frames marks the pair as duplicates; the track with
// fewer positive frames is the one recorded for removal.
// NOTE(review): iCandidate < jCandidate always holds here because jCandidate
// starts at iCandidate+1, so the else-if acts as a plain else.
414 if (countEqual * 1.0 / nFrame > 0.8) {
415 if (freqNumber[iCandidate] > freqNumber[jCandidate]) {
416 duplicates.push_back(jCandidate);
417 } else if (iCandidate < jCandidate) {
418 duplicates.push_back(iCandidate);
424 // now find non-duplicate pitch tracks
425 map<int, int> candidateActuals;
426 map<int, std::string> candidateLabels;
// One (initially empty) value list per frame; each kept candidate appends
// one entry to every frame's list.
428 vector<vector<float> > outputFrequencies;
429 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) outputFrequencies.push_back(vector<float>());
431 int actualCandidateNumber = 0;
432 for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
// Linear search of the duplicates list for this candidate.
434 bool isDuplicate = false;
435 for (size_t i = 0; i < duplicates.size(); ++i) {
437 if (duplicates[i] == iCandidate) {
// Keep the candidate only if it is not a duplicate and is positive in more
// than half of the frames.
442 if (!isDuplicate && freqNumber[iCandidate] > 0.5*nFrame)
// NOTE(review): the label receives the number before the increment, while
// candidateActuals receives it after, so the two differ by one -- confirm
// this is intended.
444 std::ostringstream convert;
445 convert << actualCandidateNumber++;
446 candidateLabels[iCandidate] = convert.str();
447 candidateActuals[iCandidate] = actualCandidateNumber;
448 // std::cerr << iCandidate << " " << actualCandidateNumber << " " << freqNumber[iCandidate] << " " << freqMean[iCandidate] << std::endl;
// Append this candidate's value for every frame: the track value where it is
// positive, 0 where it is not.
449 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
451 if (pitchTracks[iCandidate][iFrame] > 0)
453 // featureValues[m_timestamp[iFrame]][iCandidate] =
454 // pitchTracks[iCandidate][iFrame];
455 outputFrequencies[iFrame].push_back(pitchTracks[iCandidate][iFrame]);
457 outputFrequencies[iFrame].push_back(0);
461 // fs[m_oPitchTrackCandidates].push_back(f);
464 // adapt our features so as to return a stack of candidate values
// One feature per frame, carrying that frame's timestamp and the stacked
// values of all kept candidates.
469 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame){
471 f.hasTimestamp = true;
472 f.timestamp = m_timestamp[iFrame];
473 f.values = outputFrequencies[iFrame];
// The remainder is an earlier, map-based way of assembling the features,
// retained as commented-out code.
477 // I stopped using Chris's map stuff below because I couldn't get my head around it
479 // for (map<RealTime, map<int, float> >::const_iterator i =
480 // featureValues.begin(); i != featureValues.end(); ++i) {
482 // f.hasTimestamp = true;
483 // f.timestamp = i->first;
484 // int nextCandidate = candidateActuals.begin()->second;
485 // for (map<int, float>::const_iterator j =
486 // i->second.begin(); j != i->second.end(); ++j) {
487 // while (candidateActuals[j->first] > nextCandidate) {
488 // f.values.push_back(0);
491 // f.values.push_back(j->second);
492 // nextCandidate = j->first + 1;
494 // //!!! can't use labels?
495 // fs[0].push_back(f);