1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
4 pYIN - A fundamental frequency estimator for monophonic audio
5 Centre for Digital Music, Queen Mary, University of London.
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version. See the file
11 COPYING included with this distribution for more information.
16 #include "MonoPitch.h"
18 #include "vamp-sdk/FFT.h"
// Constructor: hands the input sample rate to Plugin and to the m_yin member,
// and gives the output indices and parameter members their starting values.
32 PYinVamp::PYinVamp(float inputSampleRate) :
33 Plugin(inputSampleRate),
// NOTE(review): 2048 is presumably the initial frame size and 0.0 a threshold
// argument -- confirm against the Yin constructor.
39 m_yin(2048, inputSampleRate, 0.0),
// Output indices start at 0; getOutputDescriptors() assigns the real positions.
43 m_oCandidateSalience(0),
44 m_oSmoothedPitchTrack(0),
// Same values as the defaultValue entries that getParameterDescriptors()
// advertises for "outputunvoiced" and "onsetsensitivity".
47 m_outputUnvoiced(0.0f),
50 m_onsetSensitivity(0.7f),
63 PYinVamp::getIdentifier() const
69 PYinVamp::getName() const
// Returns the one-sentence, human-readable description of the plugin.
75 PYinVamp::getDescription() const
77 return "Monophonic pitch and note tracking based on a probabilistic Yin extension.";
// Returns the maker string reported for this plugin.
81 PYinVamp::getMaker() const
83 return "Matthias Mauch";
// Returns the plugin's version number.
87 PYinVamp::getPluginVersion() const
89 // Increment this each time you release a version that behaves
90 // differently from the previous one
95 PYinVamp::getCopyright() const
100 PYinVamp::InputDomain
101 PYinVamp::getInputDomain() const
107 PYinVamp::getPreferredBlockSize() const
113 PYinVamp::getPreferredStepSize() const
119 PYinVamp::getMinChannelCount() const
125 PYinVamp::getMaxChannelCount() const
// Builds the list of user-adjustable parameters. One ParameterDescriptor
// object is reused for every entry: each section overwrites its fields and
// clears valueNames before describing the next parameter.
130 PYinVamp::ParameterList
131 PYinVamp::getParameterDescriptors() const
135 ParameterDescriptor d;
// --- "threshdistr": which Yin threshold distribution to use (integer steps).
137 d.identifier = "threshdistr";
138 d.name = "Yin threshold distribution";
// NOTE(review): if the value names are indexed from a minimum of 0, the
// default of 2 selects "Beta (mean 0.15)" -- confirm against minValue.
143 d.defaultValue = 2.0f;
144 d.isQuantized = true;
145 d.quantizeStep = 1.0f;
146 d.valueNames.push_back("Uniform");
147 d.valueNames.push_back("Beta (mean 0.10)");
148 d.valueNames.push_back("Beta (mean 0.15)");
149 d.valueNames.push_back("Beta (mean 0.20)");
150 d.valueNames.push_back("Beta (mean 0.30)");
151 d.valueNames.push_back("Single Value 0.10");
152 d.valueNames.push_back("Single Value 0.15");
153 d.valueNames.push_back("Single Value 0.20");
// --- "outputunvoiced": three-way choice. getRemainingFeatures() skips
// negative pitch values for 0 and emits their absolute value for 1.
156 d.identifier = "outputunvoiced";
157 d.valueNames.clear();
158 d.name = "Output estimates classified as unvoiced?";
163 d.defaultValue = 0.0f;
164 d.isQuantized = true;
165 d.quantizeStep = 1.0f;
166 d.valueNames.push_back("No");
167 d.valueNames.push_back("Yes");
168 d.valueNames.push_back("Yes, as negative frequencies");
// --- "precisetime": quantized in steps of 1 with no value names; the stored
// value is tested elsewhere in this file with `== 1.0` and with `!`.
171 d.identifier = "precisetime";
172 d.valueNames.clear();
173 d.name = "Use non-standard precise YIN timing (slow).";
178 d.defaultValue = 0.0f;
179 d.isQuantized = true;
180 d.quantizeStep = 1.0f;
// --- "lowampsuppression": continuous value, default 0.1.
183 d.identifier = "lowampsuppression";
184 d.valueNames.clear();
185 d.name = "Suppress low amplitude pitch estimates.";
190 d.defaultValue = 0.1f;
191 d.isQuantized = false;
// --- "onsetsensitivity": continuous value, default 0.7.
194 d.identifier = "onsetsensitivity";
195 d.valueNames.clear();
196 d.name = "Onset sensitivity";
197 d.description = "Adds additional note onsets when RMS increases.";
201 d.defaultValue = 0.7f;
202 d.isQuantized = false;
// --- "prunethresh": continuous value, default 0.1. getRemainingFeatures()
// multiplies it by the sample rate and divides by the step size to get a
// frame count, i.e. it is treated as a duration in seconds.
205 d.identifier = "prunethresh";
206 d.valueNames.clear();
207 d.name = "Duration pruning threshold.";
208 d.description = "Prune notes that are shorter than this value.";
212 d.defaultValue = 0.1f;
213 d.isQuantized = false;
// Returns the current value of the parameter named by `identifier`. The
// identifiers are the ones declared in getParameterDescriptors(); each one is
// read back from the member that stores it.
220 PYinVamp::getParameter(string identifier) const
222 if (identifier == "threshdistr") {
223 return m_threshDistr;
225 if (identifier == "outputunvoiced") {
226 return m_outputUnvoiced;
228 if (identifier == "precisetime") {
229 return m_preciseTime;
// Low-amplitude suppression threshold.
231 if (identifier == "lowampsuppression") {
234 if (identifier == "onsetsensitivity") {
235 return m_onsetSensitivity;
237 if (identifier == "prunethresh") {
238 return m_pruneThresh;
// Stores `value` in the member that backs the parameter named by `identifier`.
// NOTE(review): the value is assigned as given -- no range check or rounding
// to the quantize step is visible here, even for the quantized parameters.
244 PYinVamp::setParameter(string identifier, float value)
246 if (identifier == "threshdistr")
248 m_threshDistr = value;
250 if (identifier == "outputunvoiced")
252 m_outputUnvoiced = value;
254 if (identifier == "precisetime")
256 m_preciseTime = value;
// Low-amplitude suppression threshold.
258 if (identifier == "lowampsuppression")
262 if (identifier == "onsetsensitivity")
264 m_onsetSensitivity = value;
266 if (identifier == "prunethresh")
268 m_pruneThresh = value;
272 PYinVamp::ProgramList
273 PYinVamp::getPrograms() const
// No programs are offered, so the current program name is always empty.
280 PYinVamp::getCurrentProgram() const
282 return ""; // no programs
286 PYinVamp::selectProgram(string name)
// Describes the six outputs (f0candidates, f0probs, voicedprob,
// candidatesalience, smoothedpitchtrack, notes) and records each one's
// position in the list in the matching m_o* member. process() and
// getRemainingFeatures() use those members to index their FeatureSet.
// NOTE(review): the m_o* members are assigned inside a const method, so they
// are presumably declared mutable; they only hold the right positions once
// this method has been called.
// NOTE(review): every sampleRate below divides by m_stepSize -- confirm it can
// never be 0 at this point.
291 PYinVamp::getOutputDescriptors() const
// Running position of the next descriptor appended to `outputs`.
297 int outputNumber = 0;
// --- Output: F0 candidates, variable number of values per frame, one frame
// per processing step.
299 d.identifier = "f0candidates";
300 d.name = "F0 Candidates";
301 d.description = "Estimated fundamental frequency candidates.";
303 d.hasFixedBinCount = false;
305 d.hasKnownExtents = true;
308 d.isQuantized = false;
309 d.sampleType = OutputDescriptor::FixedSampleRate;
310 d.sampleRate = (m_inputSampleRate / m_stepSize);
311 d.hasDuration = false;
312 outputs.push_back(d);
313 m_oF0Candidates = outputNumber++;
// --- Output: probability of each F0 candidate, variable number of values
// per frame.
315 d.identifier = "f0probs";
316 d.name = "Candidate Probabilities";
317 d.description = "Probabilities of estimated fundamental frequency candidates.";
319 d.hasFixedBinCount = false;
321 d.hasKnownExtents = true;
324 d.isQuantized = false;
325 d.sampleType = OutputDescriptor::FixedSampleRate;
326 d.sampleRate = (m_inputSampleRate / m_stepSize);
327 d.hasDuration = false;
328 outputs.push_back(d);
329 m_oF0Probs = outputNumber++;
// --- Output: voiced probability, fixed bin count. process() fills it with
// the sum of the candidate probabilities.
331 d.identifier = "voicedprob";
332 d.name = "Voiced Probability";
333 d.description = "Probability that the signal is voiced according to Probabilistic Yin.";
335 d.hasFixedBinCount = true;
337 d.hasKnownExtents = true;
340 d.isQuantized = false;
341 d.sampleType = OutputDescriptor::FixedSampleRate;
342 d.sampleRate = (m_inputSampleRate / m_stepSize);
343 d.hasDuration = false;
344 outputs.push_back(d);
345 m_oVoicedProb = outputNumber++;
// --- Output: candidate salience, half a block's worth of bins per frame.
347 d.identifier = "candidatesalience";
348 d.name = "Candidate Salience";
349 d.description = "Candidate Salience";
350 d.hasFixedBinCount = true;
351 d.binCount = m_blockSize / 2;
352 d.hasKnownExtents = true;
355 d.isQuantized = false;
356 d.sampleType = OutputDescriptor::FixedSampleRate;
357 d.sampleRate = (m_inputSampleRate / m_stepSize);
358 d.hasDuration = false;
359 outputs.push_back(d);
360 m_oCandidateSalience = outputNumber++;
// --- Output: smoothed pitch track, filled in by getRemainingFeatures().
362 d.identifier = "smoothedpitchtrack";
363 d.name = "Smoothed Pitch Track";
366 d.hasFixedBinCount = true;
368 d.hasKnownExtents = false;
371 d.isQuantized = false;
372 d.sampleType = OutputDescriptor::FixedSampleRate;
373 d.sampleRate = (m_inputSampleRate / m_stepSize);
374 d.hasDuration = false;
375 outputs.push_back(d);
376 m_oSmoothedPitchTrack = outputNumber++;
// --- Output: notes. The only output with VariableSampleRate and a duration;
// getRemainingFeatures() emits one feature per note, carrying a frequency.
378 d.identifier = "notes";
380 d.description = "Derived fixed-pitch note frequencies";
381 // d.unit = "MIDI unit";
383 d.hasFixedBinCount = true;
385 d.hasKnownExtents = false;
386 d.isQuantized = false;
387 d.sampleType = OutputDescriptor::VariableSampleRate;
388 d.sampleRate = (m_inputSampleRate / m_stepSize);
389 d.hasDuration = true;
390 outputs.push_back(d);
391 m_oNotes = outputNumber++;
// Checks the channel count against the supported range, writes the requested
// configuration to std::cerr and stores it in the corresponding members.
// Returns false when the channel count is out of range.
397 PYinVamp::initialise(size_t channels, size_t stepSize, size_t blockSize)
// Reject channel counts outside [getMinChannelCount(), getMaxChannelCount()].
399 if (channels < getMinChannelCount() ||
400 channels > getMaxChannelCount()) return false;
403 std::cerr << "PYinVamp::initialise: channels = " << channels
404 << ", stepSize = " << stepSize << ", blockSize = " << blockSize
// NOTE(review): stepSize and blockSize are stored without any visible check;
// getOutputDescriptors() divides by m_stepSize and process() allocates and
// reads m_blockSize samples.
407 m_channels = channels;
408 m_stepSize = stepSize;
409 m_blockSize = blockSize;
// Push the current parameter values into the Yin engine: threshold
// distribution, frame size (the block size), and fast mode, which is the
// negation of the precise-time setting.
419 m_yin.setThresholdDistr(m_threshDistr);
420 m_yin.setFrameSize(m_blockSize);
421 m_yin.setFast(!m_preciseTime);
427 std::cerr << "PYinVamp::reset"
428 << ", blockSize = " << m_blockSize
// Processes one block of audio. Runs probabilistic Yin on channel 0, stores
// the frame's pitch candidates, timestamp and level for
// getRemainingFeatures(), and emits the per-frame outputs: F0 candidates,
// their probabilities, the voiced probability and the candidate salience.
434 PYinVamp::process(const float *const *inputBuffers, RealTime timestamp)
// Shift the timestamp forward by half a block in precise-time mode, otherwise
// by a quarter block.
// NOTE(review): `m_preciseTime == 1.0` is an exact floating-point comparison;
// any other non-zero value takes the quarter-block branch, although
// `!m_preciseTime` elsewhere treats every non-zero value as "precise".
436 int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
437 timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate));
// Copy channel 0 into a double buffer for Yin while accumulating the sum of
// squared samples in `rms`.
// NOTE(review): the buffer is a raw new[]/delete[] pair. If
// processProbabilisticYin() can throw, the buffer leaks; a
// std::vector<double> would remove the manual delete.
443 double *dInputBuffers = new double[m_blockSize];
444 for (size_t i = 0; i < m_blockSize; ++i) {
445 dInputBuffers[i] = inputBuffers[0][i];
446 rms += inputBuffers[0][i] * inputBuffers[0][i];
// Frames whose level is below the low-amplitude threshold are flagged.
451 bool isLowAmplitude = (rms < m_lowAmp);
453 Yin::YinOutput yo = m_yin.processProbabilisticYin(dInputBuffers);
454 delete [] dInputBuffers;
// Keep Yin's level for this frame; getRemainingFeatures() compares levels of
// nearby frames when splitting notes.
456 m_level.push_back(yo.rms);
458 // First, get the things out of the way that we don't want to output
459 // immediately, but instead save for later.
460 vector<pair<double, double> > tempPitchProb;
461 for (size_t iCandidate = 0; iCandidate < yo.freqProb.size(); ++iCandidate)
// Convert the candidate frequency to a MIDI-style pitch number:
// 69 + 12 * log2(f / 440).
// NOTE(review): a candidate frequency <= 0 would give -inf or NaN here --
// confirm Yin never returns one.
463 double tempPitch = 12 * std::log(yo.freqProb[iCandidate].first/440)/std::log(2.) + 69;
// The candidate is stored either with its probability unchanged or with the
// probability multiplied by `factor`. `factor` grows linearly with rms and
// equals 1 when rms == m_lowAmp.
// NOTE(review): the choice between the two presumably depends on
// isLowAmplitude -- confirm.
466 tempPitchProb.push_back(pair<double, double>
467 (tempPitch, yo.freqProb[iCandidate].second));
469 float factor = ((rms+0.01*m_lowAmp)/(1.01*m_lowAmp));
470 tempPitchProb.push_back(pair<double, double>
471 (tempPitch, yo.freqProb[iCandidate].second*factor));
// Saved per frame; both vectors are read back by getRemainingFeatures().
474 m_pitchProb.push_back(tempPitchProb);
475 m_timestamp.push_back(timestamp);
// Output: the frequency of every candidate for this frame.
479 f.hasTimestamp = true;
480 f.timestamp = timestamp;
481 for (size_t i = 0; i < yo.freqProb.size(); ++i)
483 f.values.push_back(yo.freqProb[i].first);
485 fs[m_oF0Candidates].push_back(f);
// Output: the probability of every candidate. Their sum is emitted as the
// voiced probability.
489 float voicedProb = 0;
490 for (size_t i = 0; i < yo.freqProb.size(); ++i)
492 f.values.push_back(yo.freqProb[i].second);
493 voicedProb += yo.freqProb[i].second;
495 fs[m_oF0Probs].push_back(f);
497 f.values.push_back(voicedProb);
498 fs[m_oVoicedProb].push_back(f);
500 // SALIENCE -- maybe this should eventually disappear
// NOTE(review): salienceSum is accumulated but no use of it is visible in
// this function.
502 float salienceSum = 0;
503 for (size_t iBin = 0; iBin < yo.salience.size(); ++iBin)
505 f.values.push_back(yo.salience[iBin]);
506 salienceSum += yo.salience[iBin];
508 fs[m_oCandidateSalience].push_back(f);
// Called once all audio has been processed. Runs the pitch tracker over the
// candidates stored by process() to produce the smoothed pitch track, then
// segments that track into notes and emits one feature per note.
514 PYinVamp::getRemainingFeatures()
518 f.hasTimestamp = true;
519 f.hasDuration = false;
// Nothing was stored by process().
521 if (m_pitchProb.empty()) {
// Track over all frames at once. mpOut is indexed by frame, in step with
// m_timestamp.
527 vector<float> mpOut = mp.process(m_pitchProb);
528 for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame)
// Negative values are dropped when "outputunvoiced" is 0. For the setting 1
// the absolute value is pushed; the raw value, which may be negative, is the
// alternative.
530 if (mpOut[iFrame] < 0 && (m_outputUnvoiced==0)) continue;
531 f.timestamp = m_timestamp[iFrame];
533 if (m_outputUnvoiced == 1)
535 f.values.push_back(fabs(mpOut[iFrame]));
537 f.values.push_back(mpOut[iFrame]);
540 fs[m_oSmoothedPitchTrack].push_back(f);
544 // std::cerr << "Mono Note Stuff" << std::endl;
// Build the note tracker's input. Each frame with a positive pitch gets a
// single (MIDI-style pitch, 0.9) pair; every other frame gets an empty list.
546 std::vector<std::vector<std::pair<double, double> > > smoothedPitch;
547 for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) {
548 std::vector<std::pair<double, double> > temp;
549 if (mpOut[iFrame] > 0)
551 double tempPitch = 12 * std::log(mpOut[iFrame]/440)/std::log(2.) + 69;
552 temp.push_back(std::pair<double,double>(tempPitch, .9));
554 smoothedPitch.push_back(temp);
556 // vector<MonoNote::FrameOutput> mnOut = mn.process(m_pitchProb);
557 vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch);
559 // turning feature into a note feature
560 f.hasTimestamp = true;
561 f.hasDuration = true;
566 bool oldIsVoiced = 0;
567 size_t nFrame = m_pitchProb.size();
// Minimum note length in frames: the pruning threshold, treated as seconds,
// times the sample rate, divided by the step size.
569 float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize;
571 std::vector<float> notePitchTrack; // collects pitches for one note at a time
572 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
// A frame belongs to a note when the note state is below 3, a pitch exists,
// and either it is one of the last two frames or the level has not risen too
// sharply: level[i] / level[i+2] must exceed the onset sensitivity.
// NOTE(review): nFrame is size_t, so `nFrame-2` wraps around when nFrame < 2.
// The `iFrame >= nFrame-2` guard is then false and m_level[iFrame+2] is read
// past the end. Comparing `iFrame + 2 >= nFrame` would avoid the wrap.
// NOTE(review): assumes mnOut, smoothedPitch and m_level each hold at least
// nFrame entries, and that m_level[iFrame+2] is non-zero -- confirm.
574 isVoiced = mnOut[iFrame].noteState < 3
575 && smoothedPitch[iFrame].size() > 0
576 && (iFrame >= nFrame-2
577 || ((m_level[iFrame]/m_level[iFrame+2]) > m_onsetSensitivity));
578 // std::cerr << m_level[iFrame]/m_level[iFrame-1] << " " << isVoiced << std::endl;
// The last frame always takes the "not voiced" path, so a note that is still
// open at the end of the input gets closed.
579 if (isVoiced && iFrame != nFrame-1)
581 if (oldIsVoiced == 0) // beginning of a note
585 float pitch = smoothedPitch[iFrame][0].first;
586 notePitchTrack.push_back(pitch); // add to the note's pitch track
587 } else { // not currently voiced
588 if (oldIsVoiced == 1) // end of note
590 // std::cerr << notePitchTrack.size() << " " << minNoteFrames << std::endl;
// Only notes with at least minNoteFrames frames are emitted.
591 if (notePitchTrack.size() >= minNoteFrames)
// The note's pitch is the median of its frame pitches: the element at index
// size()/2 after sorting. It is converted from MIDI-style pitch back to a
// frequency.
593 std::sort(notePitchTrack.begin(), notePitchTrack.end());
594 float medianPitch = notePitchTrack[notePitchTrack.size()/2];
595 float medianFreq = std::pow(2,(medianPitch - 69) / 12) * 440;
// The note runs from the onset frame's timestamp to the current frame's.
597 f.values.push_back(medianFreq);
598 f.timestamp = m_timestamp[onsetFrame];
599 f.duration = m_timestamp[iFrame] - m_timestamp[onsetFrame];
600 fs[m_oNotes].push_back(f);
602 notePitchTrack.clear();
605 oldIsVoiced = isVoiced;