libs/vamp-pyin/PYinVamp.cpp

   1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
   2
   3 /*
   4     pYIN - A fundamental frequency estimator for monophonic audio
   5     Centre for Digital Music, Queen Mary, University of London.
   6
   7     This program is free software; you can redistribute it and/or
   8     modify it under the terms of the GNU General Public License as
   9     published by the Free Software Foundation; either version 2 of the
  10     License, or (at your option) any later version.  See the file
  11     COPYING included with this distribution for more information.
  12 */
  13
  14 #include "PYinVamp.h"
  15 #include "MonoNote.h"
  16 #include "MonoPitch.h"
  17
  18 #include "vamp-sdk/FFT.h"
  19
  20 #include <vector>
  21 #include <algorithm>
  22
  23 #include <cstdio>
  24 #include <cmath>
  25 #include <complex>
  26
  27 using std::string;
  28 using std::vector;
  29 using Vamp::RealTime;
  30
  31
  32 PYinVamp::PYinVamp(float inputSampleRate) :
  33     Plugin(inputSampleRate),
  34     m_channels(0),
  35     m_stepSize(256),
  36     m_blockSize(2048),
  37     m_fmin(40),
  38     m_fmax(1600),
  39     m_yin(2048, inputSampleRate, 0.0),
  40     m_oF0Candidates(0),
  41     m_oF0Probs(0),
  42     m_oVoicedProb(0),
  43     m_oCandidateSalience(0),
  44     m_oSmoothedPitchTrack(0),
  45     m_oNotes(0),
  46     m_threshDistr(2.0f),
  47     m_outputUnvoiced(0.0f),
  48     m_preciseTime(0.0f),
  49     m_lowAmp(0.1f),
  50     m_onsetSensitivity(0.7f),
  51     m_pruneThresh(0.1f),
  52     m_pitchProb(0),
  53     m_timestamp(0),
  54     m_level(0)
  55 {
  56 }
  57
  58 PYinVamp::~PYinVamp()
  59 {
  60 }
  61
  62 string
  63 PYinVamp::getIdentifier() const
  64 {
  65     return "pyin";
  66 }
  67
  68 string
  69 PYinVamp::getName() const
  70 {
  71     return "pYin";
  72 }
  73
  74 string
  75 PYinVamp::getDescription() const
  76 {
  77     return "Monophonic pitch and note tracking based on a probabilistic Yin extension.";
  78 }
  79
  80 string
  81 PYinVamp::getMaker() const
  82 {
  83     return "Matthias Mauch";
  84 }
  85
  86 int
  87 PYinVamp::getPluginVersion() const
  88 {
  89     // Increment this each time you release a version that behaves
  90     // differently from the previous one
  91     return 2;
  92 }
  93
  94 string
  95 PYinVamp::getCopyright() const
  96 {
  97     return "GPL";
  98 }
  99
 100 PYinVamp::InputDomain
 101 PYinVamp::getInputDomain() const
 102 {
 103     return TimeDomain;
 104 }
 105
 106 size_t
 107 PYinVamp::getPreferredBlockSize() const
 108 {
 109     return 2048;
 110 }
 111
 112 size_t
 113 PYinVamp::getPreferredStepSize() const
 114 {
 115     return 256;
 116 }
 117
 118 size_t
 119 PYinVamp::getMinChannelCount() const
 120 {
 121     return 1;
 122 }
 123
 124 size_t
 125 PYinVamp::getMaxChannelCount() const
 126 {
 127     return 1;
 128 }
 129
 130 PYinVamp::ParameterList
 131 PYinVamp::getParameterDescriptors() const
 132 {
 133     ParameterList list;
 134
 135     ParameterDescriptor d;
 136
 137     d.identifier = "threshdistr";
 138     d.name = "Yin threshold distribution";
 139     d.description = ".";
 140     d.unit = "";
 141     d.minValue = 0.0f;
 142     d.maxValue = 7.0f;
 143     d.defaultValue = 2.0f;
 144     d.isQuantized = true;
 145     d.quantizeStep = 1.0f;
 146     d.valueNames.push_back("Uniform");
 147     d.valueNames.push_back("Beta (mean 0.10)");
 148     d.valueNames.push_back("Beta (mean 0.15)");
 149     d.valueNames.push_back("Beta (mean 0.20)");
 150     d.valueNames.push_back("Beta (mean 0.30)");
 151     d.valueNames.push_back("Single Value 0.10");
 152     d.valueNames.push_back("Single Value 0.15");
 153     d.valueNames.push_back("Single Value 0.20");
 154     list.push_back(d);
 155
 156     d.identifier = "outputunvoiced";
 157     d.valueNames.clear();
 158     d.name = "Output estimates classified as unvoiced?";
 159     d.description = ".";
 160     d.unit = "";
 161     d.minValue = 0.0f;
 162     d.maxValue = 2.0f;
 163     d.defaultValue = 0.0f;
 164     d.isQuantized = true;
 165     d.quantizeStep = 1.0f;
 166     d.valueNames.push_back("No");
 167     d.valueNames.push_back("Yes");
 168     d.valueNames.push_back("Yes, as negative frequencies");
 169     list.push_back(d);
 170
 171     d.identifier = "precisetime";
 172     d.valueNames.clear();
 173     d.name = "Use non-standard precise YIN timing (slow).";
 174     d.description = ".";
 175     d.unit = "";
 176     d.minValue = 0.0f;
 177     d.maxValue = 1.0f;
 178     d.defaultValue = 0.0f;
 179     d.isQuantized = true;
 180     d.quantizeStep = 1.0f;
 181     list.push_back(d);
 182
 183     d.identifier = "lowampsuppression";
 184     d.valueNames.clear();
 185     d.name = "Suppress low amplitude pitch estimates.";
 186     d.description = ".";
 187     d.unit = "";
 188     d.minValue = 0.0f;
 189     d.maxValue = 1.0f;
 190     d.defaultValue = 0.1f;
 191     d.isQuantized = false;
 192     list.push_back(d);
 193
 194     d.identifier = "onsetsensitivity";
 195     d.valueNames.clear();
 196     d.name = "Onset sensitivity";
 197     d.description = "Adds additional note onsets when RMS increases.";
 198     d.unit = "";
 199     d.minValue = 0.0f;
 200     d.maxValue = 1.0f;
 201     d.defaultValue = 0.7f;
 202     d.isQuantized = false;
 203     list.push_back(d);
 204
 205     d.identifier = "prunethresh";
 206     d.valueNames.clear();
 207     d.name = "Duration pruning threshold.";
 208     d.description = "Prune notes that are shorter than this value.";
 209     d.unit = "";
 210     d.minValue = 0.0f;
 211     d.maxValue = 0.2f;
 212     d.defaultValue = 0.1f;
 213     d.isQuantized = false;
 214     list.push_back(d);
 215
 216     return list;
 217 }
 218
 219 float
 220 PYinVamp::getParameter(string identifier) const
 221 {
 222     if (identifier == "threshdistr") {
 223             return m_threshDistr;
 224     }
 225     if (identifier == "outputunvoiced") {
 226             return m_outputUnvoiced;
 227     }
 228     if (identifier == "precisetime") {
 229             return m_preciseTime;
 230     }
 231     if (identifier == "lowampsuppression") {
 232             return m_lowAmp;
 233     }
 234     if (identifier == "onsetsensitivity") {
 235             return m_onsetSensitivity;
 236     }
 237     if (identifier == "prunethresh") {
 238             return m_pruneThresh;
 239     }
 240     return 0.f;
 241 }
 242
 243 void
 244 PYinVamp::setParameter(string identifier, float value)
 245 {
 246     if (identifier == "threshdistr")
 247     {
 248         m_threshDistr = value;
 249     }
 250     if (identifier == "outputunvoiced")
 251     {
 252         m_outputUnvoiced = value;
 253     }
 254     if (identifier == "precisetime")
 255     {
 256         m_preciseTime = value;
 257     }
 258     if (identifier == "lowampsuppression")
 259     {
 260         m_lowAmp = value;
 261     }
 262     if (identifier == "onsetsensitivity")
 263     {
 264         m_onsetSensitivity = value;
 265     }
 266     if (identifier == "prunethresh")
 267     {
 268         m_pruneThresh = value;
 269     }
 270 }
 271
 272 PYinVamp::ProgramList
 273 PYinVamp::getPrograms() const
 274 {
 275     ProgramList list;
 276     return list;
 277 }
 278
 279 string
 280 PYinVamp::getCurrentProgram() const
 281 {
 282     return ""; // no programs
 283 }
 284
 285 void
 286 PYinVamp::selectProgram(string name)
 287 {
 288 }
 289
 290 PYinVamp::OutputList
 291 PYinVamp::getOutputDescriptors() const
 292 {
 293     OutputList outputs;
 294
 295     OutputDescriptor d;
 296
 297     int outputNumber = 0;
 298
 299     d.identifier = "f0candidates";
 300     d.name = "F0 Candidates";
 301     d.description = "Estimated fundamental frequency candidates.";
 302     d.unit = "Hz";
 303     d.hasFixedBinCount = false;
 304     // d.binCount = 1;
 305     d.hasKnownExtents = true;
 306     d.minValue = m_fmin;
 307     d.maxValue = 500;
 308     d.isQuantized = false;
 309     d.sampleType = OutputDescriptor::FixedSampleRate;
 310     d.sampleRate = (m_inputSampleRate / m_stepSize);
 311     d.hasDuration = false;
 312     outputs.push_back(d);
 313     m_oF0Candidates = outputNumber++;
 314
 315     d.identifier = "f0probs";
 316     d.name = "Candidate Probabilities";
 317     d.description = "Probabilities  of estimated fundamental frequency candidates.";
 318     d.unit = "";
 319     d.hasFixedBinCount = false;
 320     // d.binCount = 1;
 321     d.hasKnownExtents = true;
 322     d.minValue = 0;
 323     d.maxValue = 1;
 324     d.isQuantized = false;
 325     d.sampleType = OutputDescriptor::FixedSampleRate;
 326     d.sampleRate = (m_inputSampleRate / m_stepSize);
 327     d.hasDuration = false;
 328     outputs.push_back(d);
 329     m_oF0Probs = outputNumber++;
 330
 331     d.identifier = "voicedprob";
 332     d.name = "Voiced Probability";
 333     d.description = "Probability that the signal is voiced according to Probabilistic Yin.";
 334     d.unit = "";
 335     d.hasFixedBinCount = true;
 336     d.binCount = 1;
 337     d.hasKnownExtents = true;
 338     d.minValue = 0;
 339     d.maxValue = 1;
 340     d.isQuantized = false;
 341     d.sampleType = OutputDescriptor::FixedSampleRate;
 342     d.sampleRate = (m_inputSampleRate / m_stepSize);
 343     d.hasDuration = false;
 344     outputs.push_back(d);
 345     m_oVoicedProb = outputNumber++;
 346
 347     d.identifier = "candidatesalience";
 348     d.name = "Candidate Salience";
 349     d.description = "Candidate Salience";
 350     d.hasFixedBinCount = true;
 351     d.binCount = m_blockSize / 2;
 352     d.hasKnownExtents = true;
 353     d.minValue = 0;
 354     d.maxValue = 1;
 355     d.isQuantized = false;
 356     d.sampleType = OutputDescriptor::FixedSampleRate;
 357     d.sampleRate = (m_inputSampleRate / m_stepSize);
 358     d.hasDuration = false;
 359     outputs.push_back(d);
 360     m_oCandidateSalience = outputNumber++;
 361
 362     d.identifier = "smoothedpitchtrack";
 363     d.name = "Smoothed Pitch Track";
 364     d.description = ".";
 365     d.unit = "Hz";
 366     d.hasFixedBinCount = true;
 367     d.binCount = 1;
 368     d.hasKnownExtents = false;
 369     // d.minValue = 0;
 370     // d.maxValue = 1;
 371     d.isQuantized = false;
 372     d.sampleType = OutputDescriptor::FixedSampleRate;
 373     d.sampleRate = (m_inputSampleRate / m_stepSize);
 374     d.hasDuration = false;
 375     outputs.push_back(d);
 376     m_oSmoothedPitchTrack = outputNumber++;
 377
 378     d.identifier = "notes";
 379     d.name = "Notes";
 380     d.description = "Derived fixed-pitch note frequencies";
 381     // d.unit = "MIDI unit";
 382     d.unit = "Hz";
 383     d.hasFixedBinCount = true;
 384     d.binCount = 1;
 385     d.hasKnownExtents = false;
 386     d.isQuantized = false;
 387     d.sampleType = OutputDescriptor::VariableSampleRate;
 388     d.sampleRate = (m_inputSampleRate / m_stepSize);
 389     d.hasDuration = true;
 390     outputs.push_back(d);
 391     m_oNotes = outputNumber++;
 392
 393     return outputs;
 394 }
 395
 396 bool
 397 PYinVamp::initialise(size_t channels, size_t stepSize, size_t blockSize)
 398 {
 399     if (channels < getMinChannelCount() ||
 400         channels > getMaxChannelCount()) return false;
 401
 402 /*
 403     std::cerr << "PYinVamp::initialise: channels = " << channels
 404           << ", stepSize = " << stepSize << ", blockSize = " << blockSize
 405           << std::endl;
 406 */
 407     m_channels = channels;
 408     m_stepSize = stepSize;
 409     m_blockSize = blockSize;
 410
 411     reset();
 412
 413     return true;
 414 }
 415
 416 void
 417 PYinVamp::reset()
 418 {
 419     m_yin.setThresholdDistr(m_threshDistr);
 420     m_yin.setFrameSize(m_blockSize);
 421     m_yin.setFast(!m_preciseTime);
 422
 423     m_pitchProb.clear();
 424     m_timestamp.clear();
 425     m_level.clear();
 426 /*
 427     std::cerr << "PYinVamp::reset"
 428           << ", blockSize = " << m_blockSize
 429           << std::endl;
 430 */
 431 }
 432
 433 PYinVamp::FeatureSet
 434 PYinVamp::process(const float *const *inputBuffers, RealTime timestamp)
 435 {
 436     int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
 437     timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate));
 438
 439     FeatureSet fs;
 440
 441     float rms = 0;
 442
 443     double *dInputBuffers = new double[m_blockSize];
 444     for (size_t i = 0; i < m_blockSize; ++i) {
 445         dInputBuffers[i] = inputBuffers[0][i];
 446         rms += inputBuffers[0][i] * inputBuffers[0][i];
 447     }
 448     rms /= m_blockSize;
 449     rms = sqrt(rms);
 450
 451     bool isLowAmplitude = (rms < m_lowAmp);
 452
 453     Yin::YinOutput yo = m_yin.processProbabilisticYin(dInputBuffers);
 454     delete [] dInputBuffers;
 455
 456     m_level.push_back(yo.rms);
 457
 458     // First, get the things out of the way that we don't want to output
 459     // immediately, but instead save for later.
 460     vector<pair<double, double> > tempPitchProb;
 461     for (size_t iCandidate = 0; iCandidate < yo.freqProb.size(); ++iCandidate)
 462     {
 463         double tempPitch = 12 * std::log(yo.freqProb[iCandidate].first/440)/std::log(2.) + 69;
 464         if (!isLowAmplitude)
 465         {
 466             tempPitchProb.push_back(pair<double, double>
 467                 (tempPitch, yo.freqProb[iCandidate].second));
 468         } else {
 469             float factor = ((rms+0.01*m_lowAmp)/(1.01*m_lowAmp));
 470             tempPitchProb.push_back(pair<double, double>
 471                 (tempPitch, yo.freqProb[iCandidate].second*factor));
 472         }
 473     }
 474     m_pitchProb.push_back(tempPitchProb);
 475     m_timestamp.push_back(timestamp);
 476
 477     // F0 CANDIDATES
 478     Feature f;
 479     f.hasTimestamp = true;
 480     f.timestamp = timestamp;
 481     for (size_t i = 0; i < yo.freqProb.size(); ++i)
 482     {
 483         f.values.push_back(yo.freqProb[i].first);
 484     }
 485     fs[m_oF0Candidates].push_back(f);
 486
 487     // VOICEDPROB
 488     f.values.clear();
 489     float voicedProb = 0;
 490     for (size_t i = 0; i < yo.freqProb.size(); ++i)
 491     {
 492         f.values.push_back(yo.freqProb[i].second);
 493         voicedProb += yo.freqProb[i].second;
 494     }
 495     fs[m_oF0Probs].push_back(f);
 496
 497     f.values.push_back(voicedProb);
 498     fs[m_oVoicedProb].push_back(f);
 499
 500     // SALIENCE -- maybe this should eventually disappear
 501     f.values.clear();
 502     float salienceSum = 0;
 503     for (size_t iBin = 0; iBin < yo.salience.size(); ++iBin)
 504     {
 505         f.values.push_back(yo.salience[iBin]);
 506         salienceSum += yo.salience[iBin];
 507     }
 508     fs[m_oCandidateSalience].push_back(f);
 509
 510     return fs;
 511 }
 512
 513 PYinVamp::FeatureSet
 514 PYinVamp::getRemainingFeatures()
 515 {
 516     FeatureSet fs;
 517     Feature f;
 518     f.hasTimestamp = true;
 519     f.hasDuration = false;
 520
 521     if (m_pitchProb.empty()) {
 522         return fs;
 523     }
 524
 525     // MONO-PITCH STUFF
 526     MonoPitch mp;
 527     vector<float> mpOut = mp.process(m_pitchProb);
 528     for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame)
 529     {
 530         if (mpOut[iFrame] < 0 && (m_outputUnvoiced==0)) continue;
 531         f.timestamp = m_timestamp[iFrame];
 532         f.values.clear();
 533         if (m_outputUnvoiced == 1)
 534         {
 535             f.values.push_back(fabs(mpOut[iFrame]));
 536         } else {
 537             f.values.push_back(mpOut[iFrame]);
 538         }
 539
 540         fs[m_oSmoothedPitchTrack].push_back(f);
 541     }
 542
 543     // MONO-NOTE STUFF
 544 //    std::cerr << "Mono Note Stuff" << std::endl;
 545     MonoNote mn;
 546     std::vector<std::vector<std::pair<double, double> > > smoothedPitch;
 547     for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) {
 548         std::vector<std::pair<double, double> > temp;
 549         if (mpOut[iFrame] > 0)
 550         {
 551             double tempPitch = 12 * std::log(mpOut[iFrame]/440)/std::log(2.) + 69;
 552             temp.push_back(std::pair<double,double>(tempPitch, .9));
 553         }
 554         smoothedPitch.push_back(temp);
 555     }
 556     // vector<MonoNote::FrameOutput> mnOut = mn.process(m_pitchProb);
 557     vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch);
 558
 559     // turning feature into a note feature
 560     f.hasTimestamp = true;
 561     f.hasDuration = true;
 562     f.values.clear();
 563
 564     int onsetFrame = 0;
 565     bool isVoiced = 0;
 566     bool oldIsVoiced = 0;
 567     size_t nFrame = m_pitchProb.size();
 568
 569     float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize;
 570
 571     std::vector<float> notePitchTrack; // collects pitches for one note at a time
 572     for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
 573     {
 574         isVoiced = mnOut[iFrame].noteState < 3
 575                    && smoothedPitch[iFrame].size() > 0
 576                    && (iFrame >= nFrame-2
 577                        || ((m_level[iFrame]/m_level[iFrame+2]) > m_onsetSensitivity));
 578         // std::cerr << m_level[iFrame]/m_level[iFrame-1] << " " << isVoiced << std::endl;
 579         if (isVoiced && iFrame != nFrame-1)
 580         {
 581             if (oldIsVoiced == 0) // beginning of a note
 582             {
 583                 onsetFrame = iFrame;
 584             }
 585             float pitch = smoothedPitch[iFrame][0].first;
 586             notePitchTrack.push_back(pitch); // add to the note's pitch track
 587         } else { // not currently voiced
 588             if (oldIsVoiced == 1) // end of note
 589             {
 590                 // std::cerr << notePitchTrack.size() << " " << minNoteFrames << std::endl;
 591                 if (notePitchTrack.size() >= minNoteFrames)
 592                 {
 593                     std::sort(notePitchTrack.begin(), notePitchTrack.end());
 594                     float medianPitch = notePitchTrack[notePitchTrack.size()/2];
 595                     float medianFreq = std::pow(2,(medianPitch - 69) / 12) * 440;
 596                     f.values.clear();
 597                     f.values.push_back(medianFreq);
 598                     f.timestamp = m_timestamp[onsetFrame];
 599                     f.duration = m_timestamp[iFrame] - m_timestamp[onsetFrame];
 600                     fs[m_oNotes].push_back(f);
 601                 }
 602                 notePitchTrack.clear();
 603             }
 604         }
 605         oldIsVoiced = isVoiced;
 606     }
 607     return fs;
 608 }