libs/vamp-pyin/LocalCandidatePYIN.cpp

   1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
   2
   3 /*
   4     pYIN - A fundamental frequency estimator for monophonic audio
   5     Centre for Digital Music, Queen Mary, University of London.
   6
   7     This program is free software; you can redistribute it and/or
   8     modify it under the terms of the GNU General Public License as
   9     published by the Free Software Foundation; either version 2 of the
  10     License, or (at your option) any later version.  See the file
  11     COPYING included with this distribution for more information.
  12 */
  13
  14 #include "LocalCandidatePYIN.h"
  15 #include "MonoPitch.h"
  16 #include "YinUtil.h"
  17
  18 #include "vamp-sdk/FFT.h"
  19
  20 #include <vector>
  21 #include <algorithm>
  22
  23 #include <cstdio>
  24 #include <sstream>
  25 // #include <iostream>
  26 #include <cmath>
  27 #include <complex>
  28 #include <map>
  29
  30 #include <boost/math/distributions.hpp>
  31
  32 using std::string;
  33 using std::vector;
  34 using std::map;
  35 using Vamp::RealTime;
  36
  37
  38 LocalCandidatePYIN::LocalCandidatePYIN(float inputSampleRate) :
  39     Plugin(inputSampleRate),
  40     m_channels(0),
  41     m_stepSize(256),
  42     m_blockSize(2048),
  43     m_fmin(40),
  44     m_fmax(700),
  45     m_oPitchTrackCandidates(0),
  46     m_threshDistr(2.0f),
  47     m_outputUnvoiced(0.0f),
  48     m_preciseTime(0.0f),
  49     m_pitchProb(0),
  50     m_timestamp(0),
  51     m_nCandidate(13)
  52 {
  53 }
  54
  55 LocalCandidatePYIN::~LocalCandidatePYIN()
  56 {
  57 }
  58
  59 string
  60 LocalCandidatePYIN::getIdentifier() const
  61 {
  62     return "localcandidatepyin";
  63 }
  64
  65 string
  66 LocalCandidatePYIN::getName() const
  67 {
  68     return "Local Candidate PYIN";
  69 }
  70
  71 string
  72 LocalCandidatePYIN::getDescription() const
  73 {
  74     return "Monophonic pitch and note tracking based on a probabilistic Yin extension.";
  75 }
  76
  77 string
  78 LocalCandidatePYIN::getMaker() const
  79 {
  80     return "Matthias Mauch";
  81 }
  82
  83 int
  84 LocalCandidatePYIN::getPluginVersion() const
  85 {
  86     // Increment this each time you release a version that behaves
  87     // differently from the previous one
  88     return 2;
  89 }
  90
  91 string
  92 LocalCandidatePYIN::getCopyright() const
  93 {
  94     return "GPL";
  95 }
  96
  97 LocalCandidatePYIN::InputDomain
  98 LocalCandidatePYIN::getInputDomain() const
  99 {
 100     return TimeDomain;
 101 }
 102
 103 size_t
 104 LocalCandidatePYIN::getPreferredBlockSize() const
 105 {
 106     return 2048;
 107 }
 108
 109 size_t
 110 LocalCandidatePYIN::getPreferredStepSize() const
 111 {
 112     return 256;
 113 }
 114
 115 size_t
 116 LocalCandidatePYIN::getMinChannelCount() const
 117 {
 118     return 1;
 119 }
 120
 121 size_t
 122 LocalCandidatePYIN::getMaxChannelCount() const
 123 {
 124     return 1;
 125 }
 126
 127 LocalCandidatePYIN::ParameterList
 128 LocalCandidatePYIN::getParameterDescriptors() const
 129 {
 130     ParameterList list;
 131
 132     ParameterDescriptor d;
 133
 134     d.identifier = "threshdistr";
 135     d.name = "Yin threshold distribution";
 136     d.description = ".";
 137     d.unit = "";
 138     d.minValue = 0.0f;
 139     d.maxValue = 7.0f;
 140     d.defaultValue = 2.0f;
 141     d.isQuantized = true;
 142     d.quantizeStep = 1.0f;
 143     d.valueNames.push_back("Uniform");
 144     d.valueNames.push_back("Beta (mean 0.10)");
 145     d.valueNames.push_back("Beta (mean 0.15)");
 146     d.valueNames.push_back("Beta (mean 0.20)");
 147     d.valueNames.push_back("Beta (mean 0.30)");
 148     d.valueNames.push_back("Single Value 0.10");
 149     d.valueNames.push_back("Single Value 0.15");
 150     d.valueNames.push_back("Single Value 0.20");
 151     list.push_back(d);
 152
 153     d.identifier = "outputunvoiced";
 154     d.valueNames.clear();
 155     d.name = "Output estimates classified as unvoiced?";
 156     d.description = ".";
 157     d.unit = "";
 158     d.minValue = 0.0f;
 159     d.maxValue = 2.0f;
 160     d.defaultValue = 0.0f;
 161     d.isQuantized = true;
 162     d.quantizeStep = 1.0f;
 163     d.valueNames.push_back("No");
 164     d.valueNames.push_back("Yes");
 165     d.valueNames.push_back("Yes, as negative frequencies");
 166     list.push_back(d);
 167
 168     d.identifier = "precisetime";
 169     d.valueNames.clear();
 170     d.name = "Use non-standard precise YIN timing (slow).";
 171     d.description = ".";
 172     d.unit = "";
 173     d.minValue = 0.0f;
 174     d.maxValue = 1.0f;
 175     d.defaultValue = 0.0f;
 176     d.isQuantized = true;
 177     d.quantizeStep = 1.0f;
 178     list.push_back(d);
 179
 180     return list;
 181 }
 182
 183 float
 184 LocalCandidatePYIN::getParameter(string identifier) const
 185 {
 186     if (identifier == "threshdistr") {
 187             return m_threshDistr;
 188     }
 189     if (identifier == "outputunvoiced") {
 190             return m_outputUnvoiced;
 191     }
 192     if (identifier == "precisetime") {
 193             return m_preciseTime;
 194     }
 195     return 0.f;
 196 }
 197
 198 void
 199 LocalCandidatePYIN::setParameter(string identifier, float value)
 200 {
 201     if (identifier == "threshdistr")
 202     {
 203         m_threshDistr = value;
 204     }
 205     if (identifier == "outputunvoiced")
 206     {
 207         m_outputUnvoiced = value;
 208     }
 209     if (identifier == "precisetime")
 210     {
 211         m_preciseTime = value;
 212     }
 213 }
 214
 215 LocalCandidatePYIN::ProgramList
 216 LocalCandidatePYIN::getPrograms() const
 217 {
 218     ProgramList list;
 219     return list;
 220 }
 221
 222 string
 223 LocalCandidatePYIN::getCurrentProgram() const
 224 {
 225     return ""; // no programs
 226 }
 227
 228 void
 229 LocalCandidatePYIN::selectProgram(string name)
 230 {
 231 }
 232
 233 LocalCandidatePYIN::OutputList
 234 LocalCandidatePYIN::getOutputDescriptors() const
 235 {
 236     OutputList outputs;
 237
 238     OutputDescriptor d;
 239
 240     d.identifier = "pitchtrackcandidates";
 241     d.name = "Pitch track candidates";
 242     d.description = "Multiple candidate pitch tracks.";
 243     d.unit = "Hz";
 244     d.hasFixedBinCount = false;
 245     d.hasKnownExtents = true;
 246     d.minValue = m_fmin;
 247     d.maxValue = 500; //!!!???
 248     d.isQuantized = false;
 249     d.sampleType = OutputDescriptor::FixedSampleRate;
 250     d.sampleRate = (m_inputSampleRate / m_stepSize);
 251     d.hasDuration = false;
 252     outputs.push_back(d);
 253
 254     return outputs;
 255 }
 256
 257 bool
 258 LocalCandidatePYIN::initialise(size_t channels, size_t stepSize, size_t blockSize)
 259 {
 260     if (channels < getMinChannelCount() ||
 261         channels > getMaxChannelCount()) return false;
 262
 263 /*
 264     std::cerr << "LocalCandidatePYIN::initialise: channels = " << channels
 265           << ", stepSize = " << stepSize << ", blockSize = " << blockSize
 266           << std::endl;
 267 */
 268     m_channels = channels;
 269     m_stepSize = stepSize;
 270     m_blockSize = blockSize;
 271
 272     reset();
 273
 274     return true;
 275 }
 276
 277 void
 278 LocalCandidatePYIN::reset()
 279 {
 280     m_pitchProb.clear();
 281     m_timestamp.clear();
 282 /*
 283     std::cerr << "LocalCandidatePYIN::reset"
 284           << ", blockSize = " << m_blockSize
 285           << std::endl;
 286 */
 287 }
 288
 289 LocalCandidatePYIN::FeatureSet
 290 LocalCandidatePYIN::process(const float *const *inputBuffers, RealTime timestamp)
 291 {
 292     int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
 293     timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate));
 294
 295     double *dInputBuffers = new double[m_blockSize];
 296     for (size_t i = 0; i < m_blockSize; ++i) dInputBuffers[i] = inputBuffers[0][i];
 297
 298     size_t yinBufferSize = m_blockSize/2;
 299     double* yinBuffer = new double[yinBufferSize];
 300     if (!m_preciseTime) YinUtil::fastDifference(dInputBuffers, yinBuffer, yinBufferSize);
 301     else YinUtil::slowDifference(dInputBuffers, yinBuffer, yinBufferSize);
 302
 303     delete [] dInputBuffers;
 304
 305     YinUtil::cumulativeDifference(yinBuffer, yinBufferSize);
 306
 307     float minFrequency = 60;
 308     float maxFrequency = 900;
 309     vector<double> peakProbability = YinUtil::yinProb(yinBuffer,
 310                                                       m_threshDistr,
 311                                                       yinBufferSize,
 312                                                       m_inputSampleRate/maxFrequency,
 313                                                       m_inputSampleRate/minFrequency);
 314
 315     vector<pair<double, double> > tempPitchProb;
 316     for (size_t iBuf = 0; iBuf < yinBufferSize; ++iBuf)
 317     {
 318         if (peakProbability[iBuf] > 0)
 319         {
 320             double currentF0 =
 321                 m_inputSampleRate * (1.0 /
 322                 YinUtil::parabolicInterpolation(yinBuffer, iBuf, yinBufferSize));
 323             double tempPitch = 12 * std::log(currentF0/440)/std::log(2.) + 69;
 324             tempPitchProb.push_back(pair<double, double>(tempPitch, peakProbability[iBuf]));
 325         }
 326     }
 327     m_pitchProb.push_back(tempPitchProb);
 328     m_timestamp.push_back(timestamp);
 329
 330     delete[] yinBuffer;
 331
 332     return FeatureSet();
 333 }
 334
 335 LocalCandidatePYIN::FeatureSet
 336 LocalCandidatePYIN::getRemainingFeatures()
 337 {
 338     // timestamp -> candidate number -> value
 339     map<RealTime, map<int, float> > featureValues;
 340
 341     // std::cerr << "in remaining features" << std::endl;
 342
 343     if (m_pitchProb.empty()) {
 344         return FeatureSet();
 345     }
 346
 347     // MONO-PITCH STUFF
 348     MonoPitch mp;
 349     size_t nFrame = m_timestamp.size();
 350     vector<vector<float> > pitchTracks;
 351     vector<float> freqSum = vector<float>(m_nCandidate);
 352     vector<float> freqNumber = vector<float>(m_nCandidate);
 353     vector<float> freqMean = vector<float>(m_nCandidate);
 354
 355     boost::math::normal normalDist(0, 8); // semitones sd
 356     float maxNormalDist = boost::math::pdf(normalDist, 0);
 357
 358     // Viterbi-decode multiple times with different frequencies emphasised
 359     for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
 360     {
 361         pitchTracks.push_back(vector<float>(nFrame));
 362         vector<vector<pair<double,double> > > tempPitchProb;
 363         float centrePitch = 45 + 3 * iCandidate;
 364
 365         for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) {
 366             tempPitchProb.push_back(vector<pair<double,double> >());
 367             float sumProb = 0;
 368             float pitch = 0;
 369             float prob = 0;
 370             for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb)
 371             {
 372                 pitch = m_pitchProb[iFrame][iProb].first;
 373                 prob  = m_pitchProb[iFrame][iProb].second *
 374                     boost::math::pdf(normalDist, pitch-centrePitch) /
 375                     maxNormalDist * 2;
 376                 sumProb += prob;
 377                 tempPitchProb[iFrame].push_back(
 378                     pair<double,double>(pitch,prob));
 379             }
 380             for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb)
 381             {
 382                 tempPitchProb[iFrame][iProb].second /= sumProb;
 383             }
 384         }
 385
 386         vector<float> mpOut = mp.process(tempPitchProb);
 387         //float prevFreq = 0;
 388         for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
 389         {
 390             if (mpOut[iFrame] > 0) {
 391
 392                 pitchTracks[iCandidate][iFrame] = mpOut[iFrame];
 393                 freqSum[iCandidate] += mpOut[iFrame];
 394                 freqNumber[iCandidate]++;
 395                 //prevFreq = mpOut[iFrame];
 396
 397             }
 398         }
 399         freqMean[iCandidate] = freqSum[iCandidate]*1.0/freqNumber[iCandidate];
 400     }
 401
 402     // find near duplicate pitch tracks
 403     vector<size_t> duplicates;
 404     for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) {
 405         for (size_t jCandidate = iCandidate+1; jCandidate < m_nCandidate; ++jCandidate) {
 406             size_t countEqual = 0;
 407             for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
 408             {
 409                 if ((pitchTracks[jCandidate][iFrame] == 0 && pitchTracks[iCandidate][iFrame] == 0) ||
 410                 fabs(pitchTracks[iCandidate][iFrame]/pitchTracks[jCandidate][iFrame]-1)<0.01)
 411                 countEqual++;
 412             }
 413             // std::cerr << "proportion equal: " << (countEqual * 1.0 / nFrame) << std::endl;
 414             if (countEqual * 1.0 / nFrame > 0.8) {
 415                 if (freqNumber[iCandidate] > freqNumber[jCandidate]) {
 416                     duplicates.push_back(jCandidate);
 417                 } else if (iCandidate < jCandidate) {
 418                     duplicates.push_back(iCandidate);
 419                 }
 420             }
 421         }
 422     }
 423
 424     // now find non-duplicate pitch tracks
 425     map<int, int> candidateActuals;
 426     map<int, std::string> candidateLabels;
 427
 428     vector<vector<float> > outputFrequencies;
 429     for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) outputFrequencies.push_back(vector<float>());
 430
 431     int actualCandidateNumber = 0;
 432     for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
 433     {
 434         bool isDuplicate = false;
 435         for (size_t i = 0; i < duplicates.size(); ++i) {
 436
 437             if (duplicates[i] == iCandidate) {
 438                 isDuplicate = true;
 439                 break;
 440             }
 441         }
 442         if (!isDuplicate && freqNumber[iCandidate] > 0.5*nFrame)
 443         {
 444             std::ostringstream convert;
 445             convert << actualCandidateNumber++;
 446             candidateLabels[iCandidate] = convert.str();
 447             candidateActuals[iCandidate] = actualCandidateNumber;
 448             // std::cerr << iCandidate << " " << actualCandidateNumber << " " << freqNumber[iCandidate] << " " << freqMean[iCandidate] << std::endl;
 449             for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
 450             {
 451                 if (pitchTracks[iCandidate][iFrame] > 0)
 452                 {
 453                     // featureValues[m_timestamp[iFrame]][iCandidate] =
 454                     //     pitchTracks[iCandidate][iFrame];
 455                     outputFrequencies[iFrame].push_back(pitchTracks[iCandidate][iFrame]);
 456                 } else {
 457                     outputFrequencies[iFrame].push_back(0);
 458                 }
 459             }
 460         }
 461         // fs[m_oPitchTrackCandidates].push_back(f);
 462     }
 463
 464     // adapt our features so as to return a stack of candidate values
 465     // per frame
 466
 467     FeatureSet fs;
 468
 469     for (size_t iFrame = 0; iFrame < nFrame; ++iFrame){
 470         Feature f;
 471         f.hasTimestamp = true;
 472         f.timestamp = m_timestamp[iFrame];
 473         f.values = outputFrequencies[iFrame];
 474         fs[0].push_back(f);
 475     }
 476
 477     // I stopped using Chris's map stuff below because I couldn't get my head around it
 478     //
 479     // for (map<RealTime, map<int, float> >::const_iterator i =
 480     //          featureValues.begin(); i != featureValues.end(); ++i) {
 481     //     Feature f;
 482     //     f.hasTimestamp = true;
 483     //     f.timestamp = i->first;
 484     //     int nextCandidate = candidateActuals.begin()->second;
 485     //     for (map<int, float>::const_iterator j =
 486     //              i->second.begin(); j != i->second.end(); ++j) {
 487     //         while (candidateActuals[j->first] > nextCandidate) {
 488     //             f.values.push_back(0);
 489     //             ++nextCandidate;
 490     //         }
 491     //         f.values.push_back(j->second);
 492     //         nextCandidate = j->first + 1;
 493     //     }
 494     //     //!!! can't use labels?
 495     //     fs[0].push_back(f);
 496     // }
 497
 498     return fs;
 499 }