libs/vamp-plugins/SimilarityPlugin.cpp

   1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
   2
   3 /*
   4  * SimilarityPlugin.cpp
   5  *
   6  * Copyright 2009 Centre for Digital Music, Queen Mary, University of London.
   7
   8     This program is free software; you can redistribute it and/or
   9     modify it under the terms of the GNU General Public License as
  10     published by the Free Software Foundation; either version 2 of the
  11     License, or (at your option) any later version.  See the file
  12     COPYING included with this distribution for more information.
  13  */
  14
  15 #include <iostream>
  16 #include <cstdio>
  17
  18 #include "SimilarityPlugin.h"
  19 #include "base/Pitch.h"
  20 #include "dsp/mfcc/MFCC.h"
  21 #include "dsp/chromagram/Chromagram.h"
  22 #include "dsp/rateconversion/Decimator.h"
  23 #include "dsp/rhythm/BeatSpectrum.h"
  24 #include "maths/KLDivergence.h"
  25 #include "maths/CosineDistance.h"
  26 #include "maths/MathUtilities.h"
  27
  28 using std::string;
  29 using std::vector;
  30 using std::cerr;
  31 using std::endl;
  32 using std::ostringstream;
  33
  34 const float
  35 SimilarityPlugin::m_noRhythm = 0.009;
  36
  37 const float
  38 SimilarityPlugin::m_allRhythm = 0.991;
  39
  40 SimilarityPlugin::SimilarityPlugin(float inputSampleRate) :
  41     Plugin(inputSampleRate),
  42     m_type(TypeMFCC),
  43     m_mfcc(0),
  44     m_rhythmfcc(0),
  45     m_chromagram(0),
  46     m_decimator(0),
  47     m_featureColumnSize(20),
  48     m_rhythmWeighting(0.5f),
  49     m_rhythmClipDuration(4.f), // seconds
  50     m_rhythmClipOrigin(40.f), // seconds
  51     m_rhythmClipFrameSize(0),
  52     m_rhythmClipFrames(0),
  53     m_rhythmColumnSize(20),
  54     m_blockSize(0),
  55     m_channels(0),
  56     m_processRate(0),
  57     m_frameNo(0),
  58     m_done(false)
  59 {
  60     int rate = lrintf(m_inputSampleRate);
  61     int internalRate = 22050;
  62     int decimationFactor = rate / internalRate;
  63     if (decimationFactor < 1) decimationFactor = 1;
  64
  65     // must be a power of two
  66     while (decimationFactor & (decimationFactor - 1)) ++decimationFactor;
  67
  68     m_processRate = rate / decimationFactor; // may be 22050, 24000 etc
  69 }
  70
  71 SimilarityPlugin::~SimilarityPlugin()
  72 {
  73     delete m_mfcc;
  74     delete m_rhythmfcc;
  75     delete m_chromagram;
  76     delete m_decimator;
  77 }
  78
  79 string
  80 SimilarityPlugin::getIdentifier() const
  81 {
  82     return "qm-similarity";
  83 }
  84
  85 string
  86 SimilarityPlugin::getName() const
  87 {
  88     return "Similarity";
  89 }
  90
  91 string
  92 SimilarityPlugin::getDescription() const
  93 {
  94     return "Return a distance matrix for similarity between the input audio channels";
  95 }
  96
  97 string
  98 SimilarityPlugin::getMaker() const
  99 {
 100     return "Queen Mary, University of London";
 101 }
 102
 103 int
 104 SimilarityPlugin::getPluginVersion() const
 105 {
 106     return 1;
 107 }
 108
 109 string
 110 SimilarityPlugin::getCopyright() const
 111 {
 112     return "Plugin by Mark Levy, Kurt Jacobson and Chris Cannam.  Copyright (c) 2009 QMUL - All Rights Reserved";
 113 }
 114
 115 size_t
 116 SimilarityPlugin::getMinChannelCount() const
 117 {
 118     return 1;
 119 }
 120
 121 size_t
 122 SimilarityPlugin::getMaxChannelCount() const
 123 {
 124     return 1024;
 125 }
 126
 127 int
 128 SimilarityPlugin::getDecimationFactor() const
 129 {
 130     int rate = lrintf(m_inputSampleRate);
 131     return rate / m_processRate;
 132 }
 133
 134 size_t
 135 SimilarityPlugin::getPreferredStepSize() const
 136 {
 137     if (m_blockSize == 0) calculateBlockSize();
 138
 139     // there is also an assumption to this effect in process()
 140     // (referring to m_fftSize/2 instead of a literal post-decimation
 141     // step size):
 142     return m_blockSize/2;
 143 }
 144
 145 size_t
 146 SimilarityPlugin::getPreferredBlockSize() const
 147 {
 148     if (m_blockSize == 0) calculateBlockSize();
 149     return m_blockSize;
 150 }
 151
 152 void
 153 SimilarityPlugin::calculateBlockSize() const
 154 {
 155     if (m_blockSize != 0) return;
 156     int decimationFactor = getDecimationFactor();
 157     m_blockSize = 2048 * decimationFactor;
 158 }
 159
 160 SimilarityPlugin::ParameterList SimilarityPlugin::getParameterDescriptors() const
 161 {
 162     ParameterList list;
 163
 164     ParameterDescriptor desc;
 165     desc.identifier = "featureType";
 166     desc.name = "Feature Type";
 167     desc.description = "Audio feature used for similarity measure.  Timbral: use the first 20 MFCCs (19 plus C0).  Chromatic: use 12 bin-per-octave chroma.  Rhythmic: compare beat spectra of short regions.";
 168     desc.unit = "";
 169     desc.minValue = 0;
 170     desc.maxValue = 4;
 171     desc.defaultValue = 1;
 172     desc.isQuantized = true;
 173     desc.quantizeStep = 1;
 174     desc.valueNames.push_back("Timbre");
 175     desc.valueNames.push_back("Timbre and Rhythm");
 176     desc.valueNames.push_back("Chroma");
 177     desc.valueNames.push_back("Chroma and Rhythm");
 178     desc.valueNames.push_back("Rhythm only");
 179     list.push_back(desc);
 180 /*
 181     desc.identifier = "rhythmWeighting";
 182     desc.name = "Influence of Rhythm";
 183     desc.description = "Proportion of similarity measure made up from rhythmic similarity component, from 0 (entirely timbral or chromatic) to 100 (entirely rhythmic).";
 184     desc.unit = "%";
 185     desc.minValue = 0;
 186     desc.maxValue = 100;
 187     desc.defaultValue = 0;
 188     desc.isQuantized = false;
 189     desc.valueNames.clear();
 190     list.push_back(desc);
 191 */
 192     return list;
 193 }
 194
 195 float
 196 SimilarityPlugin::getParameter(std::string param) const
 197 {
 198     if (param == "featureType") {
 199
 200         if (m_rhythmWeighting > m_allRhythm) {
 201             return 4;
 202         }
 203
 204         switch (m_type) {
 205
 206         case TypeMFCC:
 207             if (m_rhythmWeighting < m_noRhythm) return 0;
 208             else return 1;
 209             break;
 210
 211         case TypeChroma:
 212             if (m_rhythmWeighting < m_noRhythm) return 2;
 213             else return 3;
 214             break;
 215         }
 216
 217         return 1;
 218
 219 //    } else if (param == "rhythmWeighting") {
 220 //        return nearbyint(m_rhythmWeighting * 100.0);
 221     }
 222
 223     std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \""
 224               << param << "\"" << std::endl;
 225     return 0.0;
 226 }
 227
 228 void
 229 SimilarityPlugin::setParameter(std::string param, float value)
 230 {
 231     if (param == "featureType") {
 232
 233         int v = int(value + 0.1);
 234
 235         Type newType = m_type;
 236
 237         switch (v) {
 238         case 0: newType = TypeMFCC; m_rhythmWeighting = 0.0f; break;
 239         case 1: newType = TypeMFCC; m_rhythmWeighting = 0.5f; break;
 240         case 2: newType = TypeChroma; m_rhythmWeighting = 0.0f; break;
 241         case 3: newType = TypeChroma; m_rhythmWeighting = 0.5f; break;
 242         case 4: newType = TypeMFCC; m_rhythmWeighting = 1.f; break;
 243         }
 244
 245         if (newType != m_type) m_blockSize = 0;
 246
 247         m_type = newType;
 248         return;
 249
 250 //    } else if (param == "rhythmWeighting") {
 251 //        m_rhythmWeighting = value / 100;
 252 //        return;
 253     }
 254
 255     std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \""
 256               << param << "\"" << std::endl;
 257 }
 258
 259 SimilarityPlugin::OutputList
 260 SimilarityPlugin::getOutputDescriptors() const
 261 {
 262     OutputList list;
 263
 264     OutputDescriptor similarity;
 265     similarity.identifier = "distancematrix";
 266     similarity.name = "Distance Matrix";
 267     similarity.description = "Distance matrix for similarity metric.  Smaller = more similar.  Should be symmetrical.";
 268     similarity.unit = "";
 269     similarity.hasFixedBinCount = true;
 270     similarity.binCount = m_channels;
 271     similarity.hasKnownExtents = false;
 272     similarity.isQuantized = false;
 273     similarity.sampleType = OutputDescriptor::FixedSampleRate;
 274     similarity.sampleRate = 1;
 275
 276     m_distanceMatrixOutput = list.size();
 277     list.push_back(similarity);
 278
 279     OutputDescriptor simvec;
 280     simvec.identifier = "distancevector";
 281     simvec.name = "Distance from First Channel";
 282     simvec.description = "Distance vector for similarity of each channel to the first channel.  Smaller = more similar.";
 283     simvec.unit = "";
 284     simvec.hasFixedBinCount = true;
 285     simvec.binCount = m_channels;
 286     simvec.hasKnownExtents = false;
 287     simvec.isQuantized = false;
 288     simvec.sampleType = OutputDescriptor::FixedSampleRate;
 289     simvec.sampleRate = 1;
 290
 291     m_distanceVectorOutput = list.size();
 292     list.push_back(simvec);
 293
 294     OutputDescriptor sortvec;
 295     sortvec.identifier = "sorteddistancevector";
 296     sortvec.name = "Ordered Distances from First Channel";
 297     sortvec.description = "Vector of the order of other channels in similarity to the first, followed by distance vector for similarity of each to the first.  Smaller = more similar.";
 298     sortvec.unit = "";
 299     sortvec.hasFixedBinCount = true;
 300     sortvec.binCount = m_channels;
 301     sortvec.hasKnownExtents = false;
 302     sortvec.isQuantized = false;
 303     sortvec.sampleType = OutputDescriptor::FixedSampleRate;
 304     sortvec.sampleRate = 1;
 305
 306     m_sortedVectorOutput = list.size();
 307     list.push_back(sortvec);
 308
 309     OutputDescriptor means;
 310     means.identifier = "means";
 311     means.name = "Feature Means";
 312     means.description = "Means of the feature bins.  Feature time (sec) corresponds to input channel.  Number of bins depends on selected feature type.";
 313     means.unit = "";
 314     means.hasFixedBinCount = true;
 315     means.binCount = m_featureColumnSize;
 316     means.hasKnownExtents = false;
 317     means.isQuantized = false;
 318     means.sampleType = OutputDescriptor::FixedSampleRate;
 319     means.sampleRate = 1;
 320
 321     m_meansOutput = list.size();
 322     list.push_back(means);
 323
 324     OutputDescriptor variances;
 325     variances.identifier = "variances";
 326     variances.name = "Feature Variances";
 327     variances.description = "Variances of the feature bins.  Feature time (sec) corresponds to input channel.  Number of bins depends on selected feature type.";
 328     variances.unit = "";
 329     variances.hasFixedBinCount = true;
 330     variances.binCount = m_featureColumnSize;
 331     variances.hasKnownExtents = false;
 332     variances.isQuantized = false;
 333     variances.sampleType = OutputDescriptor::FixedSampleRate;
 334     variances.sampleRate = 1;
 335
 336     m_variancesOutput = list.size();
 337     list.push_back(variances);
 338
 339     OutputDescriptor beatspectrum;
 340     beatspectrum.identifier = "beatspectrum";
 341     beatspectrum.name = "Beat Spectra";
 342     beatspectrum.description = "Rhythmic self-similarity vectors (beat spectra) for the input channels.  Feature time (sec) corresponds to input channel.  Not returned if rhythm weighting is zero.";
 343     beatspectrum.unit = "";
 344     if (m_rhythmClipFrames > 0) {
 345         beatspectrum.hasFixedBinCount = true;
 346         beatspectrum.binCount = m_rhythmClipFrames / 2;
 347     } else {
 348         beatspectrum.hasFixedBinCount = false;
 349     }
 350     beatspectrum.hasKnownExtents = false;
 351     beatspectrum.isQuantized = false;
 352     beatspectrum.sampleType = OutputDescriptor::FixedSampleRate;
 353     beatspectrum.sampleRate = 1;
 354
 355     m_beatSpectraOutput = list.size();
 356     list.push_back(beatspectrum);
 357
 358     return list;
 359 }
 360
 361 bool
 362 SimilarityPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize)
 363 {
 364     if (channels < getMinChannelCount()) return false;
 365
 366     // Using more than getMaxChannelCount is not actually a problem
 367     // for us.  Using "incorrect" step and block sizes would be fine
 368     // for timbral or chroma similarity, but will break rhythmic
 369     // similarity, so we'd better enforce these.
 370
 371     if (stepSize != getPreferredStepSize()) {
 372         std::cerr << "SimilarityPlugin::initialise: supplied step size "
 373                   << stepSize << " differs from required step size "
 374                   << getPreferredStepSize() << std::endl;
 375         return false;
 376     }
 377
 378     if (blockSize != getPreferredBlockSize()) {
 379         std::cerr << "SimilarityPlugin::initialise: supplied block size "
 380                   << blockSize << " differs from required block size "
 381                   << getPreferredBlockSize() << std::endl;
 382         return false;
 383     }
 384
 385     m_blockSize = blockSize;
 386     m_channels = channels;
 387
 388     m_lastNonEmptyFrame = std::vector<int>(m_channels);
 389     for (int i = 0; i < m_channels; ++i) m_lastNonEmptyFrame[i] = -1;
 390
 391     m_emptyFrameCount = std::vector<int>(m_channels);
 392     for (int i = 0; i < m_channels; ++i) m_emptyFrameCount[i] = 0;
 393
 394     m_frameNo = 0;
 395
 396     int decimationFactor = getDecimationFactor();
 397     if (decimationFactor > 1) {
 398         m_decimator = new Decimator(m_blockSize, decimationFactor);
 399     }
 400
 401     if (m_type == TypeMFCC) {
 402
 403         m_featureColumnSize = 20;
 404
 405         MFCCConfig config(m_processRate);
 406         config.fftsize = 2048;
 407         config.nceps = m_featureColumnSize - 1;
 408         config.want_c0 = true;
 409         config.logpower = 1;
 410         m_mfcc = new MFCC(config);
 411         m_fftSize = m_mfcc->getfftlength();
 412         m_rhythmClipFrameSize = m_fftSize / 4;
 413
 414 //        std::cerr << "MFCC FS = " << config.FS << ", FFT size = " << m_fftSize<< std::endl;
 415
 416     } else if (m_type == TypeChroma) {
 417
 418         m_featureColumnSize = 12;
 419
 420         // For simplicity, aim to have the chroma fft size equal to
 421         // 2048, the same as the mfcc fft size (so the input block
 422         // size does not depend on the feature type and we can use the
 423         // same processing parameters for rhythm etc).  This is also
 424         // why getPreferredBlockSize can confidently return 2048 * the
 425         // decimation factor.
 426
 427         // The fft size for a chromagram is the filterbank Q value
 428         // times the sample rate, divided by the minimum frequency,
 429         // rounded up to the nearest power of two.
 430
 431         double q = 1.0 / (pow(2.0, (1.0 / 12.0)) - 1.0);
 432         double fmin = (q * m_processRate) / 2048.0;
 433
 434         // Round fmin up to the nearest MIDI pitch multiple of 12.
 435         // So long as fmin is greater than 12 to start with, this
 436         // should not change the resulting fft size.
 437
 438         int pmin = Pitch::getPitchForFrequency(float(fmin));
 439         pmin = ((pmin / 12) + 1) * 12;
 440         fmin = Pitch::getFrequencyForPitch(pmin);
 441
 442         float fmax = Pitch::getFrequencyForPitch(pmin + 36);
 443
 444         ChromaConfig config;
 445         config.FS = m_processRate;
 446         config.min = fmin;
 447         config.max = fmax;
 448         config.BPO = 12;
 449         config.CQThresh = 0.0054;
 450         // We don't normalise the chromagram's columns individually;
 451         // we normalise the mean at the end instead
 452         config.normalise = MathUtilities::NormaliseNone;
 453         m_chromagram = new Chromagram(config);
 454         m_fftSize = m_chromagram->getFrameSize();
 455
 456         if (m_fftSize != 2048) {
 457             std::cerr << "WARNING: SimilarityPlugin::initialise: Internal processing FFT size " << m_fftSize << " != expected size 2048 in chroma mode" << std::endl;
 458         }
 459
 460 //        std::cerr << "fftsize = " << m_fftSize << std::endl;
 461
 462         m_rhythmClipFrameSize = m_fftSize / 4;
 463
 464 //        std::cerr << "m_rhythmClipFrameSize = " << m_rhythmClipFrameSize << std::endl;
 465 //        std::cerr << "min = "<< config.min << ", max = " << config.max << std::endl;
 466
 467     } else {
 468
 469         std::cerr << "SimilarityPlugin::initialise: internal error: unknown type " << m_type << std::endl;
 470         return false;
 471     }
 472
 473     if (needRhythm()) {
 474         m_rhythmClipFrames =
 475             int(ceil((m_rhythmClipDuration * m_processRate)
 476                      / m_rhythmClipFrameSize));
 477 //        std::cerr << "SimilarityPlugin::initialise: rhythm clip requires "
 478 //                  << m_rhythmClipFrames << " frames of size "
 479 //                  << m_rhythmClipFrameSize << " at process rate "
 480 //                  << m_processRate << " ( = "
 481 //                  << (float(m_rhythmClipFrames * m_rhythmClipFrameSize) / m_processRate) << " sec )"
 482 //                  << std::endl;
 483
 484         MFCCConfig config(m_processRate);
 485         config.fftsize = m_rhythmClipFrameSize;
 486         config.nceps = m_rhythmColumnSize - 1;
 487         config.want_c0 = true;
 488         config.logpower = 1;
 489         config.window = RectangularWindow; // because no overlap
 490         m_rhythmfcc = new MFCC(config);
 491     }
 492
 493     for (int i = 0; i < m_channels; ++i) {
 494
 495         m_values.push_back(FeatureMatrix());
 496
 497         if (needRhythm()) {
 498             m_rhythmValues.push_back(FeatureColumnQueue());
 499         }
 500     }
 501
 502     m_done = false;
 503
 504     return true;
 505 }
 506
 507 void
 508 SimilarityPlugin::reset()
 509 {
 510     for (int i = 0; i < int(m_values.size()); ++i) {
 511         m_values[i].clear();
 512     }
 513
 514     for (int i = 0; i < int(m_rhythmValues.size()); ++i) {
 515         m_rhythmValues[i].clear();
 516     }
 517
 518     for (int i = 0; i < int(m_lastNonEmptyFrame.size()); ++i) {
 519         m_lastNonEmptyFrame[i] = -1;
 520     }
 521
 522     for (int i = 0; i < int(m_emptyFrameCount.size()); ++i) {
 523         m_emptyFrameCount[i] = 0;
 524     }
 525
 526     m_done = false;
 527 }
 528
 529 SimilarityPlugin::FeatureSet
 530 SimilarityPlugin::process(const float *const *inputBuffers, Vamp::RealTime /* timestamp */)
 531 {
 532     if (m_done) {
 533         return FeatureSet();
 534     }
 535
 536     double *dblbuf = new double[m_blockSize];
 537     double *decbuf = dblbuf;
 538     if (m_decimator) decbuf = new double[m_fftSize];
 539
 540     double *raw = new double[std::max(m_featureColumnSize,
 541                                       m_rhythmColumnSize)];
 542
 543     float threshold = 1e-10;
 544
 545     bool someRhythmFrameNeeded = false;
 546
 547     for (int c = 0; c < m_channels; ++c) {
 548
 549         bool empty = true;
 550
 551         for (int i = 0; i < m_blockSize; ++i) {
 552             float val = inputBuffers[c][i];
 553             if (fabs(val) > threshold) empty = false;
 554             dblbuf[i] = val;
 555         }
 556
 557         if (empty) {
 558             if (needRhythm() && ((m_frameNo % 2) == 0)) {
 559                 for (int i = 0; i < m_fftSize / m_rhythmClipFrameSize; ++i) {
 560                     if (int(m_rhythmValues[c].size()) < m_rhythmClipFrames) {
 561                         FeatureColumn mf(m_rhythmColumnSize);
 562                         for (int i = 0; i < m_rhythmColumnSize; ++i) {
 563                             mf[i] = 0.0;
 564                         }
 565                         m_rhythmValues[c].push_back(mf);
 566                     }
 567                 }
 568             }
 569             m_emptyFrameCount[c]++;
 570             continue;
 571         }
 572
 573         m_lastNonEmptyFrame[c] = m_frameNo;
 574
 575         if (m_decimator) {
 576             m_decimator->process(dblbuf, decbuf);
 577         }
 578
 579         if (needTimbre()) {
 580
 581             FeatureColumn mf(m_featureColumnSize);
 582
 583             if (m_type == TypeMFCC) {
 584                 m_mfcc->process(decbuf, raw);
 585                 for (int i = 0; i < m_featureColumnSize; ++i) {
 586                     mf[i] = raw[i];
 587                 }
 588             } else if (m_type == TypeChroma) {
 589                 double *chroma = m_chromagram->process(decbuf);
 590                 for (int i = 0; i < m_featureColumnSize; ++i) {
 591                     mf[i] = chroma[i];
 592                 }
 593             }
 594
 595             m_values[c].push_back(mf);
 596         }
 597
 598 //        std::cerr << "needRhythm = " << needRhythm() << ", frame = " << m_frameNo << std::endl;
 599
 600         if (needRhythm() && ((m_frameNo % 2) == 0)) {
 601
 602             // The incoming frames are overlapping; we only use every
 603             // other one, because we don't want the overlap (it would
 604             // screw up the rhythm)
 605
 606             int frameOffset = 0;
 607
 608             while (frameOffset + m_rhythmClipFrameSize <= m_fftSize) {
 609
 610                 bool needRhythmFrame = true;
 611
 612                 if (int(m_rhythmValues[c].size()) >= m_rhythmClipFrames) {
 613
 614                     needRhythmFrame = false;
 615
 616                     // assumes hopsize = framesize/2
 617                     float current = m_frameNo * (m_fftSize/2) + frameOffset;
 618                     current = current / m_processRate;
 619                     if (current - m_rhythmClipDuration < m_rhythmClipOrigin) {
 620                         needRhythmFrame = true;
 621                         m_rhythmValues[c].pop_front();
 622                     }
 623
 624 //                    if (needRhythmFrame) {
 625 //                        std::cerr << "at current = " <<current << " (frame = " << m_frameNo << "), have " << m_rhythmValues[c].size() << ", need rhythm = " << needRhythmFrame << std::endl;
 626 //                    }
 627
 628                 }
 629
 630                 if (needRhythmFrame) {
 631
 632                     someRhythmFrameNeeded = true;
 633
 634                     m_rhythmfcc->process(decbuf + frameOffset, raw);
 635
 636                     FeatureColumn mf(m_rhythmColumnSize);
 637                     for (int i = 0; i < m_rhythmColumnSize; ++i) {
 638                         mf[i] = raw[i];
 639                     }
 640
 641                     m_rhythmValues[c].push_back(mf);
 642                 }
 643
 644                 frameOffset += m_rhythmClipFrameSize;
 645             }
 646         }
 647     }
 648
 649     if (!needTimbre() && !someRhythmFrameNeeded && ((m_frameNo % 2) == 0)) {
 650 //        std::cerr << "done!" << std::endl;
 651         m_done = true;
 652     }
 653
 654     if (m_decimator) delete[] decbuf;
 655     delete[] dblbuf;
 656     delete[] raw;
 657
 658     ++m_frameNo;
 659
 660     return FeatureSet();
 661 }
 662
 663 SimilarityPlugin::FeatureMatrix
 664 SimilarityPlugin::calculateTimbral(FeatureSet &returnFeatures)
 665 {
 666     FeatureMatrix m(m_channels); // means
 667     FeatureMatrix v(m_channels); // variances
 668
 669     for (int i = 0; i < m_channels; ++i) {
 670
 671         FeatureColumn mean(m_featureColumnSize), variance(m_featureColumnSize);
 672
 673         for (int j = 0; j < m_featureColumnSize; ++j) {
 674
 675             mean[j] = 0.0;
 676             variance[j] = 0.0;
 677             int count;
 678
 679             // We want to take values up to, but not including, the
 680             // last non-empty frame (which may be partial)
 681
 682             int sz = m_lastNonEmptyFrame[i] - m_emptyFrameCount[i];
 683             if (sz < 0) sz = 0;
 684             if (sz >= int(m_values[i].size())) {
 685                 sz = int(m_values[i].size())-1;
 686             }
 687
 688             count = 0;
 689             for (int k = 0; k < sz; ++k) {
 690                 double val = m_values[i][k][j];
 691                 if (ISNAN(val) || ISINF(val)) continue;
 692                 mean[j] += val;
 693                 ++count;
 694             }
 695             if (count > 0) mean[j] /= count;
 696
 697             count = 0;
 698             for (int k = 0; k < sz; ++k) {
 699                 double val = ((m_values[i][k][j] - mean[j]) *
 700                               (m_values[i][k][j] - mean[j]));
 701                 if (ISNAN(val) || ISINF(val)) continue;
 702                 variance[j] += val;
 703                 ++count;
 704             }
 705             if (count > 0) variance[j] /= count;
 706         }
 707
 708         m[i] = mean;
 709         v[i] = variance;
 710     }
 711
 712     FeatureMatrix distances(m_channels);
 713
 714     if (m_type == TypeMFCC) {
 715
 716         // "Despite the fact that MFCCs extracted from music are
 717         // clearly not Gaussian, [14] showed, somewhat surprisingly,
 718         // that a similarity function comparing single Gaussians
 719         // modelling MFCCs for each track can perform as well as
 720         // mixture models.  A great advantage of using single
 721         // Gaussians is that a simple closed form exists for the KL
 722         // divergence." -- Mark Levy, "Lightweight measures for
 723         // timbral similarity of musical audio"
 724         // (http://www.elec.qmul.ac.uk/easaier/papers/mlevytimbralsimilarity.pdf)
 725
 726         KLDivergence kld;
 727
 728         for (int i = 0; i < m_channels; ++i) {
 729             for (int j = 0; j < m_channels; ++j) {
 730                 double d = kld.distanceGaussian(m[i], v[i], m[j], v[j]);
 731                 distances[i].push_back(d);
 732             }
 733         }
 734
 735     } else {
 736
 737         // We use the KL divergence for distributions of discrete
 738         // variables, as chroma are histograms already.  Or at least,
 739         // they will be when we've normalised them like this:
 740         for (int i = 0; i < m_channels; ++i) {
 741             MathUtilities::normalise(m[i], MathUtilities::NormaliseUnitSum);
 742         }
 743
 744         KLDivergence kld;
 745
 746         for (int i = 0; i < m_channels; ++i) {
 747             for (int j = 0; j < m_channels; ++j) {
 748                 double d = kld.distanceDistribution(m[i], m[j], true);
 749                 distances[i].push_back(d);
 750             }
 751         }
 752     }
 753
 754     Feature feature;
 755     feature.hasTimestamp = true;
 756
 757     char labelBuffer[100];
 758
 759     for (int i = 0; i < m_channels; ++i) {
 760
 761         feature.timestamp = Vamp::RealTime(i, 0);
 762
 763         sprintf(labelBuffer, "Means for channel %d", i+1);
 764         feature.label = labelBuffer;
 765
 766         feature.values.clear();
 767         for (int k = 0; k < m_featureColumnSize; ++k) {
 768             feature.values.push_back(m[i][k]);
 769         }
 770
 771         returnFeatures[m_meansOutput].push_back(feature);
 772
 773         sprintf(labelBuffer, "Variances for channel %d", i+1);
 774         feature.label = labelBuffer;
 775
 776         feature.values.clear();
 777         for (int k = 0; k < m_featureColumnSize; ++k) {
 778             feature.values.push_back(v[i][k]);
 779         }
 780
 781         returnFeatures[m_variancesOutput].push_back(feature);
 782     }
 783
 784     return distances;
 785 }
 786
 787 SimilarityPlugin::FeatureMatrix
 788 SimilarityPlugin::calculateRhythmic(FeatureSet &returnFeatures)
 789 {
 790     if (!needRhythm()) return FeatureMatrix();
 791
 792 //        std::cerr << "SimilarityPlugin::initialise: rhythm clip for channel 0 contains "
 793 //                  << m_rhythmValues[0].size() << " frames of size "
 794 //                  << m_rhythmClipFrameSize << " at process rate "
 795 //                  << m_processRate << " ( = "
 796 //                  << (float(m_rhythmValues[0].size() * m_rhythmClipFrameSize) / m_processRate) << " sec )"
 797 //                  << std::endl;
 798
 799     BeatSpectrum bscalc;
 800     CosineDistance cd;
 801
 802     // Our rhythm feature matrix is a deque of vectors for practical
 803     // reasons, but BeatSpectrum::process wants a vector of vectors
 804     // (which is what FeatureMatrix happens to be).
 805
 806     FeatureMatrixSet bsinput(m_channels);
 807     for (int i = 0; i < m_channels; ++i) {
 808         for (int j = 0; j < int(m_rhythmValues[i].size()); ++j) {
 809             bsinput[i].push_back(m_rhythmValues[i][j]);
 810         }
 811     }
 812
 813     FeatureMatrix bs(m_channels);
 814     for (int i = 0; i < m_channels; ++i) {
 815         bs[i] = bscalc.process(bsinput[i]);
 816     }
 817
 818     FeatureMatrix distances(m_channels);
 819     for (int i = 0; i < m_channels; ++i) {
 820         for (int j = 0; j < m_channels; ++j) {
 821             double d = cd.distance(bs[i], bs[j]);
 822             distances[i].push_back(d);
 823         }
 824     }
 825
 826     Feature feature;
 827     feature.hasTimestamp = true;
 828
 829     char labelBuffer[100];
 830
 831     for (int i = 0; i < m_channels; ++i) {
 832
 833         feature.timestamp = Vamp::RealTime(i, 0);
 834
 835         sprintf(labelBuffer, "Beat spectrum for channel %d", i+1);
 836         feature.label = labelBuffer;
 837
 838         feature.values.clear();
 839         for (int j = 0; j < int(bs[i].size()); ++j) {
 840             feature.values.push_back(bs[i][j]);
 841         }
 842
 843         returnFeatures[m_beatSpectraOutput].push_back(feature);
 844     }
 845
 846     return distances;
 847 }
 848
 849 double
 850 SimilarityPlugin::getDistance(const FeatureMatrix &timbral,
 851                               const FeatureMatrix &rhythmic,
 852                               int i, int j)
 853 {
 854     double distance = 1.0;
 855     if (needTimbre()) distance *= timbral[i][j];
 856     if (needRhythm()) distance *= rhythmic[i][j];
 857     return distance;
 858 }
 859
 860 SimilarityPlugin::FeatureSet
 861 SimilarityPlugin::getRemainingFeatures()
 862 {
 863     FeatureSet returnFeatures;
 864
 865     // We want to return a matrix of the distances between channels,
 866     // but Vamp doesn't have a matrix return type so we will actually
 867     // return a series of vectors
 868
 869     FeatureMatrix timbralDistances, rhythmicDistances;
 870
 871     if (needTimbre()) {
 872         timbralDistances = calculateTimbral(returnFeatures);
 873     }
 874
 875     if (needRhythm()) {
 876         rhythmicDistances = calculateRhythmic(returnFeatures);
 877     }
 878
 879     // We give all features a timestamp, otherwise hosts will tend to
 880     // stamp them at the end of the file, which is annoying
 881
 882     Feature feature;
 883     feature.hasTimestamp = true;
 884
 885     Feature distanceVectorFeature;
 886     distanceVectorFeature.label = "Distance from first channel";
 887     distanceVectorFeature.hasTimestamp = true;
 888     distanceVectorFeature.timestamp = Vamp::RealTime::zeroTime;
 889
 890     std::map<double, int> sorted;
 891
 892     char labelBuffer[100];
 893
 894     for (int i = 0; i < m_channels; ++i) {
 895
 896         feature.timestamp = Vamp::RealTime(i, 0);
 897
 898         feature.values.clear();
 899         for (int j = 0; j < m_channels; ++j) {
 900             double dist = getDistance(timbralDistances, rhythmicDistances, i, j);
 901             feature.values.push_back(dist);
 902         }
 903
 904         sprintf(labelBuffer, "Distances from channel %d", i+1);
 905         feature.label = labelBuffer;
 906
 907         returnFeatures[m_distanceMatrixOutput].push_back(feature);
 908
 909         double fromFirst =
 910             getDistance(timbralDistances, rhythmicDistances, 0, i);
 911
 912         distanceVectorFeature.values.push_back(fromFirst);
 913         sorted[fromFirst] = i;
 914     }
 915
 916     returnFeatures[m_distanceVectorOutput].push_back(distanceVectorFeature);
 917
 918     feature.label = "Order of channels by similarity to first channel";
 919     feature.values.clear();
 920     feature.timestamp = Vamp::RealTime(0, 0);
 921
 922     for (std::map<double, int>::iterator i = sorted.begin();
 923          i != sorted.end(); ++i) {
 924         feature.values.push_back(i->second + 1);
 925     }
 926
 927     returnFeatures[m_sortedVectorOutput].push_back(feature);
 928
 929     feature.label = "Ordered distances of channels from first channel";
 930     feature.values.clear();
 931     feature.timestamp = Vamp::RealTime(1, 0);
 932
 933     for (std::map<double, int>::iterator i = sorted.begin();
 934          i != sorted.end(); ++i) {
 935         feature.values.push_back(i->first);
 936     }
 937
 938     returnFeatures[m_sortedVectorOutput].push_back(feature);
 939
 940     return returnFeatures;
 941 }