Skip to content

MFCC and Mel Spectrogram

Tim Sharii edited this page Dec 27, 2021 · 8 revisions

MFCC

NWaves and librosa / PyTorch (default, HTK filterbank)

Scheme:

  1. Spectrum: power spectrum (default) / magnitude spectrum
  2. Mel Filterbank: HTK-like (default in PyTorch)
  3. Nonlinearity: 10*Log10()
  4. DCT: 2N (default)

PyTorch:

n_fft = 1024
win_length = None
hop_length = 512
n_mels = 24
n_mfcc = 13

mfcc_transform = T.MFCC(
    sample_rate=sample_rate,
    n_mfcc=n_mfcc,
    melkwargs={
      'n_fft': n_fft,
      'n_mels': n_mels,
      'hop_length': hop_length,
      'center': False
    }
)

mfcc = mfcc_transform(signal)

librosa:

# unfortunately, you'll need to call two methods separately
# (because of two different meanings of the 'norm' parameter)

melspec = librosa.feature.melspectrogram(
  y=waveform.numpy()[0], sr=sample_rate, n_fft=n_fft,
  win_length=win_length, hop_length=hop_length,
  n_mels=n_mels, htk=True, center=False, norm=None)

mfcc_librosa = librosa.feature.mfcc(
  S=librosa.core.spectrum.power_to_db(melspec),
  n_mfcc=n_mfcc, dct_type=2, norm='ortho')

NWaves:

var fftSize = 1024;
var melCount = 24;

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 13,
    FrameSize = fftSize,
    HopSize = 512,
    FilterBankSize = melCount,
    SpectrumType = SpectrumType.Power,
    NonLinearity = NonLinearityType.ToDecibel,
    DctType = "2N",
    FftSize = fftSize,
    Window = WindowType.Hann,
    LogFloor = 1e-10f
};

var extractor = new MfccExtractor(mfccOptions);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);

If your calculations are based on method librosa.feature.mfcc like this:

librosa.feature.mfcc(
    signal,
    sample_rate,
    n_mfcc=13,
    dct_type=2,
    norm='ortho',
    window='hann',
    htk=True,
    n_mels=24,
    n_fft=1024,
    hop_length=512,
    center=False)

and you want a full compliance with it, you can specifiy the following settings in NWaves:

var fftSize = 1024;
var melCount = 24;

var freqs = FilterBanks.MelBands(melCount, samplingRate);
var fbank = FilterBanks.Triangular(fftSize, samplingRate, freqs);
FilterBanks.Normalize(melCount, freqs, fbank);

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 13,
    FrameSize = fftSize,
    HopSize = 512,
    FilterBank = fbank,
    SpectrumType = SpectrumType.Power,
    NonLinearity = NonLinearityType.ToDecibel,
    DctType = "2N",
    FftSize = fftSize,
    Window = WindowType.Hann,
    LogFloor = 1e-10f
};

var extractor = new MfccExtractor(mfccOptions);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);

NWaves and librosa (default, Slaney mel-filterbank) / PyTorch

Scheme:

  1. Spectrum: power spectrum (default) / magnitude spectrum
  2. Mel Filterbank: Slaney (default in librosa)
  3. Nonlinearity: 10*Log10()
  4. DCT: 2N (default)

librosa (htk=False, i.e. Slaney mel-filterbank):

librosa.feature.mfcc(
    signal,
    sample_rate,
    n_mfcc=13,
    dct_type=2,
    norm='ortho',
    window='hann',
    htk=False,
    n_mels=40,
    n_fft=1024,
    hop_length=512,
    center=False)

PyTorch:

n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40
n_mfcc = 13

mfcc_transform = T.MFCC(
    sample_rate=sample_rate,
    n_mfcc=n_mfcc,
    melkwargs={
      'n_fft': n_fft,
      'n_mels': n_mels,
      'hop_length': hop_length,
      'mel_scale': 'slaney',
      'norm': 'slaney',
      'center': False
    }
)

mfcc = mfcc_transform(signal)

NWaves:

var fftSize = 1024;
var melCount = 40;

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 13,
    FrameSize = fftSize,
    HopSize = 512,
    FilterBank = FilterBanks.MelBankSlaney(melCount, fftSize, samplingRate),
    SpectrumType = SpectrumType.Power,
    NonLinearity = NonLinearityType.ToDecibel,
    DctType = "2N",
    FftSize = fftSize,
    Window = WindowType.Hann,
    LogFloor = 1e-10f
};

var extractor = new MfccExtractor(mfccOptions);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);

NWaves and Kaldi / PyTorch.kaldi.compliance

Scheme:

  1. Spectrum: power spectrum
  2. Mel Filterbank: HTK-like
  3. Nonlinearity: Ln()
  4. DCT: 2N
  5. and a lot of other parameters
# some parameters are omitted
# since in most cases their default values are used

torchaudio.compliance.kaldi.mfcc(
    signal,
    sample_frequency, 
    num_ceps = 13,
    num_mel_bins = 23,
    frame_length = 25.0,
    frame_shift = 10.0,
    low_freq = 20.0,
    high_freq = 0.0,
    window_type = 'hanning',
    cepstral_lifter = 22.0,
    preemphasis_coefficient = 0.97,
    remove_dc_offset = False,
    use_energy = False
)

Let's consider two cases separately:

  • remove_dc_offset=False
  • remove_dc_offset=True

In the former case there's no HTK-specific preprocessing of frames, so NWaves settings are simple:

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 13,
    FrameDuration = 0.025,
    HopDuration = 0.01,
    FilterBankSize = 23,
    LowFrequency = 20,
    NonLinearity = NonLinearityType.LogE,
    DctType = "2N",
    Window = WindowType.Hann,
    PreEmphasis = 0.97,
    LifterSize = 22
};

var extractor = new MfccExtractor(mfccOptions);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);

In the latter case things get trickier. Use the following subclass of MfccExtractor:

    class MfccExtractorKaldi : MfccExtractor
    {
        private readonly new float[] _windowSamples;
        private readonly float _pre;

        public MfccExtractorKaldi(MfccOptions options, WindowType window) : base(options)
        {
            _windowSamples = Window.OfType(window, FrameSize);
            _preEmphasis = 0;
            _pre = (float)options.PreEmphasis;
        }

        public override void ProcessFrame(float[] block, float[] features)
        {
            var mean = block.Take(FrameSize).Average();

            for (var i = 0; i < FrameSize; i++)
            {
                block[i] -= mean;
            }

            for (var k = FrameSize - 1; k >= 1; k--)
            {
                block[k] -= block[k - 1] * _pre;
            }
            block[0] *= 1 - _pre;

            block.ApplyWindow(_windowSamples);

            base.ProcessFrame(block, features);
        }
    }

Pay attention to the constructor: its second parameter is the window type. In MFCC options the rectangular window must be specified:

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 13,
    FrameDuration = 0.025,
    HopDuration = 0.01,
    FilterBankSize = 23,
    LowFrequency = 20,
    NonLinearity = NonLinearityType.LogE,
    DctType = "2N",
    Window = WindowType.Rectangular, // !
    PreEmphasis = 0.97,
    LifterSize = 22
};

var extractor = new MfccExtractorKaldi(mfccOptions, WindowType.Hann);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);

NWaves and python_speech_features

Scheme:

  1. Spectrum: power spectrum (normalized)
  2. Mel Filterbank: essentially HTK-like (but slightly different)
  3. Nonlinearity: Ln()
  4. DCT: 2N
# s = s / 32768   # if samples are not in range [-1, 1]
                  # (or don't normalize signal in NWaves)

mfcc(
    signal,
    samplerate=16000,
    winlen=0.025,
    winstep=0.01,
    numcep=16,
    nfilt=26,
    nfft=512,
    lowfreq=0,
    highfreq=None,
    preemph=0.97,
    ceplifter=22,
    appendEnergy=False,
    winfunc=numpy.hanning
)

NWaves:

var samplingRate = 16000;
var fftSize = 512;
var melCount = 26;

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 16,
    FrameDuration = 0.025,
    HopDuration = 0.01,
    FilterBankSize = melCount,
    SpectrumType = SpectrumType.PowerNormalized,
    NonLinearity = NonLinearityType.LogE,
    DctType = "2N",
    FftSize = fftSize,
    PreEmphasis = 0.97,
    LifterSize = 22,
    IncludeEnergy = false,
    Window = WindowType.Hann,
};

For complete compliance with python_speech_features:

  1. Correction for the first coefficient:
  • if appendEnergy=False (IncludeEnergy = false)
// call this on already computed mfccVectors:

for (var i = 0; i < mfccVectors.Count; i++)
{
    mfccVectors[i][0] -= (float)(Math.Log(2) * Math.Sqrt(melCount));
}
  • if appendEnergy=True (IncludeEnergy = true)
// call this on already computed mfccVectors:

for (var i = 0; i < mfccVectors.Count; i++)
{
    mfccVectors[i][0] -= (float)Math.Log(2);
}
  1. Set the filterbank explicitly (instead of FilterBankSize = melCount)

FilterBank = PsfFilterbank(samplingRate, melCount, fftSize).

The code of PsfFilterbank function:

/// <summary>
/// Generates filterbank with weights identical to python_speech_features.
/// </summary>
float[][] PsfFilterbank(int samplingRate, int filterbankSize, int fftSize, double lowFreq = 0, double highFreq = 0)
{
    var filterbank = new float[filterbankSize][];

    if (highFreq <= lowFreq)
    {
        highFreq = samplingRate / 2;
    }

    var low = NWaves.Utils.Scale.HerzToMel(lowFreq);
    var high = NWaves.Utils.Scale.HerzToMel(highFreq);

    var res = (fftSize + 1) / (float)samplingRate;

    var bins = Enumerable
                  .Range(0, filterbankSize + 2)
                  .Select(i => (float)Math.Floor(res * NWaves.Utils.Scale.MelToHerz(low + i * (high - low) / (filterbankSize + 1))))
                  .ToArray();

    for (var i = 0; i < filterbankSize; i++)
    {
        filterbank[i] = new float[fftSize / 2 + 1];

        for (var j = (int)bins[i]; j < (int)bins[i + 1]; j++)
        {
            filterbank[i][j] = (j - bins[i]) / (bins[i + 1] - bins[i]);
        }
        for (var j = (int)bins[i + 1]; j < (int)bins[i + 2]; j++)
        {
            filterbank[i][j] = (bins[i + 2] - j) / (bins[i + 2] - bins[i + 1]);
        }
    }

    return filterbank;
}

Mel-Spectrogram

pyTorch:

import torchaudio.transforms as T

n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40

mel_spectrogram = T.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    power=2.0,
    n_mels=n_mels,
    center=False,
    norm=None,        # or 'slaney' if in librosa norm='slaney'
    mel_scale='htk'   # or 'slaney' if in librosa htk=False
)

melspec = mel_spectrogram(waveform)

librosa:

n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40

melspec_librosa = librosa.feature.melspectrogram(
    signal,
    sr=sample_rate,
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    center=False,
    power=2.0,
    n_mels=n_mels,
    norm=None,
    htk=True,
)

NWaves:

var fftSize = 1024;
var hopSize = 512;
var melCount = 40;

var mfccExtractor = new FilterbankExtractor(
   new FilterbankOptions
   {
       SamplingRate = samplingRate,
       FrameSize = fftSize,
       FftSize = fftSize,
       HopSize = hopSize,
       Window = WindowType.Hann,
       FilterBank = FilterBanks.Triangular(fftSize, samplingRate, FilterBanks.MelBands(melCount, samplingRate)),

       // if power = 1.0
       // SpectrumType = SpectrumType.Magnitude
   });

Frame centering in librosa

Where possible, just set center=False in librosa.

In librosa center=True means that the input signal is padded so that frame t is centered at y[t * hop_length]. Hence, this parameter does not affect MFCC algorithm per se, it simply changes the offset of frames. The simplest solution in NWaves is to set the signal offset for computations and skip first MFCC vectors in librosa (computed from padded part of the signal). The offset can be calculated as follows:

var offset = hopSize - (fftSize2 / 2) % hopSize;

var mfccVectors = extractor.ComputeFrom(signal, offset, signal.Length);
// var mfccVectors = extractor.ParallelComputeFrom(signal, offset, signal.Length);

// for 1-to-1 comparison, skip the first skipCount vectors in librosa results:
// var skipCount = (offset + fftSize / 2) / hopSize;

// for example:
// fftSize = 1024
// hopSize = 400

// offset = 288
// skipCount = 2

// i.e. mfccVectors[0] corresponds to librosa_mfcc[:, 2]
//      mfccVectors[1] corresponds to librosa_mfcc[:, 3]
//      etc.