Update audio_processing module

Corresponds to upstream commit 524e9b043e7e86fd72353b987c9d5f6a1ebf83e1

Update notes:

 * Pull in third party license file

 * Replace .gypi files with BUILD.gn to keep track of what changes
   upstream

 * Bunch of new files pulled in as dependencies

 * Won't build yet due to changes needed on top of these
Arun Raghavan
2015-10-13 17:25:22 +05:30
parent 5ae7a5d6cd
commit 753eada3aa
324 changed files with 52533 additions and 16117 deletions

@ -0,0 +1,27 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_COMMON_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_COMMON_H_
static const int kSampleRateHz = 16000;
static const size_t kLength10Ms = kSampleRateHz / 100;
static const size_t kMaxNumFrames = 4;
struct AudioFeatures {
double log_pitch_gain[kMaxNumFrames];
double pitch_lag_hz[kMaxNumFrames];
double spectral_peak[kMaxNumFrames];
double rms[kMaxNumFrames];
size_t num_frames;
bool silence;
};
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_COMMON_H_

@ -0,0 +1,64 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/vad/gmm.h"
#include <math.h>
#include <stdlib.h>
#include "webrtc/typedefs.h"
namespace webrtc {
static const int kMaxDimension = 10;
static void RemoveMean(const double* in,
const double* mean_vec,
int dimension,
double* out) {
for (int n = 0; n < dimension; ++n)
out[n] = in[n] - mean_vec[n];
}
static double ComputeExponent(const double* in,
const double* covar_inv,
int dimension) {
double q = 0;
for (int i = 0; i < dimension; ++i) {
double v = 0;
for (int j = 0; j < dimension; j++)
v += (*covar_inv++) * in[j];
q += v * in[i];
}
q *= -0.5;
return q;
}
double EvaluateGmm(const double* x, const GmmParameters& gmm_parameters) {
if (gmm_parameters.dimension > kMaxDimension) {
return -1; // Not a valid pdf value, so the caller can detect the error.
}
double f = 0;
double v[kMaxDimension];
const double* mean_vec = gmm_parameters.mean;
const double* covar_inv = gmm_parameters.covar_inverse;
for (int n = 0; n < gmm_parameters.num_mixtures; n++) {
RemoveMean(x, mean_vec, gmm_parameters.dimension, v);
double q = ComputeExponent(v, covar_inv, gmm_parameters.dimension) +
gmm_parameters.weight[n];
f += exp(q);
mean_vec += gmm_parameters.dimension;
covar_inv += gmm_parameters.dimension * gmm_parameters.dimension;
}
return f;
}
} // namespace webrtc

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_GMM_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_GMM_H_
namespace webrtc {
// A structure that specifies a GMM.
// A GMM is formulated as
// f(x) = w[0] * mixture[0] + w[1] * mixture[1] + ... +
// w[num_mixtures - 1] * mixture[num_mixtures - 1];
// Where a 'mixture' is a Gaussian density.
struct GmmParameters {
// weight[n] = log(w[n]) - |dimension|/2 * log(2*pi) - 1/2 * log(det(cov[n]));
// where cov[n] is the covariance matrix of mixture n;
const double* weight;
// pointer to the first element of a |num_mixtures|x|dimension| matrix
// where kth row is the mean of the kth mixture.
const double* mean;
// pointer to the first element of a |num_mixtures|x|dimension|x|dimension|
// 3D-matrix, where the kth 2D-matrix is the inverse of the covariance
// matrix of the kth mixture.
const double* covar_inverse;
// Dimensionality of the mixtures.
int dimension;
// number of the mixtures.
int num_mixtures;
};
// Evaluate the given GMM, according to |gmm_parameters|, at the given point
// |x|. If the dimensionality of the given GMM is larger than the maximum
// dimension accepted by this function, -1 is returned.
double EvaluateGmm(const double* x, const GmmParameters& gmm_parameters);
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_GMM_H_
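
The weight[] entries above fold the mixture weight, the (2*pi)^(dimension/2) normalization and the covariance determinant into a single log-domain constant, so EvaluateGmm() only has to add the quadratic form from ComputeExponent() and exponentiate. A minimal sketch of how this interface is used, with a single illustrative 1-D mixture (the numbers are this example's assumptions, not values from the WebRTC tables):

// Minimal sketch: evaluate a single 1-D Gaussian through the GMM interface.
// The numbers below are illustrative only, not taken from the WebRTC tables.
#include <cmath>
#include <cstdio>
#include "webrtc/modules/audio_processing/vad/gmm.h"

int main() {
  const double kPi = 3.14159265358979323846;
  const double kMean[] = {0.0};
  const double kCovarInverse[] = {1.0};  // Variance = 1.
  // weight = log(w) - dim/2 * log(2*pi) - 1/2 * log(det(cov)), with w = 1,
  // dim = 1 and det(cov) = 1.
  const double kWeight[] = {-0.5 * std::log(2.0 * kPi)};
  webrtc::GmmParameters gmm;
  gmm.weight = kWeight;
  gmm.mean = kMean;
  gmm.covar_inverse = kCovarInverse;
  gmm.dimension = 1;
  gmm.num_mixtures = 1;
  const double x[] = {0.0};
  // Expect the standard normal density at 0: 1/sqrt(2*pi), about 0.3989.
  std::printf("pdf(0) = %f\n", webrtc::EvaluateGmm(x, gmm));
  return 0;
}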

@ -0,0 +1,85 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// GMM tables for inactive segments. Generated by MakeGmmTables.m.
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_NOISE_GMM_TABLES_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_NOISE_GMM_TABLES_H_
static const int kNoiseGmmNumMixtures = 12;
static const int kNoiseGmmDim = 3;
static const double
kNoiseGmmCovarInverse[kNoiseGmmNumMixtures][kNoiseGmmDim][kNoiseGmmDim] = {
{{7.36219567592941e+00, 4.83060785179861e-03, 1.23335151497610e-02},
{4.83060785179861e-03, 1.65289507047817e-04, -2.41490588169997e-04},
{1.23335151497610e-02, -2.41490588169997e-04, 6.59472060689382e-03}},
{{8.70265239309140e+00, -5.30636201431086e-04, 5.44014966585347e-03},
{-5.30636201431086e-04, 3.11095453521008e-04, -1.86287206836035e-04},
{5.44014966585347e-03, -1.86287206836035e-04, 6.29493388790744e-04}},
{{4.53467851955055e+00, -3.92977536695197e-03, -2.46521420693317e-03},
{-3.92977536695197e-03, 4.94650752632750e-05, -1.08587438501826e-05},
{-2.46521420693317e-03, -1.08587438501826e-05, 9.28793975422261e-05}},
{{9.26817997114275e-01, -4.03976069276753e-04, -3.56441427392165e-03},
{-4.03976069276753e-04, 2.51976251631430e-06, 1.46914206734572e-07},
{-3.56441427392165e-03, 1.46914206734572e-07, 8.19914567685373e-05}},
{{7.61715986787441e+00, -1.54889041216888e-04, 2.41756280071656e-02},
{-1.54889041216888e-04, 3.50282550461672e-07, -6.27251196972490e-06},
{2.41756280071656e-02, -6.27251196972490e-06, 1.45061847649872e-02}},
{{8.31193642663158e+00, -3.84070508164323e-04, -3.09750630821876e-02},
{-3.84070508164323e-04, 3.80433432277336e-07, -1.14321142836636e-06},
{-3.09750630821876e-02, -1.14321142836636e-06, 8.35091486289997e-04}},
{{9.67283151270894e-01, 5.82465812445039e-05, -3.18350798617053e-03},
{5.82465812445039e-05, 2.23762672000318e-07, -7.74196587408623e-07},
{-3.18350798617053e-03, -7.74196587408623e-07, 3.85120938338325e-04}},
{{8.28066236985388e+00, 5.87634508319763e-05, 6.99303090891743e-03},
{5.87634508319763e-05, 2.93746018618058e-07, 3.40843332882272e-07},
{6.99303090891743e-03, 3.40843332882272e-07, 1.99379171190344e-04}},
{{6.07488998675646e+00, -1.11494526618473e-02, 5.10013111123381e-03},
{-1.11494526618473e-02, 6.99238879921751e-04, 5.36718550370870e-05},
{5.10013111123381e-03, 5.36718550370870e-05, 5.26909853276753e-04}},
{{6.90492021419175e+00, 4.20639355257863e-04, -2.38612752336481e-03},
{4.20639355257863e-04, 3.31246767338153e-06, -2.42052288150859e-08},
{-2.38612752336481e-03, -2.42052288150859e-08, 4.46608368363412e-04}},
{{1.31069150869715e+01, -1.73718583865670e-04, -1.97591814508578e-02},
{-1.73718583865670e-04, 2.80451716300124e-07, 9.96570755379865e-07},
{-1.97591814508578e-02, 9.96570755379865e-07, 2.41361900868847e-03}},
{{4.69566344239814e+00, -2.61077567563690e-04, 5.26359000761433e-03},
{-2.61077567563690e-04, 1.82420859823767e-06, -7.83645887541601e-07},
{5.26359000761433e-03, -7.83645887541601e-07, 1.33586288288802e-02}}};
static const double kNoiseGmmMean[kNoiseGmmNumMixtures][kNoiseGmmDim] = {
{-2.01386094766163e+00, 1.69702162045397e+02, 7.41715804872181e+01},
{-1.94684591777290e+00, 1.42398396732668e+02, 1.64186321157831e+02},
{-2.29319297562437e+00, 3.86415425589868e+02, 2.13452215267125e+02},
{-3.25487177070268e+00, 1.08668712553616e+03, 2.33119949467419e+02},
{-2.13159632447467e+00, 4.83821702557717e+03, 6.86786166673740e+01},
{-2.26171410780526e+00, 4.79420193982422e+03, 1.53222513286450e+02},
{-3.32166740703185e+00, 4.35161135834358e+03, 1.33206448431316e+02},
{-2.19290322814343e+00, 3.98325506609408e+03, 2.13249167359934e+02},
{-2.02898459255404e+00, 7.37039893155007e+03, 1.12518527491926e+02},
{-2.26150236399500e+00, 1.54896745196145e+03, 1.49717357868579e+02},
{-2.00417668301790e+00, 3.82434760310304e+03, 1.07438913004312e+02},
{-2.30193040814533e+00, 1.43953696546439e+03, 7.04085275122649e+01}};
static const double kNoiseGmmWeights[kNoiseGmmNumMixtures] = {
-1.09422832086193e+01,
-1.10847897513425e+01,
-1.36767587732187e+01,
-1.79789356118641e+01,
-1.42830169160894e+01,
-1.56500228061379e+01,
-1.83124990950113e+01,
-1.69979436177477e+01,
-1.12329424387828e+01,
-1.41311785780639e+01,
-1.47171861448585e+01,
-1.35963362781839e+01};
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_NOISE_GMM_TABLES_H_

@ -0,0 +1,124 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
#include <assert.h>
#include <math.h>
#include <string.h>
#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h"
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/modules/audio_processing/vad/noise_gmm_tables.h"
#include "webrtc/modules/audio_processing/vad/voice_gmm_tables.h"
#include "webrtc/modules/interface/module_common_types.h"
namespace webrtc {
static_assert(kNoiseGmmDim == kVoiceGmmDim,
"noise and voice gmm dimension not equal");
// These values should match MATLAB counterparts for unit-tests to pass.
static const int kPosteriorHistorySize = 500; // 5 sec of 10 ms frames.
static const double kInitialPriorProbability = 0.3;
static const int kTransientWidthThreshold = 7;
static const double kLowProbabilityThreshold = 0.2;
static double LimitProbability(double p) {
const double kLimHigh = 0.99;
const double kLimLow = 0.01;
if (p > kLimHigh)
p = kLimHigh;
else if (p < kLimLow)
p = kLimLow;
return p;
}
PitchBasedVad::PitchBasedVad()
: p_prior_(kInitialPriorProbability),
circular_buffer_(VadCircularBuffer::Create(kPosteriorHistorySize)) {
// Setup noise GMM.
noise_gmm_.dimension = kNoiseGmmDim;
noise_gmm_.num_mixtures = kNoiseGmmNumMixtures;
noise_gmm_.weight = kNoiseGmmWeights;
noise_gmm_.mean = &kNoiseGmmMean[0][0];
noise_gmm_.covar_inverse = &kNoiseGmmCovarInverse[0][0][0];
// Setup voice GMM.
voice_gmm_.dimension = kVoiceGmmDim;
voice_gmm_.num_mixtures = kVoiceGmmNumMixtures;
voice_gmm_.weight = kVoiceGmmWeights;
voice_gmm_.mean = &kVoiceGmmMean[0][0];
voice_gmm_.covar_inverse = &kVoiceGmmCovarInverse[0][0][0];
}
PitchBasedVad::~PitchBasedVad() {
}
int PitchBasedVad::VoicingProbability(const AudioFeatures& features,
double* p_combined) {
double p;
double gmm_features[3];
double pdf_features_given_voice;
double pdf_features_given_noise;
// These limits are the same as in the MATLAB implementation 'VoicingProbGMM()'.
const double kLimLowLogPitchGain = -2.0;
const double kLimHighLogPitchGain = -0.9;
const double kLimLowSpectralPeak = 200;
const double kLimHighSpectralPeak = 2000;
const double kEps = 1e-12;
for (size_t n = 0; n < features.num_frames; n++) {
gmm_features[0] = features.log_pitch_gain[n];
gmm_features[1] = features.spectral_peak[n];
gmm_features[2] = features.pitch_lag_hz[n];
pdf_features_given_voice = EvaluateGmm(gmm_features, voice_gmm_);
pdf_features_given_noise = EvaluateGmm(gmm_features, noise_gmm_);
if (features.spectral_peak[n] < kLimLowSpectralPeak ||
features.spectral_peak[n] > kLimHighSpectralPeak ||
features.log_pitch_gain[n] < kLimLowLogPitchGain) {
pdf_features_given_voice = kEps * pdf_features_given_noise;
} else if (features.log_pitch_gain[n] > kLimHighLogPitchGain) {
pdf_features_given_noise = kEps * pdf_features_given_voice;
}
p = p_prior_ * pdf_features_given_voice /
(pdf_features_given_voice * p_prior_ +
pdf_features_given_noise * (1 - p_prior_));
p = LimitProbability(p);
// Combine pitch-based probability with standalone probability, before
// updating prior probabilities.
double prod_active = p * p_combined[n];
double prod_inactive = (1 - p) * (1 - p_combined[n]);
p_combined[n] = prod_active / (prod_active + prod_inactive);
if (UpdatePrior(p_combined[n]) < 0)
return -1;
// Limit prior probability. With a zero prior probability the posterior
// probability is always zero.
p_prior_ = LimitProbability(p_prior_);
}
return 0;
}
int PitchBasedVad::UpdatePrior(double p) {
circular_buffer_->Insert(p);
if (circular_buffer_->RemoveTransient(kTransientWidthThreshold,
kLowProbabilityThreshold) < 0)
return -1;
p_prior_ = circular_buffer_->Mean();
return 0;
}
} // namespace webrtc
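
The core of VoicingProbability() is a Bayes update followed by a combination step: the prior p_prior_ and the two GMM densities give the pitch-based probability, and multiplying the "active" and "inactive" hypotheses merges it with the value already stored in p_combined[n]. A standalone sketch of just these two formulas, using hypothetical densities (not taken from any WebRTC data); note that a neutral standalone probability of 0.5 leaves the pitch-based result unchanged:

// Standalone sketch of the two probability formulas above, with illustrative
// numbers; the densities are hypothetical.
#include <cstdio>

int main() {
  const double p_prior = 0.3;        // Matches kInitialPriorProbability.
  const double pdf_voice = 2.0e-6;   // Hypothetical GMM density given voice.
  const double pdf_noise = 0.5e-6;   // Hypothetical GMM density given noise.
  // Bayes' rule: posterior probability of voice given the pitch features.
  const double p = p_prior * pdf_voice /
                   (pdf_voice * p_prior + pdf_noise * (1 - p_prior));
  // Combine with a neutral standalone probability of 0.5; the combined value
  // equals p, i.e. a neutral standalone VAD has no effect.
  const double p_combined = 0.5;
  const double prod_active = p * p_combined;
  const double prod_inactive = (1 - p) * (1 - p_combined);
  std::printf("p = %f, combined = %f\n", p,
              prod_active / (prod_active + prod_inactive));  // Both ~0.63.
  return 0;
}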

@ -0,0 +1,57 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_BASED_VAD_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_BASED_VAD_H_
#include "webrtc/base/scoped_ptr.h"
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/modules/audio_processing/vad/gmm.h"
#include "webrtc/typedefs.h"
namespace webrtc {
class AudioFrame;
class VadCircularBuffer;
// Computes the probability that the input audio frame is active, given the
// corresponding pitch gain and lag of the frame.
class PitchBasedVad {
public:
PitchBasedVad();
~PitchBasedVad();
// Compute pitch-based voicing probability, given the features.
// features: a structure containing features required for computing voicing
// probabilities.
//
// p_combined: an array which contains the combined activity probabilities
// computed prior to the call of this function. The method then
// computes the voicing probabilities and combines them with the
// given values. The results are returned in |p_combined|.
int VoicingProbability(const AudioFeatures& features, double* p_combined);
private:
int UpdatePrior(double p);
// TODO(turajs): maybe defining this at a higher level (maybe enum) so that
// all the code recognize it as "no-error."
static const int kNoError = 0;
GmmParameters noise_gmm_;
GmmParameters voice_gmm_;
double p_prior_;
rtc::scoped_ptr<VadCircularBuffer> circular_buffer_;
};
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_BASED_VAD_H_

@ -0,0 +1,51 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
#include <cmath>
// A 4-to-3 linear interpolation.
// The interpolation constants are derived as follows:
// Input pitch parameters are updated every 7.5 ms. Within a 30 ms interval
// we are interested in pitch parameters for 0-5 ms, 10-15 ms and 20-25 ms.
// This is like interpolating 4-to-6 and keeping the odd samples.
// The reason behind this is that LPC coefficients are computed for the first
// half of each 10 ms interval.
static void PitchInterpolation(double old_val, const double* in, double* out) {
out[0] = 1. / 6. * old_val + 5. / 6. * in[0];
out[1] = 5. / 6. * in[1] + 1. / 6. * in[2];
out[2] = 0.5 * in[2] + 0.5 * in[3];
}
void GetSubframesPitchParameters(int sampling_rate_hz,
double* gains,
double* lags,
int num_in_frames,
int num_out_frames,
double* log_old_gain,
double* old_lag,
double* log_pitch_gain,
double* pitch_lag_hz) {
// Gain interpolation is in log-domain, also returned in log-domain.
for (int n = 0; n < num_in_frames; n++)
gains[n] = log(gains[n] + 1e-12);
// Interpolate lags and gains.
PitchInterpolation(*log_old_gain, gains, log_pitch_gain);
*log_old_gain = gains[num_in_frames - 1];
PitchInterpolation(*old_lag, lags, pitch_lag_hz);
*old_lag = lags[num_in_frames - 1];
// Convert pitch-lags to Hertz.
for (int n = 0; n < num_out_frames; n++) {
pitch_lag_hz[n] = (sampling_rate_hz) / (pitch_lag_hz[n]);
}
}
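
The interpolation weights above are plain linear interpolation: reading the old estimate as centered at -3.75 ms and the current frame's estimates at 3.75 + 7.5*k ms (this timing interpretation is this example's assumption), the three outputs land at 2.5, 12.5 and 22.5 ms, the midpoints of the 0-5, 10-15 and 20-25 ms sub-intervals. A small self-contained check, reproducing the weights inline since PitchInterpolation() is file-static:

// Feed a signal that is linear in time (value == time in ms) so the
// interpolated outputs reveal where they land in time.
#include <cstdio>

int main() {
  const double old_val = -3.75;                      // Last estimate of the previous frame.
  const double in[4] = {3.75, 11.25, 18.75, 26.25};  // Current estimates at 7.5 ms steps.
  double out[3];
  out[0] = 1. / 6. * old_val + 5. / 6. * in[0];
  out[1] = 5. / 6. * in[1] + 1. / 6. * in[2];
  out[2] = 0.5 * in[2] + 0.5 * in[3];
  // Prints 2.50 12.50 22.50: the midpoints of 0-5, 10-15 and 20-25 ms.
  std::printf("%.2f %.2f %.2f\n", out[0], out[1], out[2]);
  return 0;
}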

@ -0,0 +1,26 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_INTERNAL_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_INTERNAL_H_
// TODO(turajs): Write a description of this function. Also be consistent with
// usage of |sampling_rate_hz| vs |kSamplingFreqHz|.
void GetSubframesPitchParameters(int sampling_rate_hz,
double* gains,
double* lags,
int num_in_frames,
int num_out_frames,
double* log_old_gain,
double* old_lag,
double* log_pitch_gain,
double* pitch_lag_hz);
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_INTERNAL_H_

@ -0,0 +1,106 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
#include <stdlib.h>
#include <string.h>
#include <algorithm>
namespace webrtc {
PoleZeroFilter* PoleZeroFilter::Create(const float* numerator_coefficients,
size_t order_numerator,
const float* denominator_coefficients,
size_t order_denominator) {
if (order_numerator > kMaxFilterOrder ||
order_denominator > kMaxFilterOrder || denominator_coefficients[0] == 0 ||
numerator_coefficients == NULL || denominator_coefficients == NULL)
return NULL;
return new PoleZeroFilter(numerator_coefficients, order_numerator,
denominator_coefficients, order_denominator);
}
PoleZeroFilter::PoleZeroFilter(const float* numerator_coefficients,
size_t order_numerator,
const float* denominator_coefficients,
size_t order_denominator)
: past_input_(),
past_output_(),
numerator_coefficients_(),
denominator_coefficients_(),
order_numerator_(order_numerator),
order_denominator_(order_denominator),
highest_order_(std::max(order_denominator, order_numerator)) {
memcpy(numerator_coefficients_, numerator_coefficients,
sizeof(numerator_coefficients_[0]) * (order_numerator_ + 1));
memcpy(denominator_coefficients_, denominator_coefficients,
sizeof(denominator_coefficients_[0]) * (order_denominator_ + 1));
if (denominator_coefficients_[0] != 1) {
for (size_t n = 0; n <= order_numerator_; n++)
numerator_coefficients_[n] /= denominator_coefficients_[0];
for (size_t n = 0; n <= order_denominator_; n++)
denominator_coefficients_[n] /= denominator_coefficients_[0];
}
}
template <typename T>
static float FilterArPast(const T* past, size_t order,
const float* coefficients) {
float sum = 0.0f;
size_t past_index = order - 1;
for (size_t k = 1; k <= order; k++, past_index--)
sum += coefficients[k] * past[past_index];
return sum;
}
int PoleZeroFilter::Filter(const int16_t* in,
size_t num_input_samples,
float* output) {
if (in == NULL || output == NULL)
return -1;
// This is the typical case, just a memcpy.
const size_t k = std::min(num_input_samples, highest_order_);
size_t n;
for (n = 0; n < k; n++) {
output[n] = in[n] * numerator_coefficients_[0];
output[n] += FilterArPast(&past_input_[n], order_numerator_,
numerator_coefficients_);
output[n] -= FilterArPast(&past_output_[n], order_denominator_,
denominator_coefficients_);
past_input_[n + order_numerator_] = in[n];
past_output_[n + order_denominator_] = output[n];
}
if (highest_order_ < num_input_samples) {
for (size_t m = 0; n < num_input_samples; n++, m++) {
output[n] = in[n] * numerator_coefficients_[0];
output[n] +=
FilterArPast(&in[m], order_numerator_, numerator_coefficients_);
output[n] -= FilterArPast(&output[m], order_denominator_,
denominator_coefficients_);
}
// Record into the past signal.
memcpy(past_input_, &in[num_input_samples - order_numerator_],
sizeof(in[0]) * order_numerator_);
memcpy(past_output_, &output[num_input_samples - order_denominator_],
sizeof(output[0]) * order_denominator_);
} else {
// Odd case where the length of the input is shorter than the filter order.
memmove(past_input_, &past_input_[num_input_samples],
order_numerator_ * sizeof(past_input_[0]));
memmove(past_output_, &past_output_[num_input_samples],
order_denominator_ * sizeof(past_output_[0]));
}
return 0;
}
} // namespace webrtc

@ -0,0 +1,52 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_POLE_ZERO_FILTER_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_POLE_ZERO_FILTER_H_
#include <cstddef>
#include "webrtc/typedefs.h"
namespace webrtc {
class PoleZeroFilter {
public:
~PoleZeroFilter() {}
static PoleZeroFilter* Create(const float* numerator_coefficients,
size_t order_numerator,
const float* denominator_coefficients,
size_t order_denominator);
int Filter(const int16_t* in, size_t num_input_samples, float* output);
private:
PoleZeroFilter(const float* numerator_coefficients,
size_t order_numerator,
const float* denominator_coefficients,
size_t order_denominator);
static const int kMaxFilterOrder = 24;
int16_t past_input_[kMaxFilterOrder * 2];
float past_output_[kMaxFilterOrder * 2];
float numerator_coefficients_[kMaxFilterOrder + 1];
float denominator_coefficients_[kMaxFilterOrder + 1];
size_t order_numerator_;
size_t order_denominator_;
size_t highest_order_;
};
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_POLE_ZERO_FILTER_H_
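
A minimal usage sketch of the Create()/Filter() interface, assuming the webrtc tree is on the include path. The 2-tap averaging coefficients below are chosen purely for illustration; they are not the high-pass coefficients the VAD actually uses (those live in vad_audio_proc_internal.h):

#include <cstdio>
#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"

int main() {
  // B(z) = 0.5 + 0.5 z^-1, A(z) = 1: a trivial 2-tap averaging FIR filter.
  const float b[2] = {0.5f, 0.5f};
  const float a[2] = {1.0f, 0.0f};
  webrtc::PoleZeroFilter* filter = webrtc::PoleZeroFilter::Create(b, 1, a, 1);
  if (filter == nullptr)
    return -1;
  int16_t in[160];  // One 10 ms frame at 16 kHz.
  for (int n = 0; n < 160; ++n)
    in[n] = static_cast<int16_t>((n % 2) ? 1000 : -1000);  // Alternating signal.
  float out[160];
  if (filter->Filter(in, 160, out) != 0)
    return -1;
  // Averaging adjacent samples cancels the alternating component, so after
  // the first sample the output stays at 0.
  std::printf("out[1] = %f, out[159] = %f\n", out[1], out[159]);
  delete filter;
  return 0;
}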

@ -0,0 +1,93 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
#include <assert.h>
#include "webrtc/modules/interface/module_common_types.h"
#include "webrtc/modules/utility/interface/audio_frame_operations.h"
#include "webrtc/typedefs.h"
namespace webrtc {
static const int kDefaultStandaloneVadMode = 3;
StandaloneVad::StandaloneVad(VadInst* vad)
: vad_(vad), buffer_(), index_(0), mode_(kDefaultStandaloneVadMode) {
}
StandaloneVad::~StandaloneVad() {
WebRtcVad_Free(vad_);
}
StandaloneVad* StandaloneVad::Create() {
VadInst* vad = WebRtcVad_Create();
if (!vad)
return nullptr;
int err = WebRtcVad_Init(vad);
err |= WebRtcVad_set_mode(vad, kDefaultStandaloneVadMode);
if (err != 0) {
WebRtcVad_Free(vad);
return nullptr;
}
return new StandaloneVad(vad);
}
int StandaloneVad::AddAudio(const int16_t* data, size_t length) {
if (length != kLength10Ms)
return -1;
if (index_ + length > kLength10Ms * kMaxNum10msFrames)
// Reset the buffer if it's full.
// TODO(ajm): Instead, consider just processing every 10 ms frame. Then we
// can forgo the buffering.
index_ = 0;
memcpy(&buffer_[index_], data, sizeof(int16_t) * length);
index_ += length;
return 0;
}
int StandaloneVad::GetActivity(double* p, size_t length_p) {
if (index_ == 0)
return -1;
const size_t num_frames = index_ / kLength10Ms;
if (num_frames > length_p)
return -1;
assert(WebRtcVad_ValidRateAndFrameLength(kSampleRateHz, index_) == 0);
int activity = WebRtcVad_Process(vad_, kSampleRateHz, buffer_, index_);
if (activity < 0)
return -1;
else if (activity == 0)
p[0] = 0.01; // Arbitrary but small and non-zero.
else
p[0] = 0.5; // 0.5 is a neutral value when combined with other probabilities.
for (size_t n = 1; n < num_frames; n++)
p[n] = p[0];
// Reset the buffer to start from the beginning.
index_ = 0;
return activity;
}
int StandaloneVad::set_mode(int mode) {
if (mode < 0 || mode > 3)
return -1;
if (WebRtcVad_set_mode(vad_, mode) != 0)
return -1;
mode_ = mode;
return 0;
}
} // namespace webrtc

@ -0,0 +1,70 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_STANDALONE_VAD_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_STANDALONE_VAD_H_
#include "webrtc/base/scoped_ptr.h"
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/common_audio/vad/include/webrtc_vad.h"
#include "webrtc/typedefs.h"
namespace webrtc {
class AudioFrame;
class StandaloneVad {
public:
static StandaloneVad* Create();
~StandaloneVad();
// Outputs
// p: a buffer where probabilities are written to.
// length_p: number of elements of |p|.
//
// return value:
// -1: if no audio is stored or the VAD returns an error.
// 0: on success.
// In case of error the content of |p| is unchanged.
//
// Note that due to a high false-positive (VAD decision is active while the
// processed audio is just background noise) rate, stand-alone VAD is used as
// a one-sided indicator. The activity probability is 0.5 if the frame is
// classified as active, and the probability is 0.01 if the audio is
// classified as passive. In this way, when probabilities are combined, the
// effect of the stand-alone VAD is neutral if the input is classified as
// active.
int GetActivity(double* p, size_t length_p);
// Expecting 10 ms of 16 kHz audio to be pushed in.
int AddAudio(const int16_t* data, size_t length);
// Set aggressiveness of VAD, 0 is the least aggressive and 3 is the most
// aggressive mode. Returns -1 if the input is less than 0 or larger than 3,
// otherwise 0 is returned.
int set_mode(int mode);
// Get the aggressiveness of the current VAD.
int mode() const { return mode_; }
private:
explicit StandaloneVad(VadInst* vad);
static const size_t kMaxNum10msFrames = 3;
// TODO(turajs): Is there a way to use scoped-pointer here?
VadInst* vad_;
int16_t buffer_[kMaxNum10msFrames * kLength10Ms];
size_t index_;
int mode_;
};
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_STANDALONE_VAD_H_
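
A minimal usage sketch, assuming the webrtc tree is on the include path: push 10 ms frames of 16 kHz audio (kLength10Ms = 160 samples, from common.h) and read the one-sided probabilities back. Feeding silence and expecting a passive classification is this example's assumption:

#include <cstdio>
#include "webrtc/base/scoped_ptr.h"
#include "webrtc/modules/audio_processing/vad/standalone_vad.h"

int main() {
  rtc::scoped_ptr<webrtc::StandaloneVad> vad(webrtc::StandaloneVad::Create());
  if (vad.get() == nullptr)
    return -1;
  int16_t frame[kLength10Ms] = {0};  // 10 ms of silence at 16 kHz.
  double p[3];                       // Room for kMaxNum10msFrames results.
  for (int i = 0; i < 3; ++i) {
    if (vad->AddAudio(frame, kLength10Ms) != 0)
      return -1;
  }
  if (vad->GetActivity(p, 3) < 0)
    return -1;
  // Silence should be classified as passive, so each probability is expected
  // to be the low one-sided value 0.01.
  std::printf("p = %f %f %f\n", p[0], p[1], p[2]);
  return 0;
}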

@ -0,0 +1,275 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
#include <math.h>
#include <stdio.h>
#include "webrtc/common_audio/fft4g.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
extern "C" {
#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
}
#include "webrtc/modules/interface/module_common_types.h"
namespace webrtc {
// The following structures are declared anonymous in iSAC's structs.h. To
// forward declare them, we use this derived class trick.
struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
static const float kFrequencyResolution =
kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
static const int kSilenceRms = 5;
// TODO(turajs): Make a Create or Init for VadAudioProc.
VadAudioProc::VadAudioProc()
: audio_buffer_(),
num_buffer_samples_(kNumPastSignalSamples),
log_old_gain_(-2),
old_lag_(50), // Arbitrary but valid as pitch-lag (in samples).
pitch_analysis_handle_(new PitchAnalysisStruct),
pre_filter_handle_(new PreFiltBankstr),
high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
kFilterOrder,
kCoeffDenominator,
kFilterOrder)) {
static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
"lpc analysis window incorrect size");
static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
"correlation weight incorrect size");
// TODO(turajs): Are we doing too much in the constructor?
float data[kDftSize];
// Make FFT to initialize.
ip_[0] = 0;
WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
// TODO(turajs): Need to initialize high-pass filter.
// Initialize iSAC components.
WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
}
VadAudioProc::~VadAudioProc() {
}
void VadAudioProc::ResetBuffer() {
memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
num_buffer_samples_ = kNumPastSignalSamples;
}
int VadAudioProc::ExtractFeatures(const int16_t* frame,
size_t length,
AudioFeatures* features) {
features->num_frames = 0;
if (length != kNumSubframeSamples) {
return -1;
}
// High-pass filter to remove the DC component and very low frequency content.
// We have found that this high-pass filtering improves voiced/non-voiced
// classification.
if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
&audio_buffer_[num_buffer_samples_]) != 0) {
return -1;
}
num_buffer_samples_ += kNumSubframeSamples;
if (num_buffer_samples_ < kBufferLength) {
return 0;
}
assert(num_buffer_samples_ == kBufferLength);
features->num_frames = kNum10msSubframes;
features->silence = false;
Rms(features->rms, kMaxNumFrames);
for (size_t i = 0; i < kNum10msSubframes; ++i) {
if (features->rms[i] < kSilenceRms) {
// PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
// Bail out here instead.
features->silence = true;
ResetBuffer();
return 0;
}
}
PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
kMaxNumFrames);
FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
ResetBuffer();
return 0;
}
// Computes |kLpcOrder + 1| correlation coefficients.
void VadAudioProc::SubframeCorrelation(double* corr,
size_t length_corr,
size_t subframe_index) {
assert(length_corr >= kLpcOrder + 1);
double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
size_t buffer_index = subframe_index * kNumSubframeSamples;
for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
WebRtcIsac_AutoCorr(corr, windowed_audio,
kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
}
// Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
// The analysis window is 15 ms long and it is centered on the first half of
// each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
// first half of each 10 ms subframe.
void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));
double corr[kLpcOrder + 1];
double reflec_coeff[kLpcOrder];
for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
i++, offset_lpc += kLpcOrder + 1) {
SubframeCorrelation(corr, kLpcOrder + 1, i);
corr[0] *= 1.0001;
// This makes Lev-Durb a bit more stable.
for (size_t k = 0; k < kLpcOrder + 1; k++) {
corr[k] *= kCorrWeight[k];
}
WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
}
}
// Fit a second order curve to these 3 points and find the location of the
// extremum. The points are inverted before curve fitting.
static float QuadraticInterpolation(float prev_val,
float curr_val,
float next_val) {
// Doing the interpolation in |1 / A(z)|^2.
float fractional_index = 0;
next_val = 1.0f / next_val;
prev_val = 1.0f / prev_val;
curr_val = 1.0f / curr_val;
fractional_index =
-(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
assert(fabs(fractional_index) < 1);
return fractional_index;
}
// 1 / A(z), where A(z) is defined by |lpc|, is a model of the spectral envelope
// of the input signal. The local maximum of the spectral envelope corresponds
// to the local minimum of A(z). This saves complexity, as it avoids an
// inversion. Furthermore, we search the magnitude squared rather than the
// magnitude, which saves a square root.
void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
size_t length_f_peak) {
assert(length_f_peak >= kNum10msSubframes);
double lpc[kNum10msSubframes * (kLpcOrder + 1)];
// For all sub-frames.
GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
const size_t kNumDftCoefficients = kDftSize / 2 + 1;
float data[kDftSize];
for (size_t i = 0; i < kNum10msSubframes; i++) {
// Convert to float with zero pad.
memset(data, 0, sizeof(data));
for (size_t n = 0; n < kLpcOrder + 1; n++) {
data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
}
// Transform to frequency domain.
WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
size_t index_peak = 0;
float prev_magn_sqr = data[0] * data[0];
float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
float next_magn_sqr;
bool found_peak = false;
for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
next_magn_sqr =
data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
found_peak = true;
index_peak = n - 1;
break;
}
prev_magn_sqr = curr_magn_sqr;
curr_magn_sqr = next_magn_sqr;
}
float fractional_index = 0;
if (!found_peak) {
// Checking if |kNumDftCoefficients - 1| is the local minimum.
next_magn_sqr = data[1] * data[1];
if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
index_peak = kNumDftCoefficients - 1;
}
} else {
// A peak is found, do a simple quadratic interpolation to get a more
// accurate estimate of the peak location.
fractional_index =
QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
}
f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
}
}
// Using iSAC functions to estimate pitch gains & lags.
void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
double* pitch_lags_hz,
size_t length) {
// TODO(turajs): This can be "imported" from iSAC, as can the next two
// constants.
assert(length >= kNum10msSubframes);
const int kNumPitchSubframes = 4;
double gains[kNumPitchSubframes];
double lags[kNumPitchSubframes];
const int kNumSubbandFrameSamples = 240;
const int kNumLookaheadSamples = 24;
float lower[kNumSubbandFrameSamples];
float upper[kNumSubbandFrameSamples];
double lower_lookahead[kNumSubbandFrameSamples];
double upper_lookahead[kNumSubbandFrameSamples];
double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
kNumLookaheadSamples];
// Split signal to lower and upper bands
WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
upper, lower_lookahead, upper_lookahead,
pre_filter_handle_.get());
WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
pitch_analysis_handle_.get(), lags, gains);
// Lags are computed on lower-band signal with sampling rate half of the
// input signal.
GetSubframesPitchParameters(
kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
&log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
}
void VadAudioProc::Rms(double* rms, size_t length_rms) {
assert(length_rms >= kNum10msSubframes);
size_t offset = kNumPastSignalSamples;
for (size_t i = 0; i < kNum10msSubframes; i++) {
rms[i] = 0;
for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
rms[i] = sqrt(rms[i] / kNumSubframeSamples);
}
}
} // namespace webrtc
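
The peak picking above works on |A(z)|^2 and then converts the winning bin index to Hz with kFrequencyResolution = 16000 / 512 = 31.25 Hz per bin. A standalone arithmetic sketch of that conversion, reproducing the quadratic-interpolation formula inline (QuadraticInterpolation() is file-static; the PeakOffset name and the sample values are this example's assumptions):

#include <cstdio>

// Same formula as QuadraticInterpolation() above: fit a parabola to 1/prev,
// 1/curr, 1/next and return the offset of its extremum, in bins.
static float PeakOffset(float prev, float curr, float next) {
  prev = 1.0f / prev;
  curr = 1.0f / curr;
  next = 1.0f / next;
  return -(next - prev) * 0.5f / (next + prev - 2.f * curr);
}

int main() {
  const float kFrequencyResolutionHz = 16000.0f / 512.0f;  // 31.25 Hz per bin.
  // |A(z)|^2 values around a local minimum at bin 16; the symmetric case puts
  // the interpolated peak exactly on the bin, i.e. at 16 * 31.25 = 500 Hz.
  const float offset = PeakOffset(2.0f, 1.0f, 2.0f);
  std::printf("peak at %.2f Hz\n", (16 + offset) * kFrequencyResolutionHz);
  return 0;
}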

@ -0,0 +1,89 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_H_
#include "webrtc/base/scoped_ptr.h"
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/typedefs.h"
namespace webrtc {
class AudioFrame;
class PoleZeroFilter;
class VadAudioProc {
public:
// Forward declare iSAC structs.
struct PitchAnalysisStruct;
struct PreFiltBankstr;
VadAudioProc();
~VadAudioProc();
int ExtractFeatures(const int16_t* audio_frame,
size_t length,
AudioFeatures* audio_features);
static const size_t kDftSize = 512;
private:
void PitchAnalysis(double* pitch_gains, double* pitch_lags_hz, size_t length);
void SubframeCorrelation(double* corr,
size_t length_corr,
size_t subframe_index);
void GetLpcPolynomials(double* lpc, size_t length_lpc);
void FindFirstSpectralPeaks(double* f_peak, size_t length_f_peak);
void Rms(double* rms, size_t length_rms);
void ResetBuffer();
// To compute the spectral peak we perform LPC analysis to get the spectral
// envelope. For every 30 ms we compute 3 spectral peaks, therefore 3 LPC
// analyses. LPC is computed over 15 ms of windowed audio. For every 10 ms
// sub-frame we need 5 ms of past signal to create the input of the LPC
// analysis.
static const size_t kNumPastSignalSamples =
static_cast<size_t>(kSampleRateHz / 200);
// TODO(turajs): maybe defining this at a higher level (maybe enum) so that
// all the code recognize it as "no-error."
static const int kNoError = 0;
static const size_t kNum10msSubframes = 3;
static const size_t kNumSubframeSamples =
static_cast<size_t>(kSampleRateHz / 100);
static const size_t kNumSamplesToProcess =
kNum10msSubframes *
kNumSubframeSamples; // Samples in 30 ms @ given sampling rate.
static const size_t kBufferLength =
kNumPastSignalSamples + kNumSamplesToProcess;
static const size_t kIpLength = kDftSize >> 1;
static const size_t kWLength = kDftSize >> 1;
static const size_t kLpcOrder = 16;
size_t ip_[kIpLength];
float w_fft_[kWLength];
// A buffer of 5 ms (past audio) + 30 ms (one iSAC frame ).
float audio_buffer_[kBufferLength];
size_t num_buffer_samples_;
double log_old_gain_;
double old_lag_;
rtc::scoped_ptr<PitchAnalysisStruct> pitch_analysis_handle_;
rtc::scoped_ptr<PreFiltBankstr> pre_filter_handle_;
rtc::scoped_ptr<PoleZeroFilter> high_pass_filter_;
};
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_H_
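
A minimal usage sketch, assuming the webrtc tree is on the include path: ExtractFeatures() accepts one 10 ms sub-frame (160 samples at 16 kHz) per call and only reports features once three sub-frames, i.e. a full 30 ms analysis frame, have been buffered. The all-zero input is just an illustration and is reported as silence:

#include <cstdio>
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"

int main() {
  webrtc::VadAudioProc audio_proc;
  AudioFeatures features;            // Declared in common.h (global scope).
  int16_t frame[kLength10Ms] = {0};  // 10 ms of silence at 16 kHz.
  for (int i = 0; i < 3; ++i) {
    if (audio_proc.ExtractFeatures(frame, kLength10Ms, &features) != 0)
      return -1;
    // num_frames stays 0 for the first two calls; on the third call it
    // becomes 3 and, for this all-zero input, features.silence is set.
    std::printf("call %d: num_frames = %d\n", i + 1,
                static_cast<int>(features.num_frames));
  }
  return 0;
}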

@ -0,0 +1,94 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_INTERNAL_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_INTERNAL_H_
namespace webrtc {
// These values should match MATLAB counterparts for unit-tests to pass.
static const double kCorrWeight[] = {1.000000,
0.985000,
0.970225,
0.955672,
0.941337,
0.927217,
0.913308,
0.899609,
0.886115,
0.872823,
0.859730,
0.846834,
0.834132,
0.821620,
0.809296,
0.797156,
0.785199};
static const double kLpcAnalWin[] = {
0.00000000, 0.01314436, 0.02628645, 0.03942400, 0.05255473, 0.06567639,
0.07878670, 0.09188339, 0.10496421, 0.11802689, 0.13106918, 0.14408883,
0.15708358, 0.17005118, 0.18298941, 0.19589602, 0.20876878, 0.22160547,
0.23440387, 0.24716177, 0.25987696, 0.27254725, 0.28517045, 0.29774438,
0.31026687, 0.32273574, 0.33514885, 0.34750406, 0.35979922, 0.37203222,
0.38420093, 0.39630327, 0.40833713, 0.42030043, 0.43219112, 0.44400713,
0.45574642, 0.46740697, 0.47898676, 0.49048379, 0.50189608, 0.51322164,
0.52445853, 0.53560481, 0.54665854, 0.55761782, 0.56848075, 0.57924546,
0.58991008, 0.60047278, 0.61093173, 0.62128512, 0.63153117, 0.64166810,
0.65169416, 0.66160761, 0.67140676, 0.68108990, 0.69065536, 0.70010148,
0.70942664, 0.71862923, 0.72770765, 0.73666033, 0.74548573, 0.75418233,
0.76274862, 0.77118312, 0.77948437, 0.78765094, 0.79568142, 0.80357442,
0.81132858, 0.81894256, 0.82641504, 0.83374472, 0.84093036, 0.84797069,
0.85486451, 0.86161063, 0.86820787, 0.87465511, 0.88095122, 0.88709512,
0.89308574, 0.89892206, 0.90460306, 0.91012776, 0.91549520, 0.92070447,
0.92575465, 0.93064488, 0.93537432, 0.93994213, 0.94434755, 0.94858979,
0.95266814, 0.95658189, 0.96033035, 0.96391289, 0.96732888, 0.97057773,
0.97365889, 0.97657181, 0.97931600, 0.98189099, 0.98429632, 0.98653158,
0.98859639, 0.99049038, 0.99221324, 0.99376466, 0.99514438, 0.99635215,
0.99738778, 0.99825107, 0.99894188, 0.99946010, 0.99980562, 0.99997840,
0.99997840, 0.99980562, 0.99946010, 0.99894188, 0.99825107, 0.99738778,
0.99635215, 0.99514438, 0.99376466, 0.99221324, 0.99049038, 0.98859639,
0.98653158, 0.98429632, 0.98189099, 0.97931600, 0.97657181, 0.97365889,
0.97057773, 0.96732888, 0.96391289, 0.96033035, 0.95658189, 0.95266814,
0.94858979, 0.94434755, 0.93994213, 0.93537432, 0.93064488, 0.92575465,
0.92070447, 0.91549520, 0.91012776, 0.90460306, 0.89892206, 0.89308574,
0.88709512, 0.88095122, 0.87465511, 0.86820787, 0.86161063, 0.85486451,
0.84797069, 0.84093036, 0.83374472, 0.82641504, 0.81894256, 0.81132858,
0.80357442, 0.79568142, 0.78765094, 0.77948437, 0.77118312, 0.76274862,
0.75418233, 0.74548573, 0.73666033, 0.72770765, 0.71862923, 0.70942664,
0.70010148, 0.69065536, 0.68108990, 0.67140676, 0.66160761, 0.65169416,
0.64166810, 0.63153117, 0.62128512, 0.61093173, 0.60047278, 0.58991008,
0.57924546, 0.56848075, 0.55761782, 0.54665854, 0.53560481, 0.52445853,
0.51322164, 0.50189608, 0.49048379, 0.47898676, 0.46740697, 0.45574642,
0.44400713, 0.43219112, 0.42030043, 0.40833713, 0.39630327, 0.38420093,
0.37203222, 0.35979922, 0.34750406, 0.33514885, 0.32273574, 0.31026687,
0.29774438, 0.28517045, 0.27254725, 0.25987696, 0.24716177, 0.23440387,
0.22160547, 0.20876878, 0.19589602, 0.18298941, 0.17005118, 0.15708358,
0.14408883, 0.13106918, 0.11802689, 0.10496421, 0.09188339, 0.07878670,
0.06567639, 0.05255473, 0.03942400, 0.02628645, 0.01314436, 0.00000000};
static const size_t kFilterOrder = 2;
static const float kCoeffNumerator[kFilterOrder + 1] = {0.974827f,
-1.949650f,
0.974827f};
static const float kCoeffDenominator[kFilterOrder + 1] = {1.0f,
-1.971999f,
0.972457f};
static_assert(kFilterOrder + 1 ==
sizeof(kCoeffNumerator) / sizeof(kCoeffNumerator[0]),
"numerator coefficients incorrect size");
static_assert(kFilterOrder + 1 ==
sizeof(kCoeffDenominator) / sizeof(kCoeffDenominator[0]),
"denominator coefficients incorrect size");
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_INTERNAL_H_
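
A quick standalone arithmetic check (no webrtc dependency; purely illustrative) that the coefficient pair above really is a high-pass filter: evaluating H(z) = B(z)/A(z) at z = 1 (DC) and at z = -1 (the Nyquist frequency, 8 kHz at the VAD's 16 kHz rate) gives roughly 0.009 and 0.99 respectively, i.e. about 40 dB of DC attenuation:

#include <cstdio>

int main() {
  const double b[3] = {0.974827, -1.949650, 0.974827};  // kCoeffNumerator.
  const double a[3] = {1.0, -1.971999, 0.972457};       // kCoeffDenominator.
  const double h_dc = (b[0] + b[1] + b[2]) / (a[0] + a[1] + a[2]);
  const double h_nyquist = (b[0] - b[1] + b[2]) / (a[0] - a[1] + a[2]);
  // Prints roughly 0.0087 (about -41 dB) at DC and 0.99 at Nyquist.
  std::printf("|H(DC)| = %.4f  |H(Nyquist)| = %.4f\n", h_dc, h_nyquist);
  return 0;
}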

@ -0,0 +1,138 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h"
#include <assert.h>
#include <stdlib.h>
namespace webrtc {
VadCircularBuffer::VadCircularBuffer(int buffer_size)
: buffer_(new double[buffer_size]),
is_full_(false),
index_(0),
buffer_size_(buffer_size),
sum_(0) {
}
VadCircularBuffer::~VadCircularBuffer() {
}
void VadCircularBuffer::Reset() {
is_full_ = false;
index_ = 0;
sum_ = 0;
}
VadCircularBuffer* VadCircularBuffer::Create(int buffer_size) {
if (buffer_size <= 0)
return NULL;
return new VadCircularBuffer(buffer_size);
}
double VadCircularBuffer::Oldest() const {
if (!is_full_)
return buffer_[0];
else
return buffer_[index_];
}
double VadCircularBuffer::Mean() {
double m;
if (is_full_) {
m = sum_ / buffer_size_;
} else {
if (index_ > 0)
m = sum_ / index_;
else
m = 0;
}
return m;
}
void VadCircularBuffer::Insert(double value) {
if (is_full_) {
sum_ -= buffer_[index_];
}
sum_ += value;
buffer_[index_] = value;
index_++;
if (index_ >= buffer_size_) {
is_full_ = true;
index_ = 0;
}
}
int VadCircularBuffer::BufferLevel() {
if (is_full_)
return buffer_size_;
return index_;
}
int VadCircularBuffer::Get(int index, double* value) const {
int err = ConvertToLinearIndex(&index);
if (err < 0)
return -1;
*value = buffer_[index];
return 0;
}
int VadCircularBuffer::Set(int index, double value) {
int err = ConvertToLinearIndex(&index);
if (err < 0)
return -1;
sum_ -= buffer_[index];
buffer_[index] = value;
sum_ += value;
return 0;
}
int VadCircularBuffer::ConvertToLinearIndex(int* index) const {
if (*index < 0 || *index >= buffer_size_)
return -1;
if (!is_full_ && *index >= index_)
return -1;
*index = index_ - 1 - *index;
if (*index < 0)
*index += buffer_size_;
return 0;
}
int VadCircularBuffer::RemoveTransient(int width_threshold,
double val_threshold) {
if (!is_full_ && index_ < width_threshold + 2)
return 0;
int index_1 = 0;
int index_2 = width_threshold + 1;
double v = 0;
if (Get(index_1, &v) < 0)
return -1;
if (v < val_threshold) {
Set(index_1, 0);
int index;
for (index = index_2; index > index_1; index--) {
if (Get(index, &v) < 0)
return -1;
if (v < val_threshold)
break;
}
for (; index > index_1; index--) {
if (Set(index, 0.0) < 0)
return -1;
}
}
return 0;
}
} // namespace webrtc

@ -0,0 +1,69 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_CIRCULAR_BUFFER_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_CIRCULAR_BUFFER_H_
#include "webrtc/base/scoped_ptr.h"
namespace webrtc {
// A circular buffer tailored to the needs of this project. It stores the last
// K samples of the input and keeps track of their mean.
//
// It is used in the class "PitchBasedVad" to keep track of posterior
// probabilities over the past few seconds. The posterior probabilities are
// used to recursively update the prior probabilities.
class VadCircularBuffer {
public:
static VadCircularBuffer* Create(int buffer_size);
~VadCircularBuffer();
// If buffer is wrapped around.
bool is_full() const { return is_full_; }
// Get the oldest entry in the buffer.
double Oldest() const;
// Insert new value into the buffer.
void Insert(double value);
// Reset buffer, forget the past, start fresh.
void Reset();
// The mean value of the elements in the buffer. The return value is zero if
// buffer is empty, i.e. no value is inserted.
double Mean();
// Remove transients. If the values exceed |val_threshold| for a period
// shorter than or equal to |width_threshold|, then that period is considered
// a transient and is set to zero.
int RemoveTransient(int width_threshold, double val_threshold);
private:
explicit VadCircularBuffer(int buffer_size);
// Get previous values. |index = 0| corresponds to the most recent
// insertion. |index = 1| is the one before the most recent insertion, and
// so on.
int Get(int index, double* value) const;
// Set a given position to |value|. |index| is interpreted as above.
int Set(int index, double value);
// Return the number of valid elements in the buffer.
int BufferLevel();
// Convert an index with the interpretation as get() method to the
// corresponding linear index.
int ConvertToLinearIndex(int* index) const;
rtc::scoped_ptr<double[]> buffer_;
bool is_full_;
int index_;
int buffer_size_;
double sum_;
};
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_CIRCULAR_BUFFER_H_
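
A minimal usage sketch, assuming the webrtc tree is on the include path. A short burst of high posterior probabilities bounded by low values on both sides, no longer than |width_threshold|, is treated as a transient and zeroed, which drags the mean back down; the probability values below are made up for illustration:

#include <cstdio>
#include "webrtc/base/scoped_ptr.h"
#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h"

int main() {
  rtc::scoped_ptr<webrtc::VadCircularBuffer> buffer(
      webrtc::VadCircularBuffer::Create(10));
  if (buffer.get() == nullptr)
    return -1;
  // Two high values bounded by low ones: a 2-sample transient.
  const double values[] = {0.1, 0.9, 0.9, 0.1};  // Last entry is most recent.
  for (size_t i = 0; i < sizeof(values) / sizeof(values[0]); ++i)
    buffer->Insert(values[i]);
  std::printf("mean before: %f\n", buffer->Mean());  // (0.1+0.9+0.9+0.1)/4 = 0.5
  buffer->RemoveTransient(2 /* width_threshold */, 0.5 /* val_threshold */);
  std::printf("mean after:  %f\n", buffer->Mean());  // 0.0
  return 0;
}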

@ -0,0 +1,85 @@
/*
* Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
#include <algorithm>
#include "webrtc/base/checks.h"
namespace webrtc {
namespace {
const size_t kMaxLength = 320;
const int kNumChannels = 1;
const double kDefaultVoiceValue = 1.0;
const double kNeutralProbability = 0.5;
const double kLowProbability = 0.01;
} // namespace
VoiceActivityDetector::VoiceActivityDetector()
: last_voice_probability_(kDefaultVoiceValue),
standalone_vad_(StandaloneVad::Create()) {
}
// Because ISAC has a different chunk length, it updates
// |chunkwise_voice_probabilities_| and |chunkwise_rms_| when there is new data.
// Otherwise it clears them.
void VoiceActivityDetector::ProcessChunk(const int16_t* audio,
size_t length,
int sample_rate_hz) {
RTC_DCHECK_EQ(static_cast<int>(length), sample_rate_hz / 100);
RTC_DCHECK_LE(length, kMaxLength);
// Resample to the required rate.
const int16_t* resampled_ptr = audio;
if (sample_rate_hz != kSampleRateHz) {
RTC_CHECK_EQ(
resampler_.ResetIfNeeded(sample_rate_hz, kSampleRateHz, kNumChannels),
0);
resampler_.Push(audio, length, resampled_, kLength10Ms, length);
resampled_ptr = resampled_;
}
RTC_DCHECK_EQ(length, kLength10Ms);
// Each chunk needs to be passed into |standalone_vad_|, because internally it
// buffers the audio and processes it all at once when GetActivity() is
// called.
RTC_CHECK_EQ(standalone_vad_->AddAudio(resampled_ptr, length), 0);
audio_processing_.ExtractFeatures(resampled_ptr, length, &features_);
chunkwise_voice_probabilities_.resize(features_.num_frames);
chunkwise_rms_.resize(features_.num_frames);
std::copy(features_.rms, features_.rms + chunkwise_rms_.size(),
chunkwise_rms_.begin());
if (features_.num_frames > 0) {
if (features_.silence) {
// The other features are invalid, so set the voice probabilities to an
// arbitrary low value.
std::fill(chunkwise_voice_probabilities_.begin(),
chunkwise_voice_probabilities_.end(), kLowProbability);
} else {
std::fill(chunkwise_voice_probabilities_.begin(),
chunkwise_voice_probabilities_.end(), kNeutralProbability);
RTC_CHECK_GE(
standalone_vad_->GetActivity(&chunkwise_voice_probabilities_[0],
chunkwise_voice_probabilities_.size()),
0);
RTC_CHECK_GE(pitch_based_vad_.VoicingProbability(
features_, &chunkwise_voice_probabilities_[0]),
0);
}
last_voice_probability_ = chunkwise_voice_probabilities_.back();
}
}
} // namespace webrtc

@ -0,0 +1,70 @@
/*
* Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_ACTIVITY_DETECTOR_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_ACTIVITY_DETECTOR_H_
#include <vector>
#include "webrtc/base/scoped_ptr.h"
#include "webrtc/common_audio/resampler/include/resampler.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
namespace webrtc {
// A Voice Activity Detector (VAD) that combines the voice probability from the
// StandaloneVad and PitchBasedVad to get a more robust estimation.
class VoiceActivityDetector {
public:
VoiceActivityDetector();
// Processes each audio chunk and estimates the voice probability. The maximum
// supported sample rate is 32kHz.
// TODO(aluebs): Change |length| to size_t.
void ProcessChunk(const int16_t* audio, size_t length, int sample_rate_hz);
// Returns a vector of voice probabilities for each chunk. It can be empty for
// some chunks, but it catches up afterwards returning multiple values at
// once.
const std::vector<double>& chunkwise_voice_probabilities() const {
return chunkwise_voice_probabilities_;
}
// Returns a vector of RMS values for each chunk. It has the same length as
// chunkwise_voice_probabilities().
const std::vector<double>& chunkwise_rms() const { return chunkwise_rms_; }
// Returns the last voice probability, regardless of the internal
// implementation, although it has a few chunks of delay.
float last_voice_probability() const { return last_voice_probability_; }
private:
// TODO(aluebs): Change these to float.
std::vector<double> chunkwise_voice_probabilities_;
std::vector<double> chunkwise_rms_;
float last_voice_probability_;
Resampler resampler_;
VadAudioProc audio_processing_;
rtc::scoped_ptr<StandaloneVad> standalone_vad_;
PitchBasedVad pitch_based_vad_;
int16_t resampled_[kLength10Ms];
AudioFeatures features_;
};
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_ACTIVITY_DETECTOR_H_
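
A minimal usage sketch, assuming the webrtc tree is on the include path: 10 ms chunks go in, and combined probabilities come out in batches of three once a full 30 ms analysis frame has been processed (the vector is empty in between). The all-zero input is only an illustration; any 16 kHz, 160-sample chunk is handled the same way:

#include <cstdio>
#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"

int main() {
  webrtc::VoiceActivityDetector vad;
  int16_t chunk[kLength10Ms] = {0};  // One 10 ms chunk of 16 kHz audio.
  for (int i = 0; i < 6; ++i) {
    vad.ProcessChunk(chunk, kLength10Ms, kSampleRateHz);
    // The probability vector is non-empty only after every third chunk, when
    // a full 30 ms frame has been analyzed and three values are produced.
    std::printf("chunk %d: %d probabilities, last p = %f\n", i + 1,
                static_cast<int>(vad.chunkwise_voice_probabilities().size()),
                static_cast<double>(vad.last_voice_probability()));
  }
  return 0;
}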

@ -0,0 +1,85 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// GMM tables for active segments. Generated by MakeGmmTables.m.
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_GMM_TABLES_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_GMM_TABLES_H_
static const int kVoiceGmmNumMixtures = 12;
static const int kVoiceGmmDim = 3;
static const double
kVoiceGmmCovarInverse[kVoiceGmmNumMixtures][kVoiceGmmDim][kVoiceGmmDim] = {
{{1.83673825579513e+00, -8.09791637570095e-04, 4.60106414365986e-03},
{-8.09791637570095e-04, 8.89351738394608e-04, -9.80188953277734e-04},
{4.60106414365986e-03, -9.80188953277734e-04, 1.38706060206582e-03}},
{{6.76228912850703e+01, -1.98893120119660e-02, -3.53548357253551e-03},
{-1.98893120119660e-02, 3.96216858500530e-05, -4.08492938394097e-05},
{-3.53548357253551e-03, -4.08492938394097e-05, 9.31864352856416e-04}},
{{9.98612435944558e+00, -5.27880954316893e-03, -6.30342541619017e-03},
{-5.27880954316893e-03, 4.54359480225226e-05, 6.30804591626044e-05},
{-6.30342541619017e-03, 6.30804591626044e-05, 5.36466441382942e-04}},
{{3.39917474216349e+01, -1.56213579433191e-03, -4.01459014990225e-02},
{-1.56213579433191e-03, 6.40415424897724e-05, 6.20076342427833e-05},
{-4.01459014990225e-02, 6.20076342427833e-05, 3.51199070103063e-03}},
{{1.34545062271428e+01, -7.94513610147144e-03, -5.34401019341728e-02},
{-7.94513610147144e-03, 1.16511820098649e-04, 4.66063702069293e-05},
{-5.34401019341728e-02, 4.66063702069293e-05, 2.72354323774163e-03}},
{{1.08557844314806e+02, -1.54885805673668e-02, -1.88029692674851e-02},
{-1.54885805673668e-02, 1.16404042786406e-04, 6.45579292702802e-06},
{-1.88029692674851e-02, 6.45579292702802e-06, 4.32330478391416e-04}},
{{8.22940066541450e+01, -1.15903110231303e-02, -4.92166764865343e-02},
{-1.15903110231303e-02, 7.42510742165261e-05, 3.73007314191290e-06},
{-4.92166764865343e-02, 3.73007314191290e-06, 3.64005221593244e-03}},
{{2.31133605685660e+00, -7.83261568950254e-04, 7.45744012346313e-04},
{-7.83261568950254e-04, 1.29460648214142e-05, -2.22774455093730e-06},
{7.45744012346313e-04, -2.22774455093730e-06, 1.05117294093010e-04}},
{{3.78767849189611e+02, 1.57759761011568e-03, -2.08551217988774e-02},
{1.57759761011568e-03, 4.76066236886865e-05, -2.33977412299324e-05},
{-2.08551217988774e-02, -2.33977412299324e-05, 5.24261005371196e-04}},
{{6.98580096506135e-01, -5.13850255217378e-04, -4.01124551717056e-04},
{-5.13850255217378e-04, 1.40501021984840e-06, -2.09496928716569e-06},
{-4.01124551717056e-04, -2.09496928716569e-06, 2.82879357740037e-04}},
{{2.62770945162399e+00, -2.31825753241430e-03, -5.30447217466318e-03},
{-2.31825753241430e-03, 4.59108572227649e-05, 7.67631886355405e-05},
{-5.30447217466318e-03, 7.67631886355405e-05, 2.28521601674098e-03}},
{{1.89940391362152e+02, -4.23280856852379e-03, -2.70608873541399e-02},
{-4.23280856852379e-03, 6.77547582742563e-05, 2.69154203800467e-05},
{-2.70608873541399e-02, 2.69154203800467e-05, 3.88574543373470e-03}}};
static const double kVoiceGmmMean[kVoiceGmmNumMixtures][kVoiceGmmDim] = {
{-2.15020241646536e+00, 4.97079062999877e+02, 4.77078119504505e+02},
{-8.92097680029190e-01, 5.92064964199921e+02, 1.81045145941059e+02},
{-1.29435784144398e+00, 4.98450293410611e+02, 1.71991263804064e+02},
{-1.03925228397884e+00, 4.99511274321571e+02, 1.05838336539105e+02},
{-1.29229047206129e+00, 4.15026762566707e+02, 1.12861119017125e+02},
{-7.88748114599810e-01, 4.48739336688113e+02, 1.89784216956337e+02},
{-8.77777402332642e-01, 4.86620285054533e+02, 1.13477708016491e+02},
{-2.06465957063057e+00, 6.33385049870607e+02, 2.32758546796149e+02},
{-6.98893789231685e-01, 5.93622051503385e+02, 1.92536982473203e+02},
{-2.55901217508894e+00, 1.55914919756205e+03, 1.39769980835570e+02},
{-1.92070024165837e+00, 4.87983940444185e+02, 1.02745468128289e+02},
{-7.29187507662854e-01, 5.22717685022855e+02, 1.16377942283991e+02}};
static const double kVoiceGmmWeights[kVoiceGmmNumMixtures] = {
-1.39789694361035e+01,
-1.19527720202104e+01,
-1.32396317929055e+01,
-1.09436815209238e+01,
-1.13440027478149e+01,
-1.12200721834504e+01,
-1.02537324043693e+01,
-1.60789861938302e+01,
-1.03394494048344e+01,
-1.83207938586818e+01,
-1.31186044948288e+01,
-9.52479998673554e+00};
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_GMM_TABLES_H_