Update audio_processing module
Corresponds to upstream commit 524e9b043e7e86fd72353b987c9d5f6a1ebf83e1

Update notes:
* Pull in third-party license file
* Replace .gypi files with BUILD.gn to keep track of what changes upstream
* Bunch of new files pulled in as dependencies
* Won't build yet due to changes needed on top of these
webrtc/modules/audio_processing/vad/vad_audio_proc.cc (new file, 275 lines)
@@ -0,0 +1,275 @@
/*
 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"

#include <math.h>
#include <stdio.h>

#include "webrtc/common_audio/fft4g.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
extern "C" {
#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
}
#include "webrtc/modules/interface/module_common_types.h"

namespace webrtc {

// The following structures are declared anonymous in iSAC's structs.h. To
// forward declare them, we use this derived class trick.
struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};

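// kFrequencyResolution is the spacing of the DFT bins: the sample rate
// divided by the DFT size (e.g. a 16 kHz rate with a 512-point DFT gives
// 31.25 Hz per bin; the actual constants are defined in the VAD headers).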
static const float kFrequencyResolution =
    kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
static const int kSilenceRms = 5;

// TODO(turajs): Make a Create or Init for VadAudioProc.
VadAudioProc::VadAudioProc()
    : audio_buffer_(),
      num_buffer_samples_(kNumPastSignalSamples),
      log_old_gain_(-2),
      old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
      pitch_analysis_handle_(new PitchAnalysisStruct),
      pre_filter_handle_(new PreFiltBankstr),
      high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
                                               kFilterOrder,
                                               kCoeffDenominator,
                                               kFilterOrder)) {
  static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
                    sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
                "lpc analysis window incorrect size");
  static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
                "correlation weight incorrect size");

  // TODO(turajs): Are we doing too much in the constructor?
  float data[kDftSize];
  // Run the FFT once with ip_[0] == 0 so its work tables (ip_, w_fft_) get
  // initialized.
  ip_[0] = 0;
  WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
  // TODO(turajs): Need to initialize high-pass filter.

  // Initialize iSAC components.
  WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
  WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
}

VadAudioProc::~VadAudioProc() {
}

void VadAudioProc::ResetBuffer() {
  memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
         sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
  num_buffer_samples_ = kNumPastSignalSamples;
}

int VadAudioProc::ExtractFeatures(const int16_t* frame,
                                  size_t length,
                                  AudioFeatures* features) {
  features->num_frames = 0;
  if (length != kNumSubframeSamples) {
    return -1;
  }

  // High-pass filter to remove the DC component and very low-frequency
  // content. We have found that this high-pass filtering improves
  // voiced/non-voiced classification.
  if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
                                &audio_buffer_[num_buffer_samples_]) != 0) {
    return -1;
  }

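  // Features are produced only once enough 10 ms sub-frames have been
  // buffered (kBufferLength samples in total, including kNumPastSignalSamples
  // of history); until then the call returns 0 with features->num_frames
  // left at 0.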
  num_buffer_samples_ += kNumSubframeSamples;
  if (num_buffer_samples_ < kBufferLength) {
    return 0;
  }
  assert(num_buffer_samples_ == kBufferLength);
  features->num_frames = kNum10msSubframes;
  features->silence = false;

  Rms(features->rms, kMaxNumFrames);
  for (size_t i = 0; i < kNum10msSubframes; ++i) {
    if (features->rms[i] < kSilenceRms) {
      // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
      // Bail out here instead.
      features->silence = true;
      ResetBuffer();
      return 0;
    }
  }

  PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
                kMaxNumFrames);
  FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
  ResetBuffer();
  return 0;
}

// Computes |kLpcOrder + 1| correlation coefficients.
void VadAudioProc::SubframeCorrelation(double* corr,
                                       size_t length_corr,
                                       size_t subframe_index) {
  assert(length_corr >= kLpcOrder + 1);
  double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
  size_t buffer_index = subframe_index * kNumSubframeSamples;

  for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
    windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];

  WebRtcIsac_AutoCorr(corr, windowed_audio,
                      kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
}

// Computes |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
// The analysis window is 15 ms long and is centered on the first half of
// each 10 ms sub-frame. This is equivalent to computing LPC coefficients for
// the first half of each 10 ms sub-frame.
void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
  assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));
  double corr[kLpcOrder + 1];
  double reflec_coeff[kLpcOrder];
  for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
       i++, offset_lpc += kLpcOrder + 1) {
    SubframeCorrelation(corr, kLpcOrder + 1, i);
    corr[0] *= 1.0001;  // This makes Lev-Durb a bit more stable.
    for (size_t k = 0; k < kLpcOrder + 1; k++) {
      corr[k] *= kCorrWeight[k];
    }
    WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
  }
}

// Fits a second-order curve to these three points and finds the location of
// the extremum. The points are inverted before curve fitting.
static float QuadraticInterpolation(float prev_val,
                                    float curr_val,
                                    float next_val) {
  // Doing the interpolation in |1 / A(z)|^2.
  float fractional_index = 0;
  next_val = 1.0f / next_val;
  prev_val = 1.0f / prev_val;
  curr_val = 1.0f / curr_val;

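  // For three equally spaced samples y(-1), y(0) and y(+1), the parabola
  // through them has its extremum at
  //   x = (y(-1) - y(+1)) / (2 * (y(-1) - 2 * y(0) + y(+1))),
  // which is what the expression below evaluates with the already inverted
  // prev_val, curr_val and next_val.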
  fractional_index =
      -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
  assert(fabs(fractional_index) < 1);
  return fractional_index;
}

// 1 / A(z), where A(z) is defined by |lpc|, is a model of the spectral
// envelope of the input signal. The local maxima of the spectral envelope
// correspond to the local minima of A(z), so searching A(z) directly saves
// one inversion. Furthermore, we search the magnitude squared, which saves
// a square root.
void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
                                          size_t length_f_peak) {
  assert(length_f_peak >= kNum10msSubframes);
  double lpc[kNum10msSubframes * (kLpcOrder + 1)];
  // For all sub-frames.
  GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));

  const size_t kNumDftCoefficients = kDftSize / 2 + 1;
  float data[kDftSize];

  for (size_t i = 0; i < kNum10msSubframes; i++) {
    // Convert to float with zero padding.
    memset(data, 0, sizeof(data));
    for (size_t n = 0; n < kLpcOrder + 1; n++) {
      data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
    }
    // Transform to the frequency domain.
    WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);

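    // data[] now holds the real DFT of the LPC polynomial A(z) sampled on the
    // unit circle. The first peak of the envelope 1 / |A(e^{jw})|^2 coincides
    // with the first local minimum of |A(e^{jw})|^2, which the loop below
    // searches for. The rdft output is assumed to pack the DC and Nyquist
    // terms in data[0] and data[1], and the real/imaginary parts of bin n in
    // data[2 * n] and data[2 * n + 1].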
    size_t index_peak = 0;
    float prev_magn_sqr = data[0] * data[0];
    float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
    float next_magn_sqr;
    bool found_peak = false;
    for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
      next_magn_sqr =
          data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
        found_peak = true;
        index_peak = n - 1;
        break;
      }
      prev_magn_sqr = curr_magn_sqr;
      curr_magn_sqr = next_magn_sqr;
    }
    float fractional_index = 0;
    if (!found_peak) {
      // Checking if |kNumDftCoefficients - 1| is the local minimum.
      next_magn_sqr = data[1] * data[1];
      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
        index_peak = kNumDftCoefficients - 1;
      }
    } else {
      // A peak was found; do a simple quadratic interpolation to get a more
      // accurate estimate of the peak location.
      fractional_index =
          QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
    }
    f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
  }
}

// Using iSAC functions to estimate pitch gains and lags.
void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
                                 double* pitch_lags_hz,
                                 size_t length) {
  // TODO(turajs): This can be "imported" from iSAC, as can the next two
  // constants.
  assert(length >= kNum10msSubframes);
  const int kNumPitchSubframes = 4;
  double gains[kNumPitchSubframes];
  double lags[kNumPitchSubframes];

  const int kNumSubbandFrameSamples = 240;
  const int kNumLookaheadSamples = 24;

  float lower[kNumSubbandFrameSamples];
  float upper[kNumSubbandFrameSamples];
  double lower_lookahead[kNumSubbandFrameSamples];
  double upper_lookahead[kNumSubbandFrameSamples];
  double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
                                    kNumLookaheadSamples];

  // Split the signal into lower and upper bands.
  WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
                                 upper, lower_lookahead, upper_lookahead,
                                 pre_filter_handle_.get());
  WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
                           pitch_analysis_handle_.get(), lags, gains);

  // Lags are computed on the lower-band signal, whose sampling rate is half
  // that of the input signal.
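  // GetSubframesPitchParameters (declared in pitch_internal.h) is expected to
  // map the kNumPitchSubframes gain/lag estimates onto one log pitch gain and
  // one pitch lag in Hz per 10 ms sub-frame, carrying state across calls in
  // log_old_gain_ and old_lag_.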
  GetSubframesPitchParameters(
      kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
      &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
}

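// Computes the RMS of each of the |kNum10msSubframes| buffered 10 ms
// sub-frames (the past-signal samples at the start of the buffer are
// skipped).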
void VadAudioProc::Rms(double* rms, size_t length_rms) {
  assert(length_rms >= kNum10msSubframes);
  size_t offset = kNumPastSignalSamples;
  for (size_t i = 0; i < kNum10msSubframes; i++) {
    rms[i] = 0;
    for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
      rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
    rms[i] = sqrt(rms[i] / kNumSubframeSamples);
  }
}

}  // namespace webrtc