Bump to WebRTC M120 release
Some APIs were removed upstream -- ExperimentalAgc and ExperimentalNs are gone. We're continuing to carry iSAC even though it has been removed upstream, but we may want to drop it soon.
This commit is contained in:
174
webrtc/modules/audio_processing/agc2/speech_level_estimator.cc
Normal file
174
webrtc/modules/audio_processing/agc2/speech_level_estimator.cc
Normal file
@ -0,0 +1,174 @@
|
||||
/*
|
||||
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "modules/audio_processing/agc2/speech_level_estimator.h"
|
||||
|
||||
#include "modules/audio_processing/agc2/agc2_common.h"
|
||||
#include "modules/audio_processing/logging/apm_data_dumper.h"
|
||||
#include "rtc_base/checks.h"
|
||||
#include "rtc_base/logging.h"
|
||||
#include "rtc_base/numerics/safe_minmax.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace {
|
||||
|
||||
float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
|
||||
return rtc::SafeClamp<float>(level_estimate_dbfs, -90.0f, 30.0f);
|
||||
}
|
||||
|
||||
// Returns the initial speech level estimate needed to apply the initial gain.
|
||||
float GetInitialSpeechLevelEstimateDbfs(
|
||||
const AudioProcessing::Config::GainController2::AdaptiveDigital& config) {
|
||||
return ClampLevelEstimateDbfs(-kSaturationProtectorInitialHeadroomDb -
|
||||
config.initial_gain_db - config.headroom_db);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool SpeechLevelEstimator::LevelEstimatorState::operator==(
|
||||
const SpeechLevelEstimator::LevelEstimatorState& b) const {
|
||||
return time_to_confidence_ms == b.time_to_confidence_ms &&
|
||||
level_dbfs.numerator == b.level_dbfs.numerator &&
|
||||
level_dbfs.denominator == b.level_dbfs.denominator;
|
||||
}
|
||||
|
||||
float SpeechLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
|
||||
RTC_DCHECK_NE(denominator, 0.f);
|
||||
return numerator / denominator;
|
||||
}
|
||||
|
||||
// Builds an estimator that starts from the initial speech level derived from
// `config`.
//
// `apm_data_dumper` is DCHECK-ed to be non-null and is stored (not copied).
// `adjacent_speech_frames_threshold` is DCHECK-ed to be at least 1; see
// `Update()` for how it gates the level estimate.
SpeechLevelEstimator::SpeechLevelEstimator(
    ApmDataDumper* apm_data_dumper,
    const AudioProcessing::Config::GainController2::AdaptiveDigital& config,
    int adjacent_speech_frames_threshold)
    : apm_data_dumper_(apm_data_dumper),
      initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)),
      adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
      level_dbfs_(initial_speech_level_dbfs_),
      // TODO(bugs.webrtc.org/7494): Drop the explicit initialization below
      // once the AGC2 input volume controller temporal dependency is removed.
      is_confident_(false) {
  RTC_DCHECK(apm_data_dumper_);
  RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
  // Bring the preliminary/reliable states, the level estimate and the speech
  // frame counter to their initial values.
  Reset();
}
|
||||
|
||||
void SpeechLevelEstimator::Update(float rms_dbfs,
|
||||
float peak_dbfs,
|
||||
float speech_probability) {
|
||||
RTC_DCHECK_GT(rms_dbfs, -150.0f);
|
||||
RTC_DCHECK_LT(rms_dbfs, 50.0f);
|
||||
RTC_DCHECK_GT(peak_dbfs, -150.0f);
|
||||
RTC_DCHECK_LT(peak_dbfs, 50.0f);
|
||||
RTC_DCHECK_GE(speech_probability, 0.0f);
|
||||
RTC_DCHECK_LE(speech_probability, 1.0f);
|
||||
if (speech_probability < kVadConfidenceThreshold) {
|
||||
// Not a speech frame.
|
||||
if (adjacent_speech_frames_threshold_ > 1) {
|
||||
// When two or more adjacent speech frames are required in order to update
|
||||
// the state, we need to decide whether to discard or confirm the updates
|
||||
// based on the speech sequence length.
|
||||
if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
|
||||
// First non-speech frame after a long enough sequence of speech frames.
|
||||
// Update the reliable state.
|
||||
reliable_state_ = preliminary_state_;
|
||||
} else if (num_adjacent_speech_frames_ > 0) {
|
||||
// First non-speech frame after a too short sequence of speech frames.
|
||||
// Reset to the last reliable state.
|
||||
preliminary_state_ = reliable_state_;
|
||||
}
|
||||
}
|
||||
num_adjacent_speech_frames_ = 0;
|
||||
} else {
|
||||
// Speech frame observed.
|
||||
num_adjacent_speech_frames_++;
|
||||
|
||||
// Update preliminary level estimate.
|
||||
RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
|
||||
const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0;
|
||||
if (!buffer_is_full) {
|
||||
preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
|
||||
}
|
||||
// Weighted average of levels with speech probability as weight.
|
||||
RTC_DCHECK_GT(speech_probability, 0.0f);
|
||||
const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.0f;
|
||||
preliminary_state_.level_dbfs.numerator =
|
||||
preliminary_state_.level_dbfs.numerator * leak_factor +
|
||||
rms_dbfs * speech_probability;
|
||||
preliminary_state_.level_dbfs.denominator =
|
||||
preliminary_state_.level_dbfs.denominator * leak_factor +
|
||||
speech_probability;
|
||||
|
||||
const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
|
||||
|
||||
if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
|
||||
// `preliminary_state_` is now reliable. Update the last level estimation.
|
||||
level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs);
|
||||
}
|
||||
}
|
||||
UpdateIsConfident();
|
||||
DumpDebugData();
|
||||
}
|
||||
|
||||
// Recomputes `is_confident_` from the time to confidence of the preliminary
// and reliable states.
void SpeechLevelEstimator::UpdateIsConfident() {
  const bool preliminary_is_confident =
      preliminary_state_.time_to_confidence_ms == 0;

  if (adjacent_speech_frames_threshold_ == 1) {
    // With a threshold of one frame `reliable_state_` is not used, hence only
    // the preliminary state matters.
    is_confident_ = preliminary_is_confident;
    return;
  }

  // Confidence is never lost: a confident reliable state implies a confident
  // preliminary state.
  RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
             preliminary_state_.time_to_confidence_ms == 0);

  const bool reliable_is_confident = reliable_state_.time_to_confidence_ms == 0;
  const bool in_long_enough_speech_sequence =
      num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_;

  // While the first long enough speech sequence is still ongoing, the reliable
  // state has not been updated yet; in that case the preliminary state is the
  // one being used and it decides the outcome.
  is_confident_ = reliable_is_confident || (in_long_enough_speech_sequence &&
                                            preliminary_is_confident);
}
|
||||
|
||||
// Restores the initial level estimate, clears the count of adjacent speech
// frames and re-initializes both the preliminary and the reliable states.
// `is_confident_` is not modified here.
void SpeechLevelEstimator::Reset() {
  num_adjacent_speech_frames_ = 0;
  level_dbfs_ = initial_speech_level_dbfs_;
  ResetLevelEstimatorState(preliminary_state_);
  ResetLevelEstimatorState(reliable_state_);
}
|
||||
|
||||
// Re-initializes `state`: the level ratio becomes
// `initial_speech_level_dbfs_ / 1` and the time to confidence is set to
// `kLevelEstimatorTimeToConfidenceMs`.
void SpeechLevelEstimator::ResetLevelEstimatorState(
    LevelEstimatorState& state) const {
  // A unit denominator makes the ratio evaluate to the initial level.
  state.level_dbfs.denominator = 1.0f;
  state.level_dbfs.numerator = initial_speech_level_dbfs_;
  state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
}
|
||||
|
||||
void SpeechLevelEstimator::DumpDebugData() const {
|
||||
if (!apm_data_dumper_)
|
||||
return;
|
||||
apm_data_dumper_->DumpRaw("agc2_speech_level_dbfs", level_dbfs_);
|
||||
apm_data_dumper_->DumpRaw("agc2_speech_level_is_confident", is_confident_);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_num_adjacent_speech_frames",
|
||||
num_adjacent_speech_frames_);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_preliminary_level_estimate_num",
|
||||
preliminary_state_.level_dbfs.numerator);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_preliminary_level_estimate_den",
|
||||
preliminary_state_.level_dbfs.denominator);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
|
||||
preliminary_state_.time_to_confidence_ms);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
|
||||
reliable_state_.time_to_confidence_ms);
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
Reference in New Issue
Block a user