Bump to WebRTC M120 release

Some API deprecation -- ExperimentalAgc and ExperimentalNs are gone.
We're continuing to carry iSAC even though it's gone upstream, but maybe
we'll want to drop that soon.
This commit is contained in:
Arun Raghavan
2023-12-12 10:42:58 -05:00
parent 9a202fb8c2
commit c6abf6cd3f
479 changed files with 20900 additions and 11996 deletions

View File

@ -18,6 +18,7 @@
#include <deque>
#include <limits>
#include <set>
#include <string>
#include "common_audio/include/audio_util.h"
#include "common_audio/signal_processing/include/signal_processing_library.h"
@ -32,7 +33,6 @@
namespace webrtc {
static const float kMeanIIRCoefficient = 0.5f;
static const float kVoiceThreshold = 0.02f;
// TODO(aluebs): Check if these values work also for 48kHz.
static const size_t kMinVoiceBin = 3;
@ -44,10 +44,27 @@ float ComplexMagnitude(float a, float b) {
return std::abs(a) + std::abs(b);
}
std::string GetVadModeLabel(TransientSuppressor::VadMode vad_mode) {
switch (vad_mode) {
case TransientSuppressor::VadMode::kDefault:
return "default";
case TransientSuppressor::VadMode::kRnnVad:
return "RNN VAD";
case TransientSuppressor::VadMode::kNoVad:
return "no VAD";
}
}
} // namespace
TransientSuppressorImpl::TransientSuppressorImpl()
: data_length_(0),
TransientSuppressorImpl::TransientSuppressorImpl(VadMode vad_mode,
int sample_rate_hz,
int detector_rate_hz,
int num_channels)
: vad_mode_(vad_mode),
voice_probability_delay_unit_(/*delay_num_samples=*/0, sample_rate_hz),
analyzed_audio_is_silent_(false),
data_length_(0),
detection_length_(0),
analysis_length_(0),
buffer_delay_(0),
@ -62,13 +79,26 @@ TransientSuppressorImpl::TransientSuppressorImpl()
use_hard_restoration_(false),
chunks_since_voice_change_(0),
seed_(182),
using_reference_(false) {}
using_reference_(false) {
RTC_LOG(LS_INFO) << "VAD mode: " << GetVadModeLabel(vad_mode_);
Initialize(sample_rate_hz, detector_rate_hz, num_channels);
}
TransientSuppressorImpl::~TransientSuppressorImpl() {}
int TransientSuppressorImpl::Initialize(int sample_rate_hz,
int detection_rate_hz,
int num_channels) {
void TransientSuppressorImpl::Initialize(int sample_rate_hz,
int detection_rate_hz,
int num_channels) {
RTC_DCHECK(sample_rate_hz == ts::kSampleRate8kHz ||
sample_rate_hz == ts::kSampleRate16kHz ||
sample_rate_hz == ts::kSampleRate32kHz ||
sample_rate_hz == ts::kSampleRate48kHz);
RTC_DCHECK(detection_rate_hz == ts::kSampleRate8kHz ||
detection_rate_hz == ts::kSampleRate16kHz ||
detection_rate_hz == ts::kSampleRate32kHz ||
detection_rate_hz == ts::kSampleRate48kHz);
RTC_DCHECK_GT(num_channels, 0);
switch (sample_rate_hz) {
case ts::kSampleRate8kHz:
analysis_length_ = 128u;
@ -87,26 +117,18 @@ int TransientSuppressorImpl::Initialize(int sample_rate_hz,
window_ = kBlocks480w1024;
break;
default:
return -1;
}
if (detection_rate_hz != ts::kSampleRate8kHz &&
detection_rate_hz != ts::kSampleRate16kHz &&
detection_rate_hz != ts::kSampleRate32kHz &&
detection_rate_hz != ts::kSampleRate48kHz) {
return -1;
}
if (num_channels <= 0) {
return -1;
RTC_DCHECK_NOTREACHED();
return;
}
detector_.reset(new TransientDetector(detection_rate_hz));
data_length_ = sample_rate_hz * ts::kChunkSizeMs / 1000;
if (data_length_ > analysis_length_) {
RTC_NOTREACHED();
return -1;
}
RTC_DCHECK_LE(data_length_, analysis_length_);
buffer_delay_ = analysis_length_ - data_length_;
voice_probability_delay_unit_.Initialize(/*delay_num_samples=*/buffer_delay_,
sample_rate_hz);
complex_analysis_length_ = analysis_length_ / 2 + 1;
RTC_DCHECK_GE(complex_analysis_length_, kMaxVoiceBin);
num_channels_ = num_channels;
@ -155,28 +177,28 @@ int TransientSuppressorImpl::Initialize(int sample_rate_hz,
chunks_since_voice_change_ = 0;
seed_ = 182;
using_reference_ = false;
return 0;
}
int TransientSuppressorImpl::Suppress(float* data,
size_t data_length,
int num_channels,
const float* detection_data,
size_t detection_length,
const float* reference_data,
size_t reference_length,
float voice_probability,
bool key_pressed) {
float TransientSuppressorImpl::Suppress(float* data,
size_t data_length,
int num_channels,
const float* detection_data,
size_t detection_length,
const float* reference_data,
size_t reference_length,
float voice_probability,
bool key_pressed) {
if (!data || data_length != data_length_ || num_channels != num_channels_ ||
detection_length != detection_length_ || voice_probability < 0 ||
voice_probability > 1) {
return -1;
// The audio is not modified, so the voice probability is returned as is
// (delay not applied).
return voice_probability;
}
UpdateKeypress(key_pressed);
UpdateBuffers(data);
int result = 0;
if (detection_enabled_) {
UpdateRestoration(voice_probability);
@ -189,12 +211,14 @@ int TransientSuppressorImpl::Suppress(float* data,
float detector_result = detector_->Detect(detection_data, detection_length,
reference_data, reference_length);
if (detector_result < 0) {
return -1;
// The audio is not modified, so the voice probability is returned as is
// (delay not applied).
return voice_probability;
}
using_reference_ = detector_->using_reference();
// |detector_smoothed_| follows the |detector_result| when this last one is
// `detector_smoothed_` follows the `detector_result` when this last one is
// increasing, but has an exponential decaying tail to be able to suppress
// the ringing of keyclicks.
float smooth_factor = using_reference_ ? 0.6 : 0.1;
@ -219,11 +243,13 @@ int TransientSuppressorImpl::Suppress(float* data,
: &in_buffer_[i * analysis_length_],
data_length_ * sizeof(*data));
}
return result;
// The audio has been modified, return the delayed voice probability.
return voice_probability_delay_unit_.Delay(voice_probability);
}
// This should only be called when detection is enabled. UpdateBuffers() must
// have been called. At return, |out_buffer_| will be filled with the
// have been called. At return, `out_buffer_` will be filled with the
// processed output.
void TransientSuppressorImpl::Suppress(float* in_ptr,
float* spectral_mean,
@ -304,16 +330,34 @@ void TransientSuppressorImpl::UpdateKeypress(bool key_pressed) {
}
void TransientSuppressorImpl::UpdateRestoration(float voice_probability) {
const int kHardRestorationOffsetDelay = 3;
const int kHardRestorationOnsetDelay = 80;
bool not_voiced = voice_probability < kVoiceThreshold;
bool not_voiced;
switch (vad_mode_) {
case TransientSuppressor::VadMode::kDefault: {
constexpr float kVoiceThreshold = 0.02f;
not_voiced = voice_probability < kVoiceThreshold;
break;
}
case TransientSuppressor::VadMode::kRnnVad: {
constexpr float kVoiceThreshold = 0.7f;
not_voiced = voice_probability < kVoiceThreshold;
break;
}
case TransientSuppressor::VadMode::kNoVad:
// Always assume that voice is detected.
not_voiced = false;
break;
}
if (not_voiced == use_hard_restoration_) {
chunks_since_voice_change_ = 0;
} else {
++chunks_since_voice_change_;
// Number of 10 ms frames to wait to transition to and from hard
// restoration.
constexpr int kHardRestorationOffsetDelay = 3;
constexpr int kHardRestorationOnsetDelay = 80;
if ((use_hard_restoration_ &&
chunks_since_voice_change_ > kHardRestorationOffsetDelay) ||
(!use_hard_restoration_ &&
@ -325,7 +369,7 @@ void TransientSuppressorImpl::UpdateRestoration(float voice_probability) {
}
// Shift buffers to make way for new data. Must be called after
// |detection_enabled_| is updated by UpdateKeypress().
// `detection_enabled_` is updated by UpdateKeypress().
void TransientSuppressorImpl::UpdateBuffers(float* data) {
// TODO(aluebs): Change to ring buffer.
memmove(in_buffer_.get(), &in_buffer_[data_length_],
@ -350,9 +394,9 @@ void TransientSuppressorImpl::UpdateBuffers(float* data) {
}
// Restores the unvoiced signal if a click is present.
// Attenuates by a certain factor every peak in the |fft_buffer_| that exceeds
// the spectral mean. The attenuation depends on |detector_smoothed_|.
// If a restoration takes place, the |magnitudes_| are updated to the new value.
// Attenuates by a certain factor every peak in the `fft_buffer_` that exceeds
// the spectral mean. The attenuation depends on `detector_smoothed_`.
// If a restoration takes place, the `magnitudes_` are updated to the new value.
void TransientSuppressorImpl::HardRestoration(float* spectral_mean) {
const float detector_result =
1.f - std::pow(1.f - detector_smoothed_, using_reference_ ? 200.f : 50.f);
@ -376,10 +420,10 @@ void TransientSuppressorImpl::HardRestoration(float* spectral_mean) {
}
// Restores the voiced signal if a click is present.
// Attenuates by a certain factor every peak in the |fft_buffer_| that exceeds
// Attenuates by a certain factor every peak in the `fft_buffer_` that exceeds
// the spectral mean and that is lower than some function of the current block
// frequency mean. The attenuation depends on |detector_smoothed_|.
// If a restoration takes place, the |magnitudes_| are updated to the new value.
// frequency mean. The attenuation depends on `detector_smoothed_`.
// If a restoration takes place, the `magnitudes_` are updated to the new value.
void TransientSuppressorImpl::SoftRestoration(float* spectral_mean) {
// Get the spectral magnitude mean of the current block.
float block_frequency_mean = 0;