Update to current webrtc library

This is from the upstream library commit id
3326535126e435f1ba647885ce43a8f0f3d317eb, corresponding to Chromium
88.0.4290.1.
This commit is contained in:
Arun Raghavan
2020-10-12 18:08:02 -04:00
parent b1b02581d3
commit bcec8b0b21
859 changed files with 76187 additions and 49580 deletions

View File

@ -0,0 +1,233 @@
# Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
import("../../../../webrtc.gni")
rtc_library("rnn_vad") {
visibility = [ "../*" ]
sources = [
"features_extraction.cc",
"features_extraction.h",
"rnn.cc",
"rnn.h",
]
if (rtc_build_with_neon && current_cpu != "arm64") {
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [ "-mfpu=neon" ]
}
deps = [
":rnn_vad_common",
":rnn_vad_lp_residual",
":rnn_vad_pitch",
":rnn_vad_sequence_buffer",
":rnn_vad_spectral_features",
"..:biquad_filter",
"../../../../api:array_view",
"../../../../api:function_view",
"../../../../rtc_base:checks",
"../../../../rtc_base:logging",
"../../../../rtc_base/system:arch",
"//third_party/rnnoise:rnn_vad",
]
}
rtc_library("rnn_vad_auto_correlation") {
sources = [
"auto_correlation.cc",
"auto_correlation.h",
]
deps = [
":rnn_vad_common",
"../../../../api:array_view",
"../../../../rtc_base:checks",
"../../utility:pffft_wrapper",
]
}
rtc_library("rnn_vad_common") {
# TODO(alessiob): Make this target visibility private.
visibility = [
":*",
"..:rnn_vad_with_level",
]
sources = [
"common.cc",
"common.h",
]
deps = [
"../../../../rtc_base/system:arch",
"../../../../system_wrappers",
]
}
rtc_library("rnn_vad_lp_residual") {
sources = [
"lp_residual.cc",
"lp_residual.h",
]
deps = [
"../../../../api:array_view",
"../../../../rtc_base:checks",
]
}
rtc_library("rnn_vad_pitch") {
sources = [
"pitch_info.h",
"pitch_search.cc",
"pitch_search.h",
"pitch_search_internal.cc",
"pitch_search_internal.h",
]
deps = [
":rnn_vad_auto_correlation",
":rnn_vad_common",
"../../../../api:array_view",
"../../../../rtc_base:checks",
]
}
rtc_source_set("rnn_vad_ring_buffer") {
sources = [ "ring_buffer.h" ]
deps = [
"../../../../api:array_view",
"../../../../rtc_base:checks",
]
}
rtc_source_set("rnn_vad_sequence_buffer") {
sources = [ "sequence_buffer.h" ]
deps = [
"../../../../api:array_view",
"../../../../rtc_base:checks",
]
}
rtc_library("rnn_vad_spectral_features") {
sources = [
"spectral_features.cc",
"spectral_features.h",
"spectral_features_internal.cc",
"spectral_features_internal.h",
]
deps = [
":rnn_vad_common",
":rnn_vad_ring_buffer",
":rnn_vad_symmetric_matrix_buffer",
"../../../../api:array_view",
"../../../../rtc_base:checks",
"../../utility:pffft_wrapper",
]
}
rtc_source_set("rnn_vad_symmetric_matrix_buffer") {
sources = [ "symmetric_matrix_buffer.h" ]
deps = [
"../../../../api:array_view",
"../../../../rtc_base:checks",
]
}
if (rtc_include_tests) {
rtc_library("test_utils") {
testonly = true
sources = [
"test_utils.cc",
"test_utils.h",
]
deps = [
":rnn_vad",
":rnn_vad_common",
"../../../../api:array_view",
"../../../../api:scoped_refptr",
"../../../../rtc_base:checks",
"../../../../rtc_base/system:arch",
"../../../../system_wrappers",
"../../../../test:fileutils",
"../../../../test:test_support",
]
}
unittest_resources = [
"../../../../resources/audio_processing/agc2/rnn_vad/band_energies.dat",
"../../../../resources/audio_processing/agc2/rnn_vad/pitch_buf_24k.dat",
"../../../../resources/audio_processing/agc2/rnn_vad/pitch_lp_res.dat",
"../../../../resources/audio_processing/agc2/rnn_vad/pitch_search_int.dat",
"../../../../resources/audio_processing/agc2/rnn_vad/samples.pcm",
"../../../../resources/audio_processing/agc2/rnn_vad/vad_prob.dat",
]
if (is_ios) {
bundle_data("unittests_bundle_data") {
testonly = true
sources = unittest_resources
outputs = [ "{{bundle_resources_dir}}/{{source_file_part}}" ]
}
}
rtc_library("unittests") {
testonly = true
sources = [
"auto_correlation_unittest.cc",
"features_extraction_unittest.cc",
"lp_residual_unittest.cc",
"pitch_search_internal_unittest.cc",
"pitch_search_unittest.cc",
"ring_buffer_unittest.cc",
"rnn_unittest.cc",
"rnn_vad_unittest.cc",
"sequence_buffer_unittest.cc",
"spectral_features_internal_unittest.cc",
"spectral_features_unittest.cc",
"symmetric_matrix_buffer_unittest.cc",
]
deps = [
":rnn_vad",
":rnn_vad_auto_correlation",
":rnn_vad_common",
":rnn_vad_lp_residual",
":rnn_vad_pitch",
":rnn_vad_ring_buffer",
":rnn_vad_sequence_buffer",
":rnn_vad_spectral_features",
":rnn_vad_symmetric_matrix_buffer",
":test_utils",
"../..:audioproc_test_utils",
"../../../../api:array_view",
"../../../../common_audio/",
"../../../../rtc_base:checks",
"../../../../rtc_base:logging",
"../../../../rtc_base/system:arch",
"../../../../test:test_support",
"../../utility:pffft_wrapper",
"//third_party/rnnoise:rnn_vad",
]
absl_deps = [ "//third_party/abseil-cpp/absl/memory" ]
data = unittest_resources
if (is_ios) {
deps += [ ":unittests_bundle_data" ]
}
}
rtc_executable("rnn_vad_tool") {
testonly = true
sources = [ "rnn_vad_tool.cc" ]
deps = [
":rnn_vad",
":rnn_vad_common",
"../../../../api:array_view",
"../../../../common_audio",
"../../../../rtc_base:rtc_base_approved",
"../../../../test:test_support",
"//third_party/abseil-cpp/absl/flags:flag",
"//third_party/abseil-cpp/absl/flags:parse",
]
}
}

View File

@ -0,0 +1,92 @@
/*
* Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/auto_correlation.h"
#include <algorithm>
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
namespace {
constexpr int kAutoCorrelationFftOrder = 9; // Length-512 FFT.
static_assert(1 << kAutoCorrelationFftOrder >
kNumInvertedLags12kHz + kBufSize12kHz - kMaxPitch12kHz,
"");
} // namespace
AutoCorrelationCalculator::AutoCorrelationCalculator()
: fft_(1 << kAutoCorrelationFftOrder, Pffft::FftType::kReal),
tmp_(fft_.CreateBuffer()),
X_(fft_.CreateBuffer()),
H_(fft_.CreateBuffer()) {}
AutoCorrelationCalculator::~AutoCorrelationCalculator() = default;
// The auto-correlations coefficients are computed as follows:
// |.........|...........| <- pitch buffer
// [ x (fixed) ]
// [ y_0 ]
// [ y_{m-1} ]
// x and y are sub-array of equal length; x is never moved, whereas y slides.
// The cross-correlation between y_0 and x corresponds to the auto-correlation
// for the maximum pitch period. Hence, the first value in |auto_corr| has an
// inverted lag equal to 0 that corresponds to a lag equal to the maximum
// pitch period.
void AutoCorrelationCalculator::ComputeOnPitchBuffer(
rtc::ArrayView<const float, kBufSize12kHz> pitch_buf,
rtc::ArrayView<float, kNumInvertedLags12kHz> auto_corr) {
RTC_DCHECK_LT(auto_corr.size(), kMaxPitch12kHz);
RTC_DCHECK_GT(pitch_buf.size(), kMaxPitch12kHz);
constexpr size_t kFftFrameSize = 1 << kAutoCorrelationFftOrder;
constexpr size_t kConvolutionLength = kBufSize12kHz - kMaxPitch12kHz;
static_assert(kConvolutionLength == kFrameSize20ms12kHz,
"Mismatch between pitch buffer size, frame size and maximum "
"pitch period.");
static_assert(kFftFrameSize > kNumInvertedLags12kHz + kConvolutionLength,
"The FFT length is not sufficiently big to avoid cyclic "
"convolution errors.");
auto tmp = tmp_->GetView();
// Compute the FFT for the reversed reference frame - i.e.,
// pitch_buf[-kConvolutionLength:].
std::reverse_copy(pitch_buf.end() - kConvolutionLength, pitch_buf.end(),
tmp.begin());
std::fill(tmp.begin() + kConvolutionLength, tmp.end(), 0.f);
fft_.ForwardTransform(*tmp_, H_.get(), /*ordered=*/false);
// Compute the FFT for the sliding frames chunk. The sliding frames are
// defined as pitch_buf[i:i+kConvolutionLength] where i in
// [0, kNumInvertedLags12kHz). The chunk includes all of them, hence it is
// defined as pitch_buf[:kNumInvertedLags12kHz+kConvolutionLength].
std::copy(pitch_buf.begin(),
pitch_buf.begin() + kConvolutionLength + kNumInvertedLags12kHz,
tmp.begin());
std::fill(tmp.begin() + kNumInvertedLags12kHz + kConvolutionLength, tmp.end(),
0.f);
fft_.ForwardTransform(*tmp_, X_.get(), /*ordered=*/false);
// Convolve in the frequency domain.
constexpr float kScalingFactor = 1.f / static_cast<float>(kFftFrameSize);
std::fill(tmp.begin(), tmp.end(), 0.f);
fft_.FrequencyDomainConvolve(*X_, *H_, tmp_.get(), kScalingFactor);
fft_.BackwardTransform(*tmp_, tmp_.get(), /*ordered=*/false);
// Extract the auto-correlation coefficients.
std::copy(tmp.begin() + kConvolutionLength - 1,
tmp.begin() + kConvolutionLength + kNumInvertedLags12kHz - 1,
auto_corr.begin());
}
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,49 @@
/*
* Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_AUTO_CORRELATION_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_AUTO_CORRELATION_H_
#include <memory>
#include "api/array_view.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "modules/audio_processing/utility/pffft_wrapper.h"
namespace webrtc {
namespace rnn_vad {
// Class to compute the auto correlation on the pitch buffer for a target pitch
// interval.
class AutoCorrelationCalculator {
public:
AutoCorrelationCalculator();
AutoCorrelationCalculator(const AutoCorrelationCalculator&) = delete;
AutoCorrelationCalculator& operator=(const AutoCorrelationCalculator&) =
delete;
~AutoCorrelationCalculator();
// Computes the auto-correlation coefficients for a target pitch interval.
// |auto_corr| indexes are inverted lags.
void ComputeOnPitchBuffer(
rtc::ArrayView<const float, kBufSize12kHz> pitch_buf,
rtc::ArrayView<float, kNumInvertedLags12kHz> auto_corr);
private:
Pffft fft_;
std::unique_ptr<Pffft::FloatBuffer> tmp_;
std::unique_ptr<Pffft::FloatBuffer> X_;
std::unique_ptr<Pffft::FloatBuffer> H_;
};
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_AUTO_CORRELATION_H_

View File

@ -0,0 +1,34 @@
/*
* Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "rtc_base/system/arch.h"
#include "system_wrappers/include/cpu_features_wrapper.h"
namespace webrtc {
namespace rnn_vad {
Optimization DetectOptimization() {
#if defined(WEBRTC_ARCH_X86_FAMILY)
if (GetCPUInfo(kSSE2) != 0) {
return Optimization::kSse2;
}
#endif
#if defined(WEBRTC_HAS_NEON)
return Optimization::kNeon;
#endif
return Optimization::kNone;
}
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_COMMON_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_COMMON_H_
#include <stddef.h>
namespace webrtc {
namespace rnn_vad {
constexpr double kPi = 3.14159265358979323846;
constexpr size_t kSampleRate24kHz = 24000;
constexpr size_t kFrameSize10ms24kHz = kSampleRate24kHz / 100;
constexpr size_t kFrameSize20ms24kHz = kFrameSize10ms24kHz * 2;
// Pitch buffer.
constexpr size_t kMinPitch24kHz = kSampleRate24kHz / 800; // 0.00125 s.
constexpr size_t kMaxPitch24kHz = kSampleRate24kHz / 62.5; // 0.016 s.
constexpr size_t kBufSize24kHz = kMaxPitch24kHz + kFrameSize20ms24kHz;
static_assert((kBufSize24kHz & 1) == 0, "The buffer size must be even.");
// 24 kHz analysis.
// Define a higher minimum pitch period for the initial search. This is used to
// avoid searching for very short periods, for which a refinement step is
// responsible.
constexpr size_t kInitialMinPitch24kHz = 3 * kMinPitch24kHz;
static_assert(kMinPitch24kHz < kInitialMinPitch24kHz, "");
static_assert(kInitialMinPitch24kHz < kMaxPitch24kHz, "");
static_assert(kMaxPitch24kHz > kInitialMinPitch24kHz, "");
constexpr size_t kNumInvertedLags24kHz = kMaxPitch24kHz - kInitialMinPitch24kHz;
// 12 kHz analysis.
constexpr size_t kSampleRate12kHz = 12000;
constexpr size_t kFrameSize10ms12kHz = kSampleRate12kHz / 100;
constexpr size_t kFrameSize20ms12kHz = kFrameSize10ms12kHz * 2;
constexpr size_t kBufSize12kHz = kBufSize24kHz / 2;
constexpr size_t kInitialMinPitch12kHz = kInitialMinPitch24kHz / 2;
constexpr size_t kMaxPitch12kHz = kMaxPitch24kHz / 2;
static_assert(kMaxPitch12kHz > kInitialMinPitch12kHz, "");
// The inverted lags for the pitch interval [|kInitialMinPitch12kHz|,
// |kMaxPitch12kHz|] are in the range [0, |kNumInvertedLags12kHz|].
constexpr size_t kNumInvertedLags12kHz = kMaxPitch12kHz - kInitialMinPitch12kHz;
// 48 kHz constants.
constexpr size_t kMinPitch48kHz = kMinPitch24kHz * 2;
constexpr size_t kMaxPitch48kHz = kMaxPitch24kHz * 2;
// Spectral features.
constexpr size_t kNumBands = 22;
constexpr size_t kNumLowerBands = 6;
static_assert((0 < kNumLowerBands) && (kNumLowerBands < kNumBands), "");
constexpr size_t kCepstralCoeffsHistorySize = 8;
static_assert(kCepstralCoeffsHistorySize > 2,
"The history size must at least be 3 to compute first and second "
"derivatives.");
constexpr size_t kFeatureVectorSize = 42;
enum class Optimization { kNone, kSse2, kNeon };
// Detects what kind of optimizations to use for the code.
Optimization DetectOptimization();
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_COMMON_H_

View File

@ -0,0 +1,90 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/features_extraction.h"
#include <array>
#include "modules/audio_processing/agc2/rnn_vad/lp_residual.h"
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
namespace {
// Generated via "B, A = scipy.signal.butter(2, 30/12000, btype='highpass')"
const BiQuadFilter::BiQuadCoefficients kHpfConfig24k = {
{0.99446179f, -1.98892358f, 0.99446179f},
{-1.98889291f, 0.98895425f}};
} // namespace
FeaturesExtractor::FeaturesExtractor()
: use_high_pass_filter_(false),
pitch_buf_24kHz_(),
pitch_buf_24kHz_view_(pitch_buf_24kHz_.GetBufferView()),
lp_residual_(kBufSize24kHz),
lp_residual_view_(lp_residual_.data(), kBufSize24kHz),
pitch_estimator_(),
reference_frame_view_(pitch_buf_24kHz_.GetMostRecentValuesView()) {
RTC_DCHECK_EQ(kBufSize24kHz, lp_residual_.size());
hpf_.Initialize(kHpfConfig24k);
Reset();
}
FeaturesExtractor::~FeaturesExtractor() = default;
void FeaturesExtractor::Reset() {
pitch_buf_24kHz_.Reset();
spectral_features_extractor_.Reset();
if (use_high_pass_filter_)
hpf_.Reset();
}
bool FeaturesExtractor::CheckSilenceComputeFeatures(
rtc::ArrayView<const float, kFrameSize10ms24kHz> samples,
rtc::ArrayView<float, kFeatureVectorSize> feature_vector) {
// Pre-processing.
if (use_high_pass_filter_) {
std::array<float, kFrameSize10ms24kHz> samples_filtered;
hpf_.Process(samples, samples_filtered);
// Feed buffer with the pre-processed version of |samples|.
pitch_buf_24kHz_.Push(samples_filtered);
} else {
// Feed buffer with |samples|.
pitch_buf_24kHz_.Push(samples);
}
// Extract the LP residual.
float lpc_coeffs[kNumLpcCoefficients];
ComputeAndPostProcessLpcCoefficients(pitch_buf_24kHz_view_, lpc_coeffs);
ComputeLpResidual(lpc_coeffs, pitch_buf_24kHz_view_, lp_residual_view_);
// Estimate pitch on the LP-residual and write the normalized pitch period
// into the output vector (normalization based on training data stats).
pitch_info_48kHz_ = pitch_estimator_.Estimate(lp_residual_view_);
feature_vector[kFeatureVectorSize - 2] =
0.01f * (static_cast<int>(pitch_info_48kHz_.period) - 300);
// Extract lagged frames (according to the estimated pitch period).
RTC_DCHECK_LE(pitch_info_48kHz_.period / 2, kMaxPitch24kHz);
auto lagged_frame = pitch_buf_24kHz_view_.subview(
kMaxPitch24kHz - pitch_info_48kHz_.period / 2, kFrameSize20ms24kHz);
// Analyze reference and lagged frames checking if silence has been detected
// and write the feature vector.
return spectral_features_extractor_.CheckSilenceComputeFeatures(
reference_frame_view_, {lagged_frame.data(), kFrameSize20ms24kHz},
{feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands},
{feature_vector.data(), kNumLowerBands},
{feature_vector.data() + kNumBands, kNumLowerBands},
{feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands},
{feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
&feature_vector[kFeatureVectorSize - 1]);
}
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,62 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_FEATURES_EXTRACTION_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_FEATURES_EXTRACTION_H_
#include <vector>
#include "api/array_view.h"
#include "modules/audio_processing/agc2/biquad_filter.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "modules/audio_processing/agc2/rnn_vad/pitch_info.h"
#include "modules/audio_processing/agc2/rnn_vad/pitch_search.h"
#include "modules/audio_processing/agc2/rnn_vad/sequence_buffer.h"
#include "modules/audio_processing/agc2/rnn_vad/spectral_features.h"
namespace webrtc {
namespace rnn_vad {
// Feature extractor to feed the VAD RNN.
class FeaturesExtractor {
public:
FeaturesExtractor();
FeaturesExtractor(const FeaturesExtractor&) = delete;
FeaturesExtractor& operator=(const FeaturesExtractor&) = delete;
~FeaturesExtractor();
void Reset();
// Analyzes the samples, computes the feature vector and returns true if
// silence is detected (false if not). When silence is detected,
// |feature_vector| is partially written and therefore must not be used to
// feed the VAD RNN.
bool CheckSilenceComputeFeatures(
rtc::ArrayView<const float, kFrameSize10ms24kHz> samples,
rtc::ArrayView<float, kFeatureVectorSize> feature_vector);
private:
const bool use_high_pass_filter_;
// TODO(bugs.webrtc.org/7494): Remove HPF depending on how AGC2 is used in APM
// and on whether an HPF is already used as pre-processing step in APM.
BiQuadFilter hpf_;
SequenceBuffer<float, kBufSize24kHz, kFrameSize10ms24kHz, kFrameSize20ms24kHz>
pitch_buf_24kHz_;
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf_24kHz_view_;
std::vector<float> lp_residual_;
rtc::ArrayView<float, kBufSize24kHz> lp_residual_view_;
PitchEstimator pitch_estimator_;
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame_view_;
SpectralFeaturesExtractor spectral_features_extractor_;
PitchInfo pitch_info_48kHz_;
};
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_FEATURES_EXTRACTION_H_

View File

@ -0,0 +1,138 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/lp_residual.h"
#include <algorithm>
#include <array>
#include <cmath>
#include <numeric>
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
namespace {
// Computes cross-correlation coefficients between |x| and |y| and writes them
// in |x_corr|. The lag values are in {0, ..., max_lag - 1}, where max_lag
// equals the size of |x_corr|.
// The |x| and |y| sub-arrays used to compute a cross-correlation coefficients
// for a lag l have both size "size of |x| - l" - i.e., the longest sub-array is
// used. |x| and |y| must have the same size.
void ComputeCrossCorrelation(
rtc::ArrayView<const float> x,
rtc::ArrayView<const float> y,
rtc::ArrayView<float, kNumLpcCoefficients> x_corr) {
constexpr size_t max_lag = x_corr.size();
RTC_DCHECK_EQ(x.size(), y.size());
RTC_DCHECK_LT(max_lag, x.size());
for (size_t lag = 0; lag < max_lag; ++lag) {
x_corr[lag] =
std::inner_product(x.begin(), x.end() - lag, y.begin() + lag, 0.f);
}
}
// Applies denoising to the auto-correlation coefficients.
void DenoiseAutoCorrelation(
rtc::ArrayView<float, kNumLpcCoefficients> auto_corr) {
// Assume -40 dB white noise floor.
auto_corr[0] *= 1.0001f;
for (size_t i = 1; i < kNumLpcCoefficients; ++i) {
auto_corr[i] -= auto_corr[i] * (0.008f * i) * (0.008f * i);
}
}
// Computes the initial inverse filter coefficients given the auto-correlation
// coefficients of an input frame.
void ComputeInitialInverseFilterCoefficients(
rtc::ArrayView<const float, kNumLpcCoefficients> auto_corr,
rtc::ArrayView<float, kNumLpcCoefficients - 1> lpc_coeffs) {
float error = auto_corr[0];
for (size_t i = 0; i < kNumLpcCoefficients - 1; ++i) {
float reflection_coeff = 0.f;
for (size_t j = 0; j < i; ++j) {
reflection_coeff += lpc_coeffs[j] * auto_corr[i - j];
}
reflection_coeff += auto_corr[i + 1];
// Avoid division by numbers close to zero.
constexpr float kMinErrorMagnitude = 1e-6f;
if (std::fabs(error) < kMinErrorMagnitude) {
error = std::copysign(kMinErrorMagnitude, error);
}
reflection_coeff /= -error;
// Update LPC coefficients and total error.
lpc_coeffs[i] = reflection_coeff;
for (size_t j = 0; j<(i + 1)>> 1; ++j) {
const float tmp1 = lpc_coeffs[j];
const float tmp2 = lpc_coeffs[i - 1 - j];
lpc_coeffs[j] = tmp1 + reflection_coeff * tmp2;
lpc_coeffs[i - 1 - j] = tmp2 + reflection_coeff * tmp1;
}
error -= reflection_coeff * reflection_coeff * error;
if (error < 0.001f * auto_corr[0]) {
break;
}
}
}
} // namespace
void ComputeAndPostProcessLpcCoefficients(
rtc::ArrayView<const float> x,
rtc::ArrayView<float, kNumLpcCoefficients> lpc_coeffs) {
std::array<float, kNumLpcCoefficients> auto_corr;
ComputeCrossCorrelation(x, x, {auto_corr.data(), auto_corr.size()});
if (auto_corr[0] == 0.f) { // Empty frame.
std::fill(lpc_coeffs.begin(), lpc_coeffs.end(), 0);
return;
}
DenoiseAutoCorrelation({auto_corr.data(), auto_corr.size()});
std::array<float, kNumLpcCoefficients - 1> lpc_coeffs_pre{};
ComputeInitialInverseFilterCoefficients(auto_corr, lpc_coeffs_pre);
// LPC coefficients post-processing.
// TODO(bugs.webrtc.org/9076): Consider removing these steps.
float c1 = 1.f;
for (size_t i = 0; i < kNumLpcCoefficients - 1; ++i) {
c1 *= 0.9f;
lpc_coeffs_pre[i] *= c1;
}
const float c2 = 0.8f;
lpc_coeffs[0] = lpc_coeffs_pre[0] + c2;
lpc_coeffs[1] = lpc_coeffs_pre[1] + c2 * lpc_coeffs_pre[0];
lpc_coeffs[2] = lpc_coeffs_pre[2] + c2 * lpc_coeffs_pre[1];
lpc_coeffs[3] = lpc_coeffs_pre[3] + c2 * lpc_coeffs_pre[2];
lpc_coeffs[4] = c2 * lpc_coeffs_pre[3];
}
void ComputeLpResidual(
rtc::ArrayView<const float, kNumLpcCoefficients> lpc_coeffs,
rtc::ArrayView<const float> x,
rtc::ArrayView<float> y) {
RTC_DCHECK_LT(kNumLpcCoefficients, x.size());
RTC_DCHECK_EQ(x.size(), y.size());
std::array<float, kNumLpcCoefficients> input_chunk;
input_chunk.fill(0.f);
for (size_t i = 0; i < y.size(); ++i) {
const float sum = std::inner_product(input_chunk.begin(), input_chunk.end(),
lpc_coeffs.begin(), x[i]);
// Circular shift and add a new sample.
for (size_t j = kNumLpcCoefficients - 1; j > 0; --j)
input_chunk[j] = input_chunk[j - 1];
input_chunk[0] = x[i];
// Copy result.
y[i] = sum;
}
}
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,41 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_LP_RESIDUAL_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_LP_RESIDUAL_H_
#include <stddef.h>
#include "api/array_view.h"
namespace webrtc {
namespace rnn_vad {
// LPC inverse filter length.
constexpr size_t kNumLpcCoefficients = 5;
// Given a frame |x|, computes a post-processed version of LPC coefficients
// tailored for pitch estimation.
void ComputeAndPostProcessLpcCoefficients(
rtc::ArrayView<const float> x,
rtc::ArrayView<float, kNumLpcCoefficients> lpc_coeffs);
// Computes the LP residual for the input frame |x| and the LPC coefficients
// |lpc_coeffs|. |y| and |x| can point to the same array for in-place
// computation.
void ComputeLpResidual(
rtc::ArrayView<const float, kNumLpcCoefficients> lpc_coeffs,
rtc::ArrayView<const float> x,
rtc::ArrayView<float> y);
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_LP_RESIDUAL_H_

View File

@ -0,0 +1,29 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_INFO_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_INFO_H_
namespace webrtc {
namespace rnn_vad {
// Stores pitch period and gain information. The pitch gain measures the
// strength of the pitch (the higher, the stronger).
struct PitchInfo {
PitchInfo() : period(0), gain(0.f) {}
PitchInfo(int p, float g) : period(p), gain(g) {}
int period;
float gain;
};
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_INFO_H_

View File

@ -0,0 +1,56 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/pitch_search.h"
#include <array>
#include <cstddef>
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
PitchEstimator::PitchEstimator()
: pitch_buf_decimated_(kBufSize12kHz),
pitch_buf_decimated_view_(pitch_buf_decimated_.data(), kBufSize12kHz),
auto_corr_(kNumInvertedLags12kHz),
auto_corr_view_(auto_corr_.data(), kNumInvertedLags12kHz) {
RTC_DCHECK_EQ(kBufSize12kHz, pitch_buf_decimated_.size());
RTC_DCHECK_EQ(kNumInvertedLags12kHz, auto_corr_view_.size());
}
PitchEstimator::~PitchEstimator() = default;
PitchInfo PitchEstimator::Estimate(
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf) {
// Perform the initial pitch search at 12 kHz.
Decimate2x(pitch_buf, pitch_buf_decimated_view_);
auto_corr_calculator_.ComputeOnPitchBuffer(pitch_buf_decimated_view_,
auto_corr_view_);
std::array<size_t, 2> pitch_candidates_inv_lags = FindBestPitchPeriods(
auto_corr_view_, pitch_buf_decimated_view_, kMaxPitch12kHz);
// Refine the pitch period estimation.
// The refinement is done using the pitch buffer that contains 24 kHz samples.
// Therefore, adapt the inverted lags in |pitch_candidates_inv_lags| from 12
// to 24 kHz.
pitch_candidates_inv_lags[0] *= 2;
pitch_candidates_inv_lags[1] *= 2;
size_t pitch_inv_lag_48kHz =
RefinePitchPeriod48kHz(pitch_buf, pitch_candidates_inv_lags);
// Look for stronger harmonics to find the final pitch period and its gain.
RTC_DCHECK_LT(pitch_inv_lag_48kHz, kMaxPitch48kHz);
last_pitch_48kHz_ = CheckLowerPitchPeriodsAndComputePitchGain(
pitch_buf, kMaxPitch48kHz - pitch_inv_lag_48kHz, last_pitch_48kHz_);
return last_pitch_48kHz_;
}
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,49 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_H_
#include <memory>
#include <vector>
#include "api/array_view.h"
#include "modules/audio_processing/agc2/rnn_vad/auto_correlation.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "modules/audio_processing/agc2/rnn_vad/pitch_info.h"
#include "modules/audio_processing/agc2/rnn_vad/pitch_search_internal.h"
namespace webrtc {
namespace rnn_vad {
// Pitch estimator.
class PitchEstimator {
public:
PitchEstimator();
PitchEstimator(const PitchEstimator&) = delete;
PitchEstimator& operator=(const PitchEstimator&) = delete;
~PitchEstimator();
// Estimates the pitch period and gain. Returns the pitch estimation data for
// 48 kHz.
PitchInfo Estimate(rtc::ArrayView<const float, kBufSize24kHz> pitch_buf);
private:
PitchInfo last_pitch_48kHz_;
AutoCorrelationCalculator auto_corr_calculator_;
std::vector<float> pitch_buf_decimated_;
rtc::ArrayView<float, kBufSize12kHz> pitch_buf_decimated_view_;
std::vector<float> auto_corr_;
rtc::ArrayView<float, kNumInvertedLags12kHz> auto_corr_view_;
};
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_H_

View File

@ -0,0 +1,403 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/pitch_search_internal.h"
#include <stdlib.h>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
namespace {
// Converts a lag to an inverted lag (only for 24kHz).
size_t GetInvertedLag(size_t lag) {
RTC_DCHECK_LE(lag, kMaxPitch24kHz);
return kMaxPitch24kHz - lag;
}
float ComputeAutoCorrelationCoeff(rtc::ArrayView<const float> pitch_buf,
size_t inv_lag,
size_t max_pitch_period) {
RTC_DCHECK_LT(inv_lag, pitch_buf.size());
RTC_DCHECK_LT(max_pitch_period, pitch_buf.size());
RTC_DCHECK_LE(inv_lag, max_pitch_period);
// TODO(bugs.webrtc.org/9076): Maybe optimize using vectorization.
return std::inner_product(pitch_buf.begin() + max_pitch_period,
pitch_buf.end(), pitch_buf.begin() + inv_lag, 0.f);
}
// Computes a pseudo-interpolation offset for an estimated pitch period |lag| by
// looking at the auto-correlation coefficients in the neighborhood of |lag|.
// (namely, |prev_auto_corr|, |lag_auto_corr| and |next_auto_corr|). The output
// is a lag in {-1, 0, +1}.
// TODO(bugs.webrtc.org/9076): Consider removing pseudo-i since it
// is relevant only if the spectral analysis works at a sample rate that is
// twice as that of the pitch buffer (not so important instead for the estimated
// pitch period feature fed into the RNN).
int GetPitchPseudoInterpolationOffset(size_t lag,
float prev_auto_corr,
float lag_auto_corr,
float next_auto_corr) {
const float& a = prev_auto_corr;
const float& b = lag_auto_corr;
const float& c = next_auto_corr;
int offset = 0;
if ((c - a) > 0.7f * (b - a)) {
offset = 1; // |c| is the largest auto-correlation coefficient.
} else if ((a - c) > 0.7f * (b - c)) {
offset = -1; // |a| is the largest auto-correlation coefficient.
}
return offset;
}
// Refines a pitch period |lag| encoded as lag with pseudo-interpolation. The
// output sample rate is twice as that of |lag|.
size_t PitchPseudoInterpolationLagPitchBuf(
size_t lag,
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf) {
int offset = 0;
// Cannot apply pseudo-interpolation at the boundaries.
if (lag > 0 && lag < kMaxPitch24kHz) {
offset = GetPitchPseudoInterpolationOffset(
lag,
ComputeAutoCorrelationCoeff(pitch_buf, GetInvertedLag(lag - 1),
kMaxPitch24kHz),
ComputeAutoCorrelationCoeff(pitch_buf, GetInvertedLag(lag),
kMaxPitch24kHz),
ComputeAutoCorrelationCoeff(pitch_buf, GetInvertedLag(lag + 1),
kMaxPitch24kHz));
}
return 2 * lag + offset;
}
// Refines a pitch period |inv_lag| encoded as inverted lag with
// pseudo-interpolation. The output sample rate is twice as that of
// |inv_lag|.
size_t PitchPseudoInterpolationInvLagAutoCorr(
size_t inv_lag,
rtc::ArrayView<const float> auto_corr) {
int offset = 0;
// Cannot apply pseudo-interpolation at the boundaries.
if (inv_lag > 0 && inv_lag < auto_corr.size() - 1) {
offset = GetPitchPseudoInterpolationOffset(inv_lag, auto_corr[inv_lag + 1],
auto_corr[inv_lag],
auto_corr[inv_lag - 1]);
}
// TODO(bugs.webrtc.org/9076): When retraining, check if |offset| below should
// be subtracted since |inv_lag| is an inverted lag but offset is a lag.
return 2 * inv_lag + offset;
}
// Integer multipliers used in CheckLowerPitchPeriodsAndComputePitchGain() when
// looking for sub-harmonics.
// The values have been chosen to serve the following algorithm. Given the
// initial pitch period T, we examine whether one of its harmonics is the true
// fundamental frequency. We consider T/k with k in {2, ..., 15}. For each of
// these harmonics, in addition to the pitch gain of itself, we choose one
// multiple of its pitch period, n*T/k, to validate it (by averaging their pitch
// gains). The multiplier n is chosen so that n*T/k is used only one time over
// all k. When for example k = 4, we should also expect a peak at 3*T/4. When
// k = 8 instead we don't want to look at 2*T/8, since we have already checked
// T/4 before. Instead, we look at T*3/8.
// The array can be generate in Python as follows:
// from fractions import Fraction
// # Smallest positive integer not in X.
// def mex(X):
// for i in range(1, int(max(X)+2)):
// if i not in X:
// return i
// # Visited multiples of the period.
// S = {1}
// for n in range(2, 16):
// sn = mex({n * i for i in S} | {1})
// S = S | {Fraction(1, n), Fraction(sn, n)}
// print(sn, end=', ')
constexpr std::array<int, 14> kSubHarmonicMultipliers = {
{3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2}};
// Initial pitch period candidate thresholds for ComputePitchGainThreshold() for
// a sample rate of 24 kHz. Computed as [5*k*k for k in range(16)].
constexpr std::array<int, 14> kInitialPitchPeriodThresholds = {
{20, 45, 80, 125, 180, 245, 320, 405, 500, 605, 720, 845, 980, 1125}};
} // namespace
void Decimate2x(rtc::ArrayView<const float, kBufSize24kHz> src,
rtc::ArrayView<float, kBufSize12kHz> dst) {
// TODO(bugs.webrtc.org/9076): Consider adding anti-aliasing filter.
static_assert(2 * dst.size() == src.size(), "");
for (size_t i = 0; i < dst.size(); ++i) {
dst[i] = src[2 * i];
}
}
float ComputePitchGainThreshold(int candidate_pitch_period,
int pitch_period_ratio,
int initial_pitch_period,
float initial_pitch_gain,
int prev_pitch_period,
float prev_pitch_gain) {
// Map arguments to more compact aliases.
const int& t1 = candidate_pitch_period;
const int& k = pitch_period_ratio;
const int& t0 = initial_pitch_period;
const float& g0 = initial_pitch_gain;
const int& t_prev = prev_pitch_period;
const float& g_prev = prev_pitch_gain;
// Validate input.
RTC_DCHECK_GE(t1, 0);
RTC_DCHECK_GE(k, 2);
RTC_DCHECK_GE(t0, 0);
RTC_DCHECK_GE(t_prev, 0);
// Compute a term that lowers the threshold when |t1| is close to the last
// estimated period |t_prev| - i.e., pitch tracking.
float lower_threshold_term = 0;
if (abs(t1 - t_prev) <= 1) {
// The candidate pitch period is within 1 sample from the previous one.
// Make the candidate at |t1| very easy to be accepted.
lower_threshold_term = g_prev;
} else if (abs(t1 - t_prev) == 2 &&
t0 > kInitialPitchPeriodThresholds[k - 2]) {
// The candidate pitch period is 2 samples far from the previous one and the
// period |t0| (from which |t1| has been derived) is greater than a
// threshold. Make |t1| easy to be accepted.
lower_threshold_term = 0.5f * g_prev;
}
// Set the threshold based on the gain of the initial estimate |t0|. Also
// reduce the chance of false positives caused by a bias towards high
// frequencies (originating from short-term correlations).
float threshold = std::max(0.3f, 0.7f * g0 - lower_threshold_term);
if (static_cast<size_t>(t1) < 3 * kMinPitch24kHz) {
// High frequency.
threshold = std::max(0.4f, 0.85f * g0 - lower_threshold_term);
} else if (static_cast<size_t>(t1) < 2 * kMinPitch24kHz) {
// Even higher frequency.
threshold = std::max(0.5f, 0.9f * g0 - lower_threshold_term);
}
return threshold;
}
void ComputeSlidingFrameSquareEnergies(
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
rtc::ArrayView<float, kMaxPitch24kHz + 1> yy_values) {
float yy =
ComputeAutoCorrelationCoeff(pitch_buf, kMaxPitch24kHz, kMaxPitch24kHz);
yy_values[0] = yy;
for (size_t i = 1; i < yy_values.size(); ++i) {
RTC_DCHECK_LE(i, kMaxPitch24kHz + kFrameSize20ms24kHz);
RTC_DCHECK_LE(i, kMaxPitch24kHz);
const float old_coeff = pitch_buf[kMaxPitch24kHz + kFrameSize20ms24kHz - i];
const float new_coeff = pitch_buf[kMaxPitch24kHz - i];
yy -= old_coeff * old_coeff;
yy += new_coeff * new_coeff;
yy = std::max(0.f, yy);
yy_values[i] = yy;
}
}
std::array<size_t, 2> FindBestPitchPeriods(
rtc::ArrayView<const float> auto_corr,
rtc::ArrayView<const float> pitch_buf,
size_t max_pitch_period) {
// Stores a pitch candidate period and strength information.
struct PitchCandidate {
// Pitch period encoded as inverted lag.
size_t period_inverted_lag = 0;
// Pitch strength encoded as a ratio.
float strength_numerator = -1.f;
float strength_denominator = 0.f;
// Compare the strength of two pitch candidates.
bool HasStrongerPitchThan(const PitchCandidate& b) const {
// Comparing the numerator/denominator ratios without using divisions.
return strength_numerator * b.strength_denominator >
b.strength_numerator * strength_denominator;
}
};
RTC_DCHECK_GT(max_pitch_period, auto_corr.size());
RTC_DCHECK_LT(max_pitch_period, pitch_buf.size());
const size_t frame_size = pitch_buf.size() - max_pitch_period;
// TODO(bugs.webrtc.org/9076): Maybe optimize using vectorization.
float yy =
std::inner_product(pitch_buf.begin(), pitch_buf.begin() + frame_size + 1,
pitch_buf.begin(), 1.f);
// Search best and second best pitches by looking at the scaled
// auto-correlation.
PitchCandidate candidate;
PitchCandidate best;
PitchCandidate second_best;
second_best.period_inverted_lag = 1;
for (size_t inv_lag = 0; inv_lag < auto_corr.size(); ++inv_lag) {
// A pitch candidate must have positive correlation.
if (auto_corr[inv_lag] > 0) {
candidate.period_inverted_lag = inv_lag;
candidate.strength_numerator = auto_corr[inv_lag] * auto_corr[inv_lag];
candidate.strength_denominator = yy;
if (candidate.HasStrongerPitchThan(second_best)) {
if (candidate.HasStrongerPitchThan(best)) {
second_best = best;
best = candidate;
} else {
second_best = candidate;
}
}
}
// Update |squared_energy_y| for the next inverted lag.
const float old_coeff = pitch_buf[inv_lag];
const float new_coeff = pitch_buf[inv_lag + frame_size];
yy -= old_coeff * old_coeff;
yy += new_coeff * new_coeff;
yy = std::max(0.f, yy);
}
return {{best.period_inverted_lag, second_best.period_inverted_lag}};
}
size_t RefinePitchPeriod48kHz(
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
rtc::ArrayView<const size_t, 2> inv_lags) {
// Compute the auto-correlation terms only for neighbors of the given pitch
// candidates (similar to what is done in ComputePitchAutoCorrelation(), but
// for a few lag values).
std::array<float, kNumInvertedLags24kHz> auto_corr;
auto_corr.fill(0.f); // Zeros become ignored lags in FindBestPitchPeriods().
auto is_neighbor = [](size_t i, size_t j) {
return ((i > j) ? (i - j) : (j - i)) <= 2;
};
for (size_t inv_lag = 0; inv_lag < auto_corr.size(); ++inv_lag) {
if (is_neighbor(inv_lag, inv_lags[0]) || is_neighbor(inv_lag, inv_lags[1]))
auto_corr[inv_lag] =
ComputeAutoCorrelationCoeff(pitch_buf, inv_lag, kMaxPitch24kHz);
}
// Find best pitch at 24 kHz.
const auto pitch_candidates_inv_lags = FindBestPitchPeriods(
{auto_corr.data(), auto_corr.size()},
{pitch_buf.data(), pitch_buf.size()}, kMaxPitch24kHz);
const auto inv_lag = pitch_candidates_inv_lags[0]; // Refine the best.
// Pseudo-interpolation.
return PitchPseudoInterpolationInvLagAutoCorr(inv_lag, auto_corr);
}
PitchInfo CheckLowerPitchPeriodsAndComputePitchGain(
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
int initial_pitch_period_48kHz,
PitchInfo prev_pitch_48kHz) {
RTC_DCHECK_LE(kMinPitch48kHz, initial_pitch_period_48kHz);
RTC_DCHECK_LE(initial_pitch_period_48kHz, kMaxPitch48kHz);
// Stores information for a refined pitch candidate.
struct RefinedPitchCandidate {
RefinedPitchCandidate() {}
RefinedPitchCandidate(int period_24kHz, float gain, float xy, float yy)
: period_24kHz(period_24kHz), gain(gain), xy(xy), yy(yy) {}
int period_24kHz;
// Pitch strength information.
float gain;
// Additional pitch strength information used for the final estimation of
// pitch gain.
float xy; // Cross-correlation.
float yy; // Auto-correlation.
};
// Initialize.
std::array<float, kMaxPitch24kHz + 1> yy_values;
ComputeSlidingFrameSquareEnergies(pitch_buf,
{yy_values.data(), yy_values.size()});
const float xx = yy_values[0];
// Helper lambdas.
const auto pitch_gain = [](float xy, float yy, float xx) {
RTC_DCHECK_LE(0.f, xx * yy);
return xy / std::sqrt(1.f + xx * yy);
};
// Initial pitch candidate gain.
RefinedPitchCandidate best_pitch;
best_pitch.period_24kHz = std::min(initial_pitch_period_48kHz / 2,
static_cast<int>(kMaxPitch24kHz - 1));
best_pitch.xy = ComputeAutoCorrelationCoeff(
pitch_buf, GetInvertedLag(best_pitch.period_24kHz), kMaxPitch24kHz);
best_pitch.yy = yy_values[best_pitch.period_24kHz];
best_pitch.gain = pitch_gain(best_pitch.xy, best_pitch.yy, xx);
// Store the initial pitch period information.
const size_t initial_pitch_period = best_pitch.period_24kHz;
const float initial_pitch_gain = best_pitch.gain;
// Given the initial pitch estimation, check lower periods (i.e., harmonics).
const auto alternative_period = [](int period, int k, int n) -> int {
RTC_DCHECK_GT(k, 0);
return (2 * n * period + k) / (2 * k); // Same as round(n*period/k).
};
for (int k = 2; k < static_cast<int>(kSubHarmonicMultipliers.size() + 2);
++k) {
int candidate_pitch_period = alternative_period(initial_pitch_period, k, 1);
if (static_cast<size_t>(candidate_pitch_period) < kMinPitch24kHz) {
break;
}
// When looking at |candidate_pitch_period|, we also look at one of its
// sub-harmonics. |kSubHarmonicMultipliers| is used to know where to look.
// |k| == 2 is a special case since |candidate_pitch_secondary_period| might
// be greater than the maximum pitch period.
int candidate_pitch_secondary_period = alternative_period(
initial_pitch_period, k, kSubHarmonicMultipliers[k - 2]);
RTC_DCHECK_GT(candidate_pitch_secondary_period, 0);
if (k == 2 &&
candidate_pitch_secondary_period > static_cast<int>(kMaxPitch24kHz)) {
candidate_pitch_secondary_period = initial_pitch_period;
}
RTC_DCHECK_NE(candidate_pitch_period, candidate_pitch_secondary_period)
<< "The lower pitch period and the additional sub-harmonic must not "
"coincide.";
// Compute an auto-correlation score for the primary pitch candidate
// |candidate_pitch_period| by also looking at its possible sub-harmonic
// |candidate_pitch_secondary_period|.
float xy_primary_period = ComputeAutoCorrelationCoeff(
pitch_buf, GetInvertedLag(candidate_pitch_period), kMaxPitch24kHz);
float xy_secondary_period = ComputeAutoCorrelationCoeff(
pitch_buf, GetInvertedLag(candidate_pitch_secondary_period),
kMaxPitch24kHz);
float xy = 0.5f * (xy_primary_period + xy_secondary_period);
float yy = 0.5f * (yy_values[candidate_pitch_period] +
yy_values[candidate_pitch_secondary_period]);
float candidate_pitch_gain = pitch_gain(xy, yy, xx);
// Maybe update best period.
float threshold = ComputePitchGainThreshold(
candidate_pitch_period, k, initial_pitch_period, initial_pitch_gain,
prev_pitch_48kHz.period / 2, prev_pitch_48kHz.gain);
if (candidate_pitch_gain > threshold) {
best_pitch = {candidate_pitch_period, candidate_pitch_gain, xy, yy};
}
}
// Final pitch gain and period.
best_pitch.xy = std::max(0.f, best_pitch.xy);
RTC_DCHECK_LE(0.f, best_pitch.yy);
float final_pitch_gain = (best_pitch.yy <= best_pitch.xy)
? 1.f
: best_pitch.xy / (best_pitch.yy + 1.f);
final_pitch_gain = std::min(best_pitch.gain, final_pitch_gain);
int final_pitch_period_48kHz = std::max(
kMinPitch48kHz,
PitchPseudoInterpolationLagPitchBuf(best_pitch.period_24kHz, pitch_buf));
return {final_pitch_period_48kHz, final_pitch_gain};
}
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,77 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_
#include <stddef.h>
#include <array>
#include "api/array_view.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "modules/audio_processing/agc2/rnn_vad/pitch_info.h"
namespace webrtc {
namespace rnn_vad {
// Performs 2x decimation without any anti-aliasing filter.
void Decimate2x(rtc::ArrayView<const float, kBufSize24kHz> src,
rtc::ArrayView<float, kBufSize12kHz> dst);
// Computes a gain threshold for a candidate pitch period given the initial and
// the previous pitch period and gain estimates and the pitch period ratio used
// to derive the candidate pitch period from the initial period.
float ComputePitchGainThreshold(int candidate_pitch_period,
int pitch_period_ratio,
int initial_pitch_period,
float initial_pitch_gain,
int prev_pitch_period,
float prev_pitch_gain);
// Computes the sum of squared samples for every sliding frame in the pitch
// buffer. |yy_values| indexes are lags.
//
// The pitch buffer is structured as depicted below:
// |.........|...........|
// a b
// The part on the left, named "a" contains the oldest samples, whereas "b" the
// most recent ones. The size of "a" corresponds to the maximum pitch period,
// that of "b" to the frame size (e.g., 16 ms and 20 ms respectively).
void ComputeSlidingFrameSquareEnergies(
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
rtc::ArrayView<float, kMaxPitch24kHz + 1> yy_values);
// Given the auto-correlation coefficients stored according to
// ComputePitchAutoCorrelation() (i.e., using inverted lags), returns the best
// and the second best pitch periods.
std::array<size_t, 2> FindBestPitchPeriods(
rtc::ArrayView<const float> auto_corr,
rtc::ArrayView<const float> pitch_buf,
size_t max_pitch_period);
// Refines the pitch period estimation given the pitch buffer |pitch_buf| and
// the initial pitch period estimation |inv_lags|. Returns an inverted lag at
// 48 kHz.
size_t RefinePitchPeriod48kHz(
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
rtc::ArrayView<const size_t, 2> inv_lags);
// Refines the pitch period estimation and compute the pitch gain. Returns the
// refined pitch estimation data at 48 kHz.
PitchInfo CheckLowerPitchPeriodsAndComputePitchGain(
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
int initial_pitch_period_48kHz,
PitchInfo prev_pitch_48kHz);
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_

View File

@ -0,0 +1,66 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RING_BUFFER_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RING_BUFFER_H_
#include <array>
#include <cstring>
#include <type_traits>
#include "api/array_view.h"
namespace webrtc {
namespace rnn_vad {
// Ring buffer for N arrays of type T each one with size S.
template <typename T, size_t S, size_t N>
class RingBuffer {
static_assert(S > 0, "");
static_assert(N > 0, "");
static_assert(std::is_arithmetic<T>::value,
"Integral or floating point required.");
public:
RingBuffer() : tail_(0) {}
RingBuffer(const RingBuffer&) = delete;
RingBuffer& operator=(const RingBuffer&) = delete;
~RingBuffer() = default;
// Set the ring buffer values to zero.
void Reset() { buffer_.fill(0); }
// Replace the least recently pushed array in the buffer with |new_values|.
void Push(rtc::ArrayView<const T, S> new_values) {
std::memcpy(buffer_.data() + S * tail_, new_values.data(), S * sizeof(T));
tail_ += 1;
if (tail_ == N)
tail_ = 0;
}
// Return an array view onto the array with a given delay. A view on the last
// and least recently push array is returned when |delay| is 0 and N - 1
// respectively.
rtc::ArrayView<const T, S> GetArrayView(size_t delay) const {
const int delay_int = static_cast<int>(delay);
RTC_DCHECK_LE(0, delay_int);
RTC_DCHECK_LT(delay_int, N);
int offset = tail_ - 1 - delay_int;
if (offset < 0)
offset += N;
return {buffer_.data() + S * offset, S};
}
private:
int tail_; // Index of the least recently pushed sub-array.
std::array<T, S * N> buffer_{};
};
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RING_BUFFER_H_

View File

@ -0,0 +1,425 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
#include "rtc_base/system/arch.h"
#if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h>
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
#endif
#include <algorithm>
#include <array>
#include <cmath>
#include <numeric>
#include "rtc_base/checks.h"
#include "rtc_base/logging.h"
#include "third_party/rnnoise/src/rnn_activations.h"
#include "third_party/rnnoise/src/rnn_vad_weights.h"
namespace webrtc {
namespace rnn_vad {
namespace {
using rnnoise::kWeightsScale;
using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseBias;
using rnnoise::kInputDenseWeights;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
"Increase kFullyConnectedLayersMaxUnits.");
using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
"Increase kRecurrentLayersMaxUnits.");
using rnnoise::kOutputDenseBias;
using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
"Increase kFullyConnectedLayersMaxUnits.");
using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;
inline float RectifiedLinearUnit(float x) {
return x < 0.f ? 0.f : x;
}
std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
std::vector<float> scaled_params(params.size());
std::transform(params.begin(), params.end(), scaled_params.begin(),
[](int8_t x) -> float {
return rnnoise::kWeightsScale * static_cast<float>(x);
});
return scaled_params;
}
// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
// function to improve setup time.
// Casts and scales |weights| and re-arranges the layout.
std::vector<float> GetPreprocessedFcWeights(
rtc::ArrayView<const int8_t> weights,
size_t output_size) {
if (output_size == 1) {
return GetScaledParams(weights);
}
// Transpose, scale and cast.
const size_t input_size = rtc::CheckedDivExact(weights.size(), output_size);
std::vector<float> w(weights.size());
for (size_t o = 0; o < output_size; ++o) {
for (size_t i = 0; i < input_size; ++i) {
w[o * input_size + i] = rnnoise::kWeightsScale *
static_cast<float>(weights[i * output_size + o]);
}
}
return w;
}
constexpr size_t kNumGruGates = 3; // Update, reset, output.
// TODO(bugs.chromium.org/10480): Hard-coded optimized layout and remove this
// function to improve setup time.
// Casts and scales |tensor_src| for a GRU layer and re-arranges the layout.
// It works both for weights, recurrent weights and bias.
std::vector<float> GetPreprocessedGruTensor(
rtc::ArrayView<const int8_t> tensor_src,
size_t output_size) {
// Transpose, cast and scale.
// |n| is the size of the first dimension of the 3-dim tensor |weights|.
const size_t n =
rtc::CheckedDivExact(tensor_src.size(), output_size * kNumGruGates);
const size_t stride_src = kNumGruGates * output_size;
const size_t stride_dst = n * output_size;
std::vector<float> tensor_dst(tensor_src.size());
for (size_t g = 0; g < kNumGruGates; ++g) {
for (size_t o = 0; o < output_size; ++o) {
for (size_t i = 0; i < n; ++i) {
tensor_dst[g * stride_dst + o * n + i] =
rnnoise::kWeightsScale *
static_cast<float>(
tensor_src[i * stride_src + g * output_size + o]);
}
}
}
return tensor_dst;
}
void ComputeGruUpdateResetGates(size_t input_size,
size_t output_size,
rtc::ArrayView<const float> weights,
rtc::ArrayView<const float> recurrent_weights,
rtc::ArrayView<const float> bias,
rtc::ArrayView<const float> input,
rtc::ArrayView<const float> state,
rtc::ArrayView<float> gate) {
for (size_t o = 0; o < output_size; ++o) {
gate[o] = bias[o];
for (size_t i = 0; i < input_size; ++i) {
gate[o] += input[i] * weights[o * input_size + i];
}
for (size_t s = 0; s < output_size; ++s) {
gate[o] += state[s] * recurrent_weights[o * output_size + s];
}
gate[o] = SigmoidApproximated(gate[o]);
}
}
void ComputeGruOutputGate(size_t input_size,
size_t output_size,
rtc::ArrayView<const float> weights,
rtc::ArrayView<const float> recurrent_weights,
rtc::ArrayView<const float> bias,
rtc::ArrayView<const float> input,
rtc::ArrayView<const float> state,
rtc::ArrayView<const float> reset,
rtc::ArrayView<float> gate) {
for (size_t o = 0; o < output_size; ++o) {
gate[o] = bias[o];
for (size_t i = 0; i < input_size; ++i) {
gate[o] += input[i] * weights[o * input_size + i];
}
for (size_t s = 0; s < output_size; ++s) {
gate[o] += state[s] * recurrent_weights[o * output_size + s] * reset[s];
}
gate[o] = RectifiedLinearUnit(gate[o]);
}
}
// Gated recurrent unit (GRU) layer un-optimized implementation.
void ComputeGruLayerOutput(size_t input_size,
size_t output_size,
rtc::ArrayView<const float> input,
rtc::ArrayView<const float> weights,
rtc::ArrayView<const float> recurrent_weights,
rtc::ArrayView<const float> bias,
rtc::ArrayView<float> state) {
RTC_DCHECK_EQ(input_size, input.size());
// Stride and offset used to read parameter arrays.
const size_t stride_in = input_size * output_size;
const size_t stride_out = output_size * output_size;
// Update gate.
std::array<float, kRecurrentLayersMaxUnits> update;
ComputeGruUpdateResetGates(
input_size, output_size, weights.subview(0, stride_in),
recurrent_weights.subview(0, stride_out), bias.subview(0, output_size),
input, state, update);
// Reset gate.
std::array<float, kRecurrentLayersMaxUnits> reset;
ComputeGruUpdateResetGates(
input_size, output_size, weights.subview(stride_in, stride_in),
recurrent_weights.subview(stride_out, stride_out),
bias.subview(output_size, output_size), input, state, reset);
// Output gate.
std::array<float, kRecurrentLayersMaxUnits> output;
ComputeGruOutputGate(
input_size, output_size, weights.subview(2 * stride_in, stride_in),
recurrent_weights.subview(2 * stride_out, stride_out),
bias.subview(2 * output_size, output_size), input, state, reset, output);
// Update output through the update gates and update the state.
for (size_t o = 0; o < output_size; ++o) {
output[o] = update[o] * state[o] + (1.f - update[o]) * output[o];
state[o] = output[o];
}
}
// Fully connected layer un-optimized implementation.
void ComputeFullyConnectedLayerOutput(
size_t input_size,
size_t output_size,
rtc::ArrayView<const float> input,
rtc::ArrayView<const float> bias,
rtc::ArrayView<const float> weights,
rtc::FunctionView<float(float)> activation_function,
rtc::ArrayView<float> output) {
RTC_DCHECK_EQ(input.size(), input_size);
RTC_DCHECK_EQ(bias.size(), output_size);
RTC_DCHECK_EQ(weights.size(), input_size * output_size);
for (size_t o = 0; o < output_size; ++o) {
output[o] = bias[o];
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
// |weights_| change the performance across different platforms.
for (size_t i = 0; i < input_size; ++i) {
output[o] += input[i] * weights[o * input_size + i];
}
output[o] = activation_function(output[o]);
}
}
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Fully connected layer SSE2 implementation.
void ComputeFullyConnectedLayerOutputSse2(
size_t input_size,
size_t output_size,
rtc::ArrayView<const float> input,
rtc::ArrayView<const float> bias,
rtc::ArrayView<const float> weights,
rtc::FunctionView<float(float)> activation_function,
rtc::ArrayView<float> output) {
RTC_DCHECK_EQ(input.size(), input_size);
RTC_DCHECK_EQ(bias.size(), output_size);
RTC_DCHECK_EQ(weights.size(), input_size * output_size);
const size_t input_size_by_4 = input_size >> 2;
const size_t offset = input_size & ~3;
__m128 sum_wx_128;
const float* v = reinterpret_cast<const float*>(&sum_wx_128);
for (size_t o = 0; o < output_size; ++o) {
// Perform 128 bit vector operations.
sum_wx_128 = _mm_set1_ps(0);
const float* x_p = input.data();
const float* w_p = weights.data() + o * input_size;
for (size_t i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
sum_wx_128 = _mm_add_ps(sum_wx_128,
_mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
}
// Perform non-vector operations for any remaining items, sum up bias term
// and results from the vectorized code, and apply the activation function.
output[o] = activation_function(
std::inner_product(input.begin() + offset, input.end(),
weights.begin() + o * input_size + offset,
bias[o] + v[0] + v[1] + v[2] + v[3]));
}
}
#endif
} // namespace
FullyConnectedLayer::FullyConnectedLayer(
const size_t input_size,
const size_t output_size,
const rtc::ArrayView<const int8_t> bias,
const rtc::ArrayView<const int8_t> weights,
rtc::FunctionView<float(float)> activation_function,
Optimization optimization)
: input_size_(input_size),
output_size_(output_size),
bias_(GetScaledParams(bias)),
weights_(GetPreprocessedFcWeights(weights, output_size)),
activation_function_(activation_function),
optimization_(optimization) {
RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
<< "Static over-allocation of fully-connected layers output vectors is "
"not sufficient.";
RTC_DCHECK_EQ(output_size_, bias_.size())
<< "Mismatching output size and bias terms array size.";
RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
<< "Mismatching input-output size and weight coefficients array size.";
}
FullyConnectedLayer::~FullyConnectedLayer() = default;
rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
return rtc::ArrayView<const float>(output_.data(), output_size_);
}
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
case Optimization::kSse2:
ComputeFullyConnectedLayerOutputSse2(input_size_, output_size_, input,
bias_, weights_,
activation_function_, output_);
break;
#endif
#if defined(WEBRTC_HAS_NEON)
case Optimization::kNeon:
// TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
weights_, activation_function_, output_);
break;
#endif
default:
ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
weights_, activation_function_, output_);
}
}
GatedRecurrentLayer::GatedRecurrentLayer(
const size_t input_size,
const size_t output_size,
const rtc::ArrayView<const int8_t> bias,
const rtc::ArrayView<const int8_t> weights,
const rtc::ArrayView<const int8_t> recurrent_weights,
Optimization optimization)
: input_size_(input_size),
output_size_(output_size),
bias_(GetPreprocessedGruTensor(bias, output_size)),
weights_(GetPreprocessedGruTensor(weights, output_size)),
recurrent_weights_(
GetPreprocessedGruTensor(recurrent_weights, output_size)),
optimization_(optimization) {
RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
<< "Static over-allocation of recurrent layers state vectors is not "
"sufficient.";
RTC_DCHECK_EQ(kNumGruGates * output_size_, bias_.size())
<< "Mismatching output size and bias terms array size.";
RTC_DCHECK_EQ(kNumGruGates * input_size_ * output_size_, weights_.size())
<< "Mismatching input-output size and weight coefficients array size.";
RTC_DCHECK_EQ(kNumGruGates * output_size_ * output_size_,
recurrent_weights_.size())
<< "Mismatching input-output size and recurrent weight coefficients array"
" size.";
Reset();
}
GatedRecurrentLayer::~GatedRecurrentLayer() = default;
rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
return rtc::ArrayView<const float>(state_.data(), output_size_);
}
void GatedRecurrentLayer::Reset() {
state_.fill(0.f);
}
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
case Optimization::kSse2:
// TODO(bugs.chromium.org/10480): Handle Optimization::kSse2.
ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
recurrent_weights_, bias_, state_);
break;
#endif
#if defined(WEBRTC_HAS_NEON)
case Optimization::kNeon:
// TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
recurrent_weights_, bias_, state_);
break;
#endif
default:
ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
recurrent_weights_, bias_, state_);
}
}
RnnBasedVad::RnnBasedVad()
: input_layer_(kInputLayerInputSize,
kInputLayerOutputSize,
kInputDenseBias,
kInputDenseWeights,
TansigApproximated,
DetectOptimization()),
hidden_layer_(kInputLayerOutputSize,
kHiddenLayerOutputSize,
kHiddenGruBias,
kHiddenGruWeights,
kHiddenGruRecurrentWeights,
DetectOptimization()),
output_layer_(kHiddenLayerOutputSize,
kOutputLayerOutputSize,
kOutputDenseBias,
kOutputDenseWeights,
SigmoidApproximated,
DetectOptimization()) {
// Input-output chaining size checks.
RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
<< "The input and the hidden layers sizes do not match.";
RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
<< "The hidden and the output layers sizes do not match.";
}
RnnBasedVad::~RnnBasedVad() = default;
void RnnBasedVad::Reset() {
hidden_layer_.Reset();
}
float RnnBasedVad::ComputeVadProbability(
rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
bool is_silence) {
if (is_silence) {
Reset();
return 0.f;
}
input_layer_.ComputeOutput(feature_vector);
hidden_layer_.ComputeOutput(input_layer_.GetOutput());
output_layer_.ComputeOutput(hidden_layer_.GetOutput());
const auto vad_output = output_layer_.GetOutput();
return vad_output[0];
}
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,126 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_H_
#include <stddef.h>
#include <sys/types.h>
#include <array>
#include <vector>
#include "api/array_view.h"
#include "api/function_view.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "rtc_base/system/arch.h"
namespace webrtc {
namespace rnn_vad {
// Maximum number of units for a fully-connected layer. This value is used to
// over-allocate space for fully-connected layers output vectors (implemented as
// std::array). The value should equal the number of units of the largest
// fully-connected layer.
constexpr size_t kFullyConnectedLayersMaxUnits = 24;
// Maximum number of units for a recurrent layer. This value is used to
// over-allocate space for recurrent layers state vectors (implemented as
// std::array). The value should equal the number of units of the largest
// recurrent layer.
constexpr size_t kRecurrentLayersMaxUnits = 24;
// Fully-connected layer.
class FullyConnectedLayer {
public:
FullyConnectedLayer(size_t input_size,
size_t output_size,
rtc::ArrayView<const int8_t> bias,
rtc::ArrayView<const int8_t> weights,
rtc::FunctionView<float(float)> activation_function,
Optimization optimization);
FullyConnectedLayer(const FullyConnectedLayer&) = delete;
FullyConnectedLayer& operator=(const FullyConnectedLayer&) = delete;
~FullyConnectedLayer();
size_t input_size() const { return input_size_; }
size_t output_size() const { return output_size_; }
Optimization optimization() const { return optimization_; }
rtc::ArrayView<const float> GetOutput() const;
// Computes the fully-connected layer output.
void ComputeOutput(rtc::ArrayView<const float> input);
private:
const size_t input_size_;
const size_t output_size_;
const std::vector<float> bias_;
const std::vector<float> weights_;
rtc::FunctionView<float(float)> activation_function_;
// The output vector of a recurrent layer has length equal to |output_size_|.
// However, for efficiency, over-allocation is used.
std::array<float, kFullyConnectedLayersMaxUnits> output_;
const Optimization optimization_;
};
// Recurrent layer with gated recurrent units (GRUs) with sigmoid and ReLU as
// activation functions for the update/reset and output gates respectively.
class GatedRecurrentLayer {
public:
GatedRecurrentLayer(size_t input_size,
size_t output_size,
rtc::ArrayView<const int8_t> bias,
rtc::ArrayView<const int8_t> weights,
rtc::ArrayView<const int8_t> recurrent_weights,
Optimization optimization);
GatedRecurrentLayer(const GatedRecurrentLayer&) = delete;
GatedRecurrentLayer& operator=(const GatedRecurrentLayer&) = delete;
~GatedRecurrentLayer();
size_t input_size() const { return input_size_; }
size_t output_size() const { return output_size_; }
Optimization optimization() const { return optimization_; }
rtc::ArrayView<const float> GetOutput() const;
void Reset();
// Computes the recurrent layer output and updates the status.
void ComputeOutput(rtc::ArrayView<const float> input);
private:
const size_t input_size_;
const size_t output_size_;
const std::vector<float> bias_;
const std::vector<float> weights_;
const std::vector<float> recurrent_weights_;
// The state vector of a recurrent layer has length equal to |output_size_|.
// However, to avoid dynamic allocation, over-allocation is used.
std::array<float, kRecurrentLayersMaxUnits> state_;
const Optimization optimization_;
};
// Recurrent network based VAD.
class RnnBasedVad {
public:
RnnBasedVad();
RnnBasedVad(const RnnBasedVad&) = delete;
RnnBasedVad& operator=(const RnnBasedVad&) = delete;
~RnnBasedVad();
void Reset();
// Compute and returns the probability of voice (range: [0.0, 1.0]).
float ComputeVadProbability(
rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
bool is_silence);
private:
FullyConnectedLayer input_layer_;
GatedRecurrentLayer hidden_layer_;
FullyConnectedLayer output_layer_;
};
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_H_

View File

@ -0,0 +1,120 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <array>
#include <string>
#include <vector>
#include "absl/flags/flag.h"
#include "absl/flags/parse.h"
#include "common_audio/resampler/push_sinc_resampler.h"
#include "common_audio/wav_file.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "modules/audio_processing/agc2/rnn_vad/features_extraction.h"
#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
#include "rtc_base/logging.h"
ABSL_FLAG(std::string, i, "", "Path to the input wav file");
ABSL_FLAG(std::string, f, "", "Path to the output features file");
ABSL_FLAG(std::string, o, "", "Path to the output VAD probabilities file");
namespace webrtc {
namespace rnn_vad {
namespace test {
int main(int argc, char* argv[]) {
absl::ParseCommandLine(argc, argv);
rtc::LogMessage::LogToDebug(rtc::LS_INFO);
// Open wav input file and check properties.
const std::string input_wav_file = absl::GetFlag(FLAGS_i);
WavReader wav_reader(input_wav_file);
if (wav_reader.num_channels() != 1) {
RTC_LOG(LS_ERROR) << "Only mono wav files are supported";
return 1;
}
if (wav_reader.sample_rate() % 100 != 0) {
RTC_LOG(LS_ERROR) << "The sample rate rate must allow 10 ms frames.";
return 1;
}
RTC_LOG(LS_INFO) << "Input sample rate: " << wav_reader.sample_rate();
// Init output files.
const std::string output_vad_probs_file = absl::GetFlag(FLAGS_o);
FILE* vad_probs_file = fopen(output_vad_probs_file.c_str(), "wb");
FILE* features_file = nullptr;
const std::string output_feature_file = absl::GetFlag(FLAGS_f);
if (!output_feature_file.empty()) {
features_file = fopen(output_feature_file.c_str(), "wb");
}
// Initialize.
const size_t frame_size_10ms =
rtc::CheckedDivExact(wav_reader.sample_rate(), 100);
std::vector<float> samples_10ms;
samples_10ms.resize(frame_size_10ms);
std::array<float, kFrameSize10ms24kHz> samples_10ms_24kHz;
PushSincResampler resampler(frame_size_10ms, kFrameSize10ms24kHz);
FeaturesExtractor features_extractor;
std::array<float, kFeatureVectorSize> feature_vector;
RnnBasedVad rnn_vad;
// Compute VAD probabilities.
while (true) {
// Read frame at the input sample rate.
const auto read_samples =
wav_reader.ReadSamples(frame_size_10ms, samples_10ms.data());
if (read_samples < frame_size_10ms) {
break; // EOF.
}
// Resample input.
resampler.Resample(samples_10ms.data(), samples_10ms.size(),
samples_10ms_24kHz.data(), samples_10ms_24kHz.size());
// Extract features and feed the RNN.
bool is_silence = features_extractor.CheckSilenceComputeFeatures(
samples_10ms_24kHz, feature_vector);
float vad_probability =
rnn_vad.ComputeVadProbability(feature_vector, is_silence);
// Write voice probability.
RTC_DCHECK_GE(vad_probability, 0.f);
RTC_DCHECK_GE(1.f, vad_probability);
fwrite(&vad_probability, sizeof(float), 1, vad_probs_file);
// Write features.
if (features_file) {
const float float_is_silence = is_silence ? 1.f : 0.f;
fwrite(&float_is_silence, sizeof(float), 1, features_file);
if (is_silence) {
// Do not write uninitialized values.
feature_vector.fill(0.f);
}
fwrite(feature_vector.data(), sizeof(float), kFeatureVectorSize,
features_file);
}
}
// Close output file(s).
fclose(vad_probs_file);
RTC_LOG(LS_INFO) << "VAD probabilities written to " << output_vad_probs_file;
if (features_file) {
fclose(features_file);
RTC_LOG(LS_INFO) << "features written to " << output_feature_file;
}
return 0;
}
} // namespace test
} // namespace rnn_vad
} // namespace webrtc
int main(int argc, char* argv[]) {
return webrtc::rnn_vad::test::main(argc, argv);
}

View File

@ -0,0 +1,79 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SEQUENCE_BUFFER_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SEQUENCE_BUFFER_H_
#include <algorithm>
#include <cstring>
#include <type_traits>
#include <vector>
#include "api/array_view.h"
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
// Linear buffer implementation to (i) push fixed size chunks of sequential data
// and (ii) view contiguous parts of the buffer. The buffer and the pushed
// chunks have size S and N respectively. For instance, when S = 2N the first
// half of the sequence buffer is replaced with its second half, and the new N
// values are written at the end of the buffer.
// The class also provides a view on the most recent M values, where 0 < M <= S
// and by default M = N.
template <typename T, size_t S, size_t N, size_t M = N>
class SequenceBuffer {
static_assert(N <= S,
"The new chunk size cannot be larger than the sequence buffer "
"size.");
static_assert(std::is_arithmetic<T>::value,
"Integral or floating point required.");
public:
SequenceBuffer() : buffer_(S) {
RTC_DCHECK_EQ(S, buffer_.size());
Reset();
}
SequenceBuffer(const SequenceBuffer&) = delete;
SequenceBuffer& operator=(const SequenceBuffer&) = delete;
~SequenceBuffer() = default;
size_t size() const { return S; }
size_t chunks_size() const { return N; }
// Sets the sequence buffer values to zero.
void Reset() { std::fill(buffer_.begin(), buffer_.end(), 0); }
// Returns a view on the whole buffer.
rtc::ArrayView<const T, S> GetBufferView() const {
return {buffer_.data(), S};
}
// Returns a view on the M most recent values of the buffer.
rtc::ArrayView<const T, M> GetMostRecentValuesView() const {
static_assert(M <= S,
"The number of most recent values cannot be larger than the "
"sequence buffer size.");
return {buffer_.data() + S - M, M};
}
// Shifts left the buffer by N items and add new N items at the end.
void Push(rtc::ArrayView<const T, N> new_values) {
// Make space for the new values.
if (S > N)
std::memmove(buffer_.data(), buffer_.data() + N, (S - N) * sizeof(T));
// Copy the new values at the end of the buffer.
std::memcpy(buffer_.data() + S - N, new_values.data(), N * sizeof(T));
}
private:
std::vector<T> buffer_;
};
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SEQUENCE_BUFFER_H_

View File

@ -0,0 +1,213 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/spectral_features.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
namespace {
constexpr float kSilenceThreshold = 0.04f;
// Computes the new cepstral difference stats and pushes them into the passed
// symmetric matrix buffer.
void UpdateCepstralDifferenceStats(
rtc::ArrayView<const float, kNumBands> new_cepstral_coeffs,
const RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>& ring_buf,
SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize>* sym_matrix_buf) {
RTC_DCHECK(sym_matrix_buf);
// Compute the new cepstral distance stats.
std::array<float, kCepstralCoeffsHistorySize - 1> distances;
for (size_t i = 0; i < kCepstralCoeffsHistorySize - 1; ++i) {
const size_t delay = i + 1;
auto old_cepstral_coeffs = ring_buf.GetArrayView(delay);
distances[i] = 0.f;
for (size_t k = 0; k < kNumBands; ++k) {
const float c = new_cepstral_coeffs[k] - old_cepstral_coeffs[k];
distances[i] += c * c;
}
}
// Push the new spectral distance stats into the symmetric matrix buffer.
sym_matrix_buf->Push(distances);
}
// Computes the first half of the Vorbis window.
std::array<float, kFrameSize20ms24kHz / 2> ComputeScaledHalfVorbisWindow(
float scaling = 1.f) {
constexpr size_t kHalfSize = kFrameSize20ms24kHz / 2;
std::array<float, kHalfSize> half_window{};
for (size_t i = 0; i < kHalfSize; ++i) {
half_window[i] =
scaling *
std::sin(0.5 * kPi * std::sin(0.5 * kPi * (i + 0.5) / kHalfSize) *
std::sin(0.5 * kPi * (i + 0.5) / kHalfSize));
}
return half_window;
}
// Computes the forward FFT on a 20 ms frame to which a given window function is
// applied. The Fourier coefficient corresponding to the Nyquist frequency is
// set to zero (it is never used and this allows to simplify the code).
void ComputeWindowedForwardFft(
rtc::ArrayView<const float, kFrameSize20ms24kHz> frame,
const std::array<float, kFrameSize20ms24kHz / 2>& half_window,
Pffft::FloatBuffer* fft_input_buffer,
Pffft::FloatBuffer* fft_output_buffer,
Pffft* fft) {
RTC_DCHECK_EQ(frame.size(), 2 * half_window.size());
// Apply windowing.
auto in = fft_input_buffer->GetView();
for (size_t i = 0, j = kFrameSize20ms24kHz - 1; i < half_window.size();
++i, --j) {
in[i] = frame[i] * half_window[i];
in[j] = frame[j] * half_window[i];
}
fft->ForwardTransform(*fft_input_buffer, fft_output_buffer, /*ordered=*/true);
// Set the Nyquist frequency coefficient to zero.
auto out = fft_output_buffer->GetView();
out[1] = 0.f;
}
} // namespace
SpectralFeaturesExtractor::SpectralFeaturesExtractor()
: half_window_(ComputeScaledHalfVorbisWindow(
1.f / static_cast<float>(kFrameSize20ms24kHz))),
fft_(kFrameSize20ms24kHz, Pffft::FftType::kReal),
fft_buffer_(fft_.CreateBuffer()),
reference_frame_fft_(fft_.CreateBuffer()),
lagged_frame_fft_(fft_.CreateBuffer()),
dct_table_(ComputeDctTable()) {}
SpectralFeaturesExtractor::~SpectralFeaturesExtractor() = default;
void SpectralFeaturesExtractor::Reset() {
cepstral_coeffs_ring_buf_.Reset();
cepstral_diffs_buf_.Reset();
}
bool SpectralFeaturesExtractor::CheckSilenceComputeFeatures(
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum,
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative,
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr,
float* variability) {
// Compute the Opus band energies for the reference frame.
ComputeWindowedForwardFft(reference_frame, half_window_, fft_buffer_.get(),
reference_frame_fft_.get(), &fft_);
spectral_correlator_.ComputeAutoCorrelation(
reference_frame_fft_->GetConstView(), reference_frame_bands_energy_);
// Check if the reference frame has silence.
const float tot_energy =
std::accumulate(reference_frame_bands_energy_.begin(),
reference_frame_bands_energy_.end(), 0.f);
if (tot_energy < kSilenceThreshold) {
return true;
}
// Compute the Opus band energies for the lagged frame.
ComputeWindowedForwardFft(lagged_frame, half_window_, fft_buffer_.get(),
lagged_frame_fft_.get(), &fft_);
spectral_correlator_.ComputeAutoCorrelation(lagged_frame_fft_->GetConstView(),
lagged_frame_bands_energy_);
// Log of the band energies for the reference frame.
std::array<float, kNumBands> log_bands_energy;
ComputeSmoothedLogMagnitudeSpectrum(reference_frame_bands_energy_,
log_bands_energy);
// Reference frame cepstrum.
std::array<float, kNumBands> cepstrum;
ComputeDct(log_bands_energy, dct_table_, cepstrum);
// Ad-hoc correction terms for the first two cepstral coefficients.
cepstrum[0] -= 12.f;
cepstrum[1] -= 4.f;
// Update the ring buffer and the cepstral difference stats.
cepstral_coeffs_ring_buf_.Push(cepstrum);
UpdateCepstralDifferenceStats(cepstrum, cepstral_coeffs_ring_buf_,
&cepstral_diffs_buf_);
// Write the higher bands cepstral coefficients.
RTC_DCHECK_EQ(cepstrum.size() - kNumLowerBands, higher_bands_cepstrum.size());
std::copy(cepstrum.begin() + kNumLowerBands, cepstrum.end(),
higher_bands_cepstrum.begin());
// Compute and write remaining features.
ComputeAvgAndDerivatives(average, first_derivative, second_derivative);
ComputeNormalizedCepstralCorrelation(bands_cross_corr);
RTC_DCHECK(variability);
*variability = ComputeVariability();
return false;
}
void SpectralFeaturesExtractor::ComputeAvgAndDerivatives(
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative) const {
auto curr = cepstral_coeffs_ring_buf_.GetArrayView(0);
auto prev1 = cepstral_coeffs_ring_buf_.GetArrayView(1);
auto prev2 = cepstral_coeffs_ring_buf_.GetArrayView(2);
RTC_DCHECK_EQ(average.size(), first_derivative.size());
RTC_DCHECK_EQ(first_derivative.size(), second_derivative.size());
RTC_DCHECK_LE(average.size(), curr.size());
for (size_t i = 0; i < average.size(); ++i) {
// Average, kernel: [1, 1, 1].
average[i] = curr[i] + prev1[i] + prev2[i];
// First derivative, kernel: [1, 0, - 1].
first_derivative[i] = curr[i] - prev2[i];
// Second derivative, Laplacian kernel: [1, -2, 1].
second_derivative[i] = curr[i] - 2 * prev1[i] + prev2[i];
}
}
void SpectralFeaturesExtractor::ComputeNormalizedCepstralCorrelation(
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr) {
spectral_correlator_.ComputeCrossCorrelation(
reference_frame_fft_->GetConstView(), lagged_frame_fft_->GetConstView(),
bands_cross_corr_);
// Normalize.
for (size_t i = 0; i < bands_cross_corr_.size(); ++i) {
bands_cross_corr_[i] =
bands_cross_corr_[i] /
std::sqrt(0.001f + reference_frame_bands_energy_[i] *
lagged_frame_bands_energy_[i]);
}
// Cepstrum.
ComputeDct(bands_cross_corr_, dct_table_, bands_cross_corr);
// Ad-hoc correction terms for the first two cepstral coefficients.
bands_cross_corr[0] -= 1.3f;
bands_cross_corr[1] -= 0.9f;
}
float SpectralFeaturesExtractor::ComputeVariability() const {
// Compute cepstral variability score.
float variability = 0.f;
for (size_t delay1 = 0; delay1 < kCepstralCoeffsHistorySize; ++delay1) {
float min_dist = std::numeric_limits<float>::max();
for (size_t delay2 = 0; delay2 < kCepstralCoeffsHistorySize; ++delay2) {
if (delay1 == delay2) // The distance would be 0.
continue;
min_dist =
std::min(min_dist, cepstral_diffs_buf_.GetValue(delay1, delay2));
}
variability += min_dist;
}
// Normalize (based on training set stats).
// TODO(bugs.webrtc.org/10480): Isolate normalization from feature extraction.
return variability / kCepstralCoeffsHistorySize - 2.1f;
}
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,79 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SPECTRAL_FEATURES_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SPECTRAL_FEATURES_H_
#include <array>
#include <cstddef>
#include <memory>
#include <vector>
#include "api/array_view.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "modules/audio_processing/agc2/rnn_vad/ring_buffer.h"
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
#include "modules/audio_processing/agc2/rnn_vad/symmetric_matrix_buffer.h"
#include "modules/audio_processing/utility/pffft_wrapper.h"
namespace webrtc {
namespace rnn_vad {
// Class to compute spectral features.
class SpectralFeaturesExtractor {
public:
SpectralFeaturesExtractor();
SpectralFeaturesExtractor(const SpectralFeaturesExtractor&) = delete;
SpectralFeaturesExtractor& operator=(const SpectralFeaturesExtractor&) =
delete;
~SpectralFeaturesExtractor();
// Resets the internal state of the feature extractor.
void Reset();
// Analyzes a pair of reference and lagged frames from the pitch buffer,
// detects silence and computes features. If silence is detected, the output
// is neither computed nor written.
bool CheckSilenceComputeFeatures(
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum,
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative,
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr,
float* variability);
private:
void ComputeAvgAndDerivatives(
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative) const;
void ComputeNormalizedCepstralCorrelation(
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr);
float ComputeVariability() const;
const std::array<float, kFrameSize20ms24kHz / 2> half_window_;
Pffft fft_;
std::unique_ptr<Pffft::FloatBuffer> fft_buffer_;
std::unique_ptr<Pffft::FloatBuffer> reference_frame_fft_;
std::unique_ptr<Pffft::FloatBuffer> lagged_frame_fft_;
SpectralCorrelator spectral_correlator_;
std::array<float, kOpusBands24kHz> reference_frame_bands_energy_;
std::array<float, kOpusBands24kHz> lagged_frame_bands_energy_;
std::array<float, kOpusBands24kHz> bands_cross_corr_;
const std::array<float, kNumBands * kNumBands> dct_table_;
RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>
cepstral_coeffs_ring_buf_;
SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize> cepstral_diffs_buf_;
};
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SPECTRAL_FEATURES_H_

View File

@ -0,0 +1,187 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
namespace {
// Weights for each FFT coefficient for each Opus band (Nyquist frequency
// excluded). The size of each band is specified in
// |kOpusScaleNumBins24kHz20ms|.
constexpr std::array<float, kFrameSize20ms24kHz / 2> kOpusBandWeights24kHz20ms =
{{
0.f, 0.25f, 0.5f, 0.75f, // Band 0
0.f, 0.25f, 0.5f, 0.75f, // Band 1
0.f, 0.25f, 0.5f, 0.75f, // Band 2
0.f, 0.25f, 0.5f, 0.75f, // Band 3
0.f, 0.25f, 0.5f, 0.75f, // Band 4
0.f, 0.25f, 0.5f, 0.75f, // Band 5
0.f, 0.25f, 0.5f, 0.75f, // Band 6
0.f, 0.25f, 0.5f, 0.75f, // Band 7
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
0.625f, 0.75f, 0.875f, // Band 8
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
0.625f, 0.75f, 0.875f, // Band 9
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
0.625f, 0.75f, 0.875f, // Band 10
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
0.625f, 0.75f, 0.875f, // Band 11
0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
0.9375f, // Band 12
0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
0.9375f, // Band 13
0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
0.9375f, // Band 14
0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f,
0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f,
0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f,
0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f,
0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 15
0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f,
0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f,
0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f,
0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f,
0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 16
0.f, 0.03125f, 0.0625f, 0.09375f, 0.125f,
0.15625f, 0.1875f, 0.21875f, 0.25f, 0.28125f,
0.3125f, 0.34375f, 0.375f, 0.40625f, 0.4375f,
0.46875f, 0.5f, 0.53125f, 0.5625f, 0.59375f,
0.625f, 0.65625f, 0.6875f, 0.71875f, 0.75f,
0.78125f, 0.8125f, 0.84375f, 0.875f, 0.90625f,
0.9375f, 0.96875f, // Band 17
0.f, 0.0208333f, 0.0416667f, 0.0625f, 0.0833333f,
0.104167f, 0.125f, 0.145833f, 0.166667f, 0.1875f,
0.208333f, 0.229167f, 0.25f, 0.270833f, 0.291667f,
0.3125f, 0.333333f, 0.354167f, 0.375f, 0.395833f,
0.416667f, 0.4375f, 0.458333f, 0.479167f, 0.5f,
0.520833f, 0.541667f, 0.5625f, 0.583333f, 0.604167f,
0.625f, 0.645833f, 0.666667f, 0.6875f, 0.708333f,
0.729167f, 0.75f, 0.770833f, 0.791667f, 0.8125f,
0.833333f, 0.854167f, 0.875f, 0.895833f, 0.916667f,
0.9375f, 0.958333f, 0.979167f // Band 18
}};
} // namespace
SpectralCorrelator::SpectralCorrelator()
: weights_(kOpusBandWeights24kHz20ms.begin(),
kOpusBandWeights24kHz20ms.end()) {}
SpectralCorrelator::~SpectralCorrelator() = default;
void SpectralCorrelator::ComputeAutoCorrelation(
rtc::ArrayView<const float> x,
rtc::ArrayView<float, kOpusBands24kHz> auto_corr) const {
ComputeCrossCorrelation(x, x, auto_corr);
}
void SpectralCorrelator::ComputeCrossCorrelation(
rtc::ArrayView<const float> x,
rtc::ArrayView<const float> y,
rtc::ArrayView<float, kOpusBands24kHz> cross_corr) const {
RTC_DCHECK_EQ(x.size(), kFrameSize20ms24kHz);
RTC_DCHECK_EQ(x.size(), y.size());
RTC_DCHECK_EQ(x[1], 0.f) << "The Nyquist coefficient must be zeroed.";
RTC_DCHECK_EQ(y[1], 0.f) << "The Nyquist coefficient must be zeroed.";
constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms();
size_t k = 0; // Next Fourier coefficient index.
cross_corr[0] = 0.f;
for (size_t i = 0; i < kOpusBands24kHz - 1; ++i) {
cross_corr[i + 1] = 0.f;
for (int j = 0; j < kOpusScaleNumBins24kHz20ms[i]; ++j) { // Band size.
const float v = x[2 * k] * y[2 * k] + x[2 * k + 1] * y[2 * k + 1];
const float tmp = weights_[k] * v;
cross_corr[i] += v - tmp;
cross_corr[i + 1] += tmp;
k++;
}
}
cross_corr[0] *= 2.f; // The first band only gets half contribution.
RTC_DCHECK_EQ(k, kFrameSize20ms24kHz / 2); // Nyquist coefficient never used.
}
void ComputeSmoothedLogMagnitudeSpectrum(
rtc::ArrayView<const float> bands_energy,
rtc::ArrayView<float, kNumBands> log_bands_energy) {
RTC_DCHECK_LE(bands_energy.size(), kNumBands);
constexpr float kOneByHundred = 1e-2f;
constexpr float kLogOneByHundred = -2.f;
// Init.
float log_max = kLogOneByHundred;
float follow = kLogOneByHundred;
const auto smooth = [&log_max, &follow](float x) {
x = std::max(log_max - 7.f, std::max(follow - 1.5f, x));
log_max = std::max(log_max, x);
follow = std::max(follow - 1.5f, x);
return x;
};
// Smoothing over the bands for which the band energy is defined.
for (size_t i = 0; i < bands_energy.size(); ++i) {
log_bands_energy[i] = smooth(std::log10(kOneByHundred + bands_energy[i]));
}
// Smoothing over the remaining bands (zero energy).
for (size_t i = bands_energy.size(); i < kNumBands; ++i) {
log_bands_energy[i] = smooth(kLogOneByHundred);
}
}
std::array<float, kNumBands * kNumBands> ComputeDctTable() {
std::array<float, kNumBands * kNumBands> dct_table;
const double k = std::sqrt(0.5);
for (size_t i = 0; i < kNumBands; ++i) {
for (size_t j = 0; j < kNumBands; ++j)
dct_table[i * kNumBands + j] = std::cos((i + 0.5) * j * kPi / kNumBands);
dct_table[i * kNumBands] *= k;
}
return dct_table;
}
void ComputeDct(rtc::ArrayView<const float> in,
rtc::ArrayView<const float, kNumBands * kNumBands> dct_table,
rtc::ArrayView<float> out) {
// DCT scaling factor - i.e., sqrt(2 / kNumBands).
constexpr float kDctScalingFactor = 0.301511345f;
constexpr float kDctScalingFactorError =
kDctScalingFactor * kDctScalingFactor -
2.f / static_cast<float>(kNumBands);
static_assert(
(kDctScalingFactorError >= 0.f && kDctScalingFactorError < 1e-1f) ||
(kDctScalingFactorError < 0.f && kDctScalingFactorError > -1e-1f),
"kNumBands changed and kDctScalingFactor has not been updated.");
RTC_DCHECK_NE(in.data(), out.data()) << "In-place DCT is not supported.";
RTC_DCHECK_LE(in.size(), kNumBands);
RTC_DCHECK_LE(1, out.size());
RTC_DCHECK_LE(out.size(), in.size());
for (size_t i = 0; i < out.size(); ++i) {
out[i] = 0.f;
for (size_t j = 0; j < in.size(); ++j) {
out[i] += in[j] * dct_table[j * kNumBands + i];
}
// TODO(bugs.webrtc.org/10480): Scaling factor in the DCT table.
out[i] *= kDctScalingFactor;
}
}
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,100 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SPECTRAL_FEATURES_INTERNAL_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SPECTRAL_FEATURES_INTERNAL_H_
#include <stddef.h>
#include <array>
#include <vector>
#include "api/array_view.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
namespace webrtc {
namespace rnn_vad {
// At a sample rate of 24 kHz, the last 3 Opus bands are beyond the Nyquist
// frequency. However, band #19 gets the contributions from band #18 because
// of the symmetric triangular filter with peak response at 12 kHz.
constexpr size_t kOpusBands24kHz = 20;
static_assert(kOpusBands24kHz < kNumBands,
"The number of bands at 24 kHz must be less than those defined "
"in the Opus scale at 48 kHz.");
// Number of FFT frequency bins covered by each band in the Opus scale at a
// sample rate of 24 kHz for 20 ms frames.
// Declared here for unit testing.
constexpr std::array<int, kOpusBands24kHz - 1> GetOpusScaleNumBins24kHz20ms() {
return {4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 24, 24, 32, 48};
}
// TODO(bugs.webrtc.org/10480): Move to a separate file.
// Class to compute band-wise spectral features in the Opus perceptual scale
// for 20 ms frames sampled at 24 kHz. The analysis methods apply triangular
// filters with peak response at the each band boundary.
class SpectralCorrelator {
public:
// Ctor.
SpectralCorrelator();
SpectralCorrelator(const SpectralCorrelator&) = delete;
SpectralCorrelator& operator=(const SpectralCorrelator&) = delete;
~SpectralCorrelator();
// Computes the band-wise spectral auto-correlations.
// |x| must:
// - have size equal to |kFrameSize20ms24kHz|;
// - be encoded as vectors of interleaved real-complex FFT coefficients
// where x[1] = y[1] = 0 (the Nyquist frequency coefficient is omitted).
void ComputeAutoCorrelation(
rtc::ArrayView<const float> x,
rtc::ArrayView<float, kOpusBands24kHz> auto_corr) const;
// Computes the band-wise spectral cross-correlations.
// |x| and |y| must:
// - have size equal to |kFrameSize20ms24kHz|;
// - be encoded as vectors of interleaved real-complex FFT coefficients where
// x[1] = y[1] = 0 (the Nyquist frequency coefficient is omitted).
void ComputeCrossCorrelation(
rtc::ArrayView<const float> x,
rtc::ArrayView<const float> y,
rtc::ArrayView<float, kOpusBands24kHz> cross_corr) const;
private:
const std::vector<float> weights_; // Weights for each Fourier coefficient.
};
// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
// spectral_features.cc. Given a vector of Opus-bands energy coefficients,
// computes the log magnitude spectrum applying smoothing both over time and
// over frequency. Declared here for unit testing.
void ComputeSmoothedLogMagnitudeSpectrum(
rtc::ArrayView<const float> bands_energy,
rtc::ArrayView<float, kNumBands> log_bands_energy);
// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
// spectral_features.cc. Creates a DCT table for arrays having size equal to
// |kNumBands|. Declared here for unit testing.
std::array<float, kNumBands * kNumBands> ComputeDctTable();
// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
// spectral_features.cc. Computes DCT for |in| given a pre-computed DCT table.
// In-place computation is not allowed and |out| can be smaller than |in| in
// order to only compute the first DCT coefficients. Declared here for unit
// testing.
void ComputeDct(rtc::ArrayView<const float> in,
rtc::ArrayView<const float, kNumBands * kNumBands> dct_table,
rtc::ArrayView<float> out);
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SPECTRAL_FEATURES_INTERNAL_H_

View File

@ -0,0 +1,94 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SYMMETRIC_MATRIX_BUFFER_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SYMMETRIC_MATRIX_BUFFER_H_
#include <algorithm>
#include <array>
#include <cstring>
#include <utility>
#include "api/array_view.h"
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
// Data structure to buffer the results of pair-wise comparisons between items
// stored in a ring buffer. Every time that the oldest item is replaced in the
// ring buffer, the new one is compared to the remaining items in the ring
// buffer. The results of such comparisons need to be buffered and automatically
// removed when one of the two corresponding items that have been compared is
// removed from the ring buffer. It is assumed that the comparison is symmetric
// and that comparing an item with itself is not needed.
template <typename T, size_t S>
class SymmetricMatrixBuffer {
static_assert(S > 2, "");
public:
SymmetricMatrixBuffer() = default;
SymmetricMatrixBuffer(const SymmetricMatrixBuffer&) = delete;
SymmetricMatrixBuffer& operator=(const SymmetricMatrixBuffer&) = delete;
~SymmetricMatrixBuffer() = default;
// Sets the buffer values to zero.
void Reset() {
static_assert(std::is_arithmetic<T>::value,
"Integral or floating point required.");
buf_.fill(0);
}
// Pushes the results from the comparison between the most recent item and
// those that are still in the ring buffer. The first element in |values| must
// correspond to the comparison between the most recent item and the second
// most recent one in the ring buffer, whereas the last element in |values|
// must correspond to the comparison between the most recent item and the
// oldest one in the ring buffer.
void Push(rtc::ArrayView<T, S - 1> values) {
// Move the lower-right sub-matrix of size (S-2) x (S-2) one row up and one
// column left.
std::memmove(buf_.data(), buf_.data() + S, (buf_.size() - S) * sizeof(T));
// Copy new values in the last column in the right order.
for (size_t i = 0; i < values.size(); ++i) {
const size_t index = (S - 1 - i) * (S - 1) - 1;
RTC_DCHECK_LE(static_cast<size_t>(0), index);
RTC_DCHECK_LT(index, buf_.size());
buf_[index] = values[i];
}
}
// Reads the value that corresponds to comparison of two items in the ring
// buffer having delay |delay1| and |delay2|. The two arguments must not be
// equal and both must be in {0, ..., S - 1}.
T GetValue(size_t delay1, size_t delay2) const {
int row = S - 1 - static_cast<int>(delay1);
int col = S - 1 - static_cast<int>(delay2);
RTC_DCHECK_NE(row, col) << "The diagonal cannot be accessed.";
if (row > col)
std::swap(row, col); // Swap to access the upper-right triangular part.
RTC_DCHECK_LE(0, row);
RTC_DCHECK_LT(row, S - 1) << "Not enforcing row < col and row != col.";
RTC_DCHECK_LE(1, col) << "Not enforcing row < col and row != col.";
RTC_DCHECK_LT(col, S);
const int index = row * (S - 1) + (col - 1);
RTC_DCHECK_LE(0, index);
RTC_DCHECK_LT(index, buf_.size());
return buf_[index];
}
private:
// Encode an upper-right triangular matrix (excluding its diagonal) using a
// square matrix. This allows to move the data in Push() with one single
// operation.
std::array<T, (S - 1) * (S - 1)> buf_{};
};
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SYMMETRIC_MATRIX_BUFFER_H_

View File

@ -0,0 +1,129 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/test_utils.h"
#include <memory>
#include "rtc_base/checks.h"
#include "rtc_base/system/arch.h"
#include "system_wrappers/include/cpu_features_wrapper.h"
#include "test/gtest.h"
#include "test/testsupport/file_utils.h"
namespace webrtc {
namespace rnn_vad {
namespace test {
namespace {
using ReaderPairType =
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>;
} // namespace
using webrtc::test::ResourcePath;
void ExpectEqualFloatArray(rtc::ArrayView<const float> expected,
rtc::ArrayView<const float> computed) {
ASSERT_EQ(expected.size(), computed.size());
for (size_t i = 0; i < expected.size(); ++i) {
SCOPED_TRACE(i);
EXPECT_FLOAT_EQ(expected[i], computed[i]);
}
}
void ExpectNearAbsolute(rtc::ArrayView<const float> expected,
rtc::ArrayView<const float> computed,
float tolerance) {
ASSERT_EQ(expected.size(), computed.size());
for (size_t i = 0; i < expected.size(); ++i) {
SCOPED_TRACE(i);
EXPECT_NEAR(expected[i], computed[i], tolerance);
}
}
std::pair<std::unique_ptr<BinaryFileReader<int16_t, float>>, const size_t>
CreatePcmSamplesReader(const size_t frame_length) {
auto ptr = std::make_unique<BinaryFileReader<int16_t, float>>(
test::ResourcePath("audio_processing/agc2/rnn_vad/samples", "pcm"),
frame_length);
// The last incomplete frame is ignored.
return {std::move(ptr), ptr->data_length() / frame_length};
}
ReaderPairType CreatePitchBuffer24kHzReader() {
constexpr size_t cols = 864;
auto ptr = std::make_unique<BinaryFileReader<float>>(
ResourcePath("audio_processing/agc2/rnn_vad/pitch_buf_24k", "dat"), cols);
return {std::move(ptr), rtc::CheckedDivExact(ptr->data_length(), cols)};
}
ReaderPairType CreateLpResidualAndPitchPeriodGainReader() {
constexpr size_t num_lp_residual_coeffs = 864;
auto ptr = std::make_unique<BinaryFileReader<float>>(
ResourcePath("audio_processing/agc2/rnn_vad/pitch_lp_res", "dat"),
num_lp_residual_coeffs);
return {std::move(ptr),
rtc::CheckedDivExact(ptr->data_length(), 2 + num_lp_residual_coeffs)};
}
ReaderPairType CreateVadProbsReader() {
auto ptr = std::make_unique<BinaryFileReader<float>>(
test::ResourcePath("audio_processing/agc2/rnn_vad/vad_prob", "dat"));
return {std::move(ptr), ptr->data_length()};
}
PitchTestData::PitchTestData() {
BinaryFileReader<float> test_data_reader(
ResourcePath("audio_processing/agc2/rnn_vad/pitch_search_int", "dat"),
static_cast<size_t>(1396));
test_data_reader.ReadChunk(test_data_);
}
PitchTestData::~PitchTestData() = default;
rtc::ArrayView<const float, kBufSize24kHz> PitchTestData::GetPitchBufView()
const {
return {test_data_.data(), kBufSize24kHz};
}
rtc::ArrayView<const float, kNumPitchBufSquareEnergies>
PitchTestData::GetPitchBufSquareEnergiesView() const {
return {test_data_.data() + kBufSize24kHz, kNumPitchBufSquareEnergies};
}
rtc::ArrayView<const float, kNumPitchBufAutoCorrCoeffs>
PitchTestData::GetPitchBufAutoCorrCoeffsView() const {
return {test_data_.data() + kBufSize24kHz + kNumPitchBufSquareEnergies,
kNumPitchBufAutoCorrCoeffs};
}
bool IsOptimizationAvailable(Optimization optimization) {
switch (optimization) {
case Optimization::kSse2:
#if defined(WEBRTC_ARCH_X86_FAMILY)
return GetCPUInfo(kSSE2) != 0;
#else
return false;
#endif
case Optimization::kNeon:
#if defined(WEBRTC_HAS_NEON)
return true;
#else
return false;
#endif
case Optimization::kNone:
return true;
}
}
} // namespace test
} // namespace rnn_vad
} // namespace webrtc

View File

@ -0,0 +1,161 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_TEST_UTILS_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_TEST_UTILS_H_
#include <algorithm>
#include <array>
#include <fstream>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include "api/array_view.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "rtc_base/checks.h"
namespace webrtc {
namespace rnn_vad {
namespace test {
constexpr float kFloatMin = std::numeric_limits<float>::min();
// Fails for every pair from two equally sized rtc::ArrayView<float> views such
// that the values in the pair do not match.
void ExpectEqualFloatArray(rtc::ArrayView<const float> expected,
rtc::ArrayView<const float> computed);
// Fails for every pair from two equally sized rtc::ArrayView<float> views such
// that their absolute error is above a given threshold.
void ExpectNearAbsolute(rtc::ArrayView<const float> expected,
rtc::ArrayView<const float> computed,
float tolerance);
// Reader for binary files consisting of an arbitrary long sequence of elements
// having type T. It is possible to read and cast to another type D at once.
template <typename T, typename D = T>
class BinaryFileReader {
public:
explicit BinaryFileReader(const std::string& file_path, size_t chunk_size = 0)
: is_(file_path, std::ios::binary | std::ios::ate),
data_length_(is_.tellg() / sizeof(T)),
chunk_size_(chunk_size) {
RTC_CHECK(is_);
SeekBeginning();
buf_.resize(chunk_size_);
}
BinaryFileReader(const BinaryFileReader&) = delete;
BinaryFileReader& operator=(const BinaryFileReader&) = delete;
~BinaryFileReader() = default;
size_t data_length() const { return data_length_; }
bool ReadValue(D* dst) {
if (std::is_same<T, D>::value) {
is_.read(reinterpret_cast<char*>(dst), sizeof(T));
} else {
T v;
is_.read(reinterpret_cast<char*>(&v), sizeof(T));
*dst = static_cast<D>(v);
}
return is_.gcount() == sizeof(T);
}
// If |chunk_size| was specified in the ctor, it will check that the size of
// |dst| equals |chunk_size|.
bool ReadChunk(rtc::ArrayView<D> dst) {
RTC_DCHECK((chunk_size_ == 0) || (chunk_size_ == dst.size()));
const std::streamsize bytes_to_read = dst.size() * sizeof(T);
if (std::is_same<T, D>::value) {
is_.read(reinterpret_cast<char*>(dst.data()), bytes_to_read);
} else {
is_.read(reinterpret_cast<char*>(buf_.data()), bytes_to_read);
std::transform(buf_.begin(), buf_.end(), dst.begin(),
[](const T& v) -> D { return static_cast<D>(v); });
}
return is_.gcount() == bytes_to_read;
}
void SeekForward(size_t items) { is_.seekg(items * sizeof(T), is_.cur); }
void SeekBeginning() { is_.seekg(0, is_.beg); }
private:
std::ifstream is_;
const size_t data_length_;
const size_t chunk_size_;
std::vector<T> buf_;
};
// Writer for binary files.
template <typename T>
class BinaryFileWriter {
public:
explicit BinaryFileWriter(const std::string& file_path)
: os_(file_path, std::ios::binary) {}
BinaryFileWriter(const BinaryFileWriter&) = delete;
BinaryFileWriter& operator=(const BinaryFileWriter&) = delete;
~BinaryFileWriter() = default;
static_assert(std::is_arithmetic<T>::value, "");
void WriteChunk(rtc::ArrayView<const T> value) {
const std::streamsize bytes_to_write = value.size() * sizeof(T);
os_.write(reinterpret_cast<const char*>(value.data()), bytes_to_write);
}
private:
std::ofstream os_;
};
// Factories for resource file readers.
// The functions below return a pair where the first item is a reader unique
// pointer and the second the number of chunks that can be read from the file.
// Creates a reader for the PCM samples that casts from S16 to float and reads
// chunks with length |frame_length|.
std::pair<std::unique_ptr<BinaryFileReader<int16_t, float>>, const size_t>
CreatePcmSamplesReader(const size_t frame_length);
// Creates a reader for the pitch buffer content at 24 kHz.
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
CreatePitchBuffer24kHzReader();
// Creates a reader for the the LP residual coefficients and the pitch period
// and gain values.
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
CreateLpResidualAndPitchPeriodGainReader();
// Creates a reader for the VAD probabilities.
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
CreateVadProbsReader();
constexpr size_t kNumPitchBufAutoCorrCoeffs = 147;
constexpr size_t kNumPitchBufSquareEnergies = 385;
constexpr size_t kPitchTestDataSize =
kBufSize24kHz + kNumPitchBufSquareEnergies + kNumPitchBufAutoCorrCoeffs;
// Class to retrieve a test pitch buffer content and the expected output for the
// analysis steps.
class PitchTestData {
public:
PitchTestData();
~PitchTestData();
rtc::ArrayView<const float, kBufSize24kHz> GetPitchBufView() const;
rtc::ArrayView<const float, kNumPitchBufSquareEnergies>
GetPitchBufSquareEnergiesView() const;
rtc::ArrayView<const float, kNumPitchBufAutoCorrCoeffs>
GetPitchBufAutoCorrCoeffsView() const;
private:
std::array<float, kPitchTestDataSize> test_data_;
};
// Returns true if the given optimization is available.
bool IsOptimizationAvailable(Optimization optimization);
} // namespace test
} // namespace rnn_vad
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_TEST_UTILS_H_