Update to the current WebRTC library
This is from the upstream library commit id 3326535126e435f1ba647885ce43a8f0f3d317eb, corresponding to Chromium 88.0.4290.1.
This commit is contained in:
425
webrtc/modules/audio_processing/agc2/rnn_vad/rnn.cc
Normal file
425
webrtc/modules/audio_processing/agc2/rnn_vad/rnn.cc
Normal file
@ -0,0 +1,425 @@
|
||||
/*
|
||||
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
|
||||
|
||||
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
#if defined(WEBRTC_HAS_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
|
||||
#include "rtc_base/checks.h"
|
||||
#include "rtc_base/logging.h"
|
||||
#include "third_party/rnnoise/src/rnn_activations.h"
|
||||
#include "third_party/rnnoise/src/rnn_vad_weights.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
namespace {
|
||||
|
||||
using rnnoise::kWeightsScale;
|
||||
|
||||
using rnnoise::kInputLayerInputSize;
|
||||
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
|
||||
using rnnoise::kInputDenseBias;
|
||||
using rnnoise::kInputDenseWeights;
|
||||
using rnnoise::kInputLayerOutputSize;
|
||||
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
|
||||
"Increase kFullyConnectedLayersMaxUnits.");
|
||||
|
||||
using rnnoise::kHiddenGruBias;
|
||||
using rnnoise::kHiddenGruRecurrentWeights;
|
||||
using rnnoise::kHiddenGruWeights;
|
||||
using rnnoise::kHiddenLayerOutputSize;
|
||||
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
|
||||
"Increase kRecurrentLayersMaxUnits.");
|
||||
|
||||
using rnnoise::kOutputDenseBias;
|
||||
using rnnoise::kOutputDenseWeights;
|
||||
using rnnoise::kOutputLayerOutputSize;
|
||||
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
|
||||
"Increase kFullyConnectedLayersMaxUnits.");
|
||||
|
||||
using rnnoise::SigmoidApproximated;
|
||||
using rnnoise::TansigApproximated;
|
||||
|
||||
// ReLU activation: returns 0 for strictly negative |x| and |x| itself
// otherwise (so NaN and -0.f are passed through unchanged).
inline float RectifiedLinearUnit(float x) {
  if (x < 0.f) {
    return 0.f;
  }
  return x;
}
|
||||
|
||||
// Returns a float copy of |params| in which every 8-bit value has been
// multiplied by rnnoise::kWeightsScale. The element order is preserved.
std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
  std::vector<float> scaled;
  scaled.reserve(params.size());
  for (const int8_t value : params) {
    scaled.push_back(rnnoise::kWeightsScale * static_cast<float>(value));
  }
  return scaled;
}
|
||||
|
||||
// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
|
||||
// function to improve setup time.
|
||||
// Casts and scales |weights| and re-arranges the layout.
|
||||
std::vector<float> GetPreprocessedFcWeights(
|
||||
rtc::ArrayView<const int8_t> weights,
|
||||
size_t output_size) {
|
||||
if (output_size == 1) {
|
||||
return GetScaledParams(weights);
|
||||
}
|
||||
// Transpose, scale and cast.
|
||||
const size_t input_size = rtc::CheckedDivExact(weights.size(), output_size);
|
||||
std::vector<float> w(weights.size());
|
||||
for (size_t o = 0; o < output_size; ++o) {
|
||||
for (size_t i = 0; i < input_size; ++i) {
|
||||
w[o * input_size + i] = rnnoise::kWeightsScale *
|
||||
static_cast<float>(weights[i * output_size + o]);
|
||||
}
|
||||
}
|
||||
return w;
|
||||
}
|
||||
|
||||
// Number of gates of a GRU layer. The GRU tensors handled in this file hold
// one slice of parameters per gate.
constexpr size_t kNumGruGates = 3; // Update, reset, output.
|
||||
|
||||
// TODO(bugs.chromium.org/10480): Hard-coded optimized layout and remove this
|
||||
// function to improve setup time.
|
||||
// Casts and scales |tensor_src| for a GRU layer and re-arranges the layout.
|
||||
// It works both for weights, recurrent weights and bias.
|
||||
std::vector<float> GetPreprocessedGruTensor(
|
||||
rtc::ArrayView<const int8_t> tensor_src,
|
||||
size_t output_size) {
|
||||
// Transpose, cast and scale.
|
||||
// |n| is the size of the first dimension of the 3-dim tensor |weights|.
|
||||
const size_t n =
|
||||
rtc::CheckedDivExact(tensor_src.size(), output_size * kNumGruGates);
|
||||
const size_t stride_src = kNumGruGates * output_size;
|
||||
const size_t stride_dst = n * output_size;
|
||||
std::vector<float> tensor_dst(tensor_src.size());
|
||||
for (size_t g = 0; g < kNumGruGates; ++g) {
|
||||
for (size_t o = 0; o < output_size; ++o) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
tensor_dst[g * stride_dst + o * n + i] =
|
||||
rnnoise::kWeightsScale *
|
||||
static_cast<float>(
|
||||
tensor_src[i * stride_src + g * output_size + o]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return tensor_dst;
|
||||
}
|
||||
|
||||
void ComputeGruUpdateResetGates(size_t input_size,
|
||||
size_t output_size,
|
||||
rtc::ArrayView<const float> weights,
|
||||
rtc::ArrayView<const float> recurrent_weights,
|
||||
rtc::ArrayView<const float> bias,
|
||||
rtc::ArrayView<const float> input,
|
||||
rtc::ArrayView<const float> state,
|
||||
rtc::ArrayView<float> gate) {
|
||||
for (size_t o = 0; o < output_size; ++o) {
|
||||
gate[o] = bias[o];
|
||||
for (size_t i = 0; i < input_size; ++i) {
|
||||
gate[o] += input[i] * weights[o * input_size + i];
|
||||
}
|
||||
for (size_t s = 0; s < output_size; ++s) {
|
||||
gate[o] += state[s] * recurrent_weights[o * output_size + s];
|
||||
}
|
||||
gate[o] = SigmoidApproximated(gate[o]);
|
||||
}
|
||||
}
|
||||
|
||||
void ComputeGruOutputGate(size_t input_size,
|
||||
size_t output_size,
|
||||
rtc::ArrayView<const float> weights,
|
||||
rtc::ArrayView<const float> recurrent_weights,
|
||||
rtc::ArrayView<const float> bias,
|
||||
rtc::ArrayView<const float> input,
|
||||
rtc::ArrayView<const float> state,
|
||||
rtc::ArrayView<const float> reset,
|
||||
rtc::ArrayView<float> gate) {
|
||||
for (size_t o = 0; o < output_size; ++o) {
|
||||
gate[o] = bias[o];
|
||||
for (size_t i = 0; i < input_size; ++i) {
|
||||
gate[o] += input[i] * weights[o * input_size + i];
|
||||
}
|
||||
for (size_t s = 0; s < output_size; ++s) {
|
||||
gate[o] += state[s] * recurrent_weights[o * output_size + s] * reset[s];
|
||||
}
|
||||
gate[o] = RectifiedLinearUnit(gate[o]);
|
||||
}
|
||||
}
|
||||
|
||||
// Gated recurrent unit (GRU) layer un-optimized implementation.
|
||||
void ComputeGruLayerOutput(size_t input_size,
|
||||
size_t output_size,
|
||||
rtc::ArrayView<const float> input,
|
||||
rtc::ArrayView<const float> weights,
|
||||
rtc::ArrayView<const float> recurrent_weights,
|
||||
rtc::ArrayView<const float> bias,
|
||||
rtc::ArrayView<float> state) {
|
||||
RTC_DCHECK_EQ(input_size, input.size());
|
||||
// Stride and offset used to read parameter arrays.
|
||||
const size_t stride_in = input_size * output_size;
|
||||
const size_t stride_out = output_size * output_size;
|
||||
|
||||
// Update gate.
|
||||
std::array<float, kRecurrentLayersMaxUnits> update;
|
||||
ComputeGruUpdateResetGates(
|
||||
input_size, output_size, weights.subview(0, stride_in),
|
||||
recurrent_weights.subview(0, stride_out), bias.subview(0, output_size),
|
||||
input, state, update);
|
||||
|
||||
// Reset gate.
|
||||
std::array<float, kRecurrentLayersMaxUnits> reset;
|
||||
ComputeGruUpdateResetGates(
|
||||
input_size, output_size, weights.subview(stride_in, stride_in),
|
||||
recurrent_weights.subview(stride_out, stride_out),
|
||||
bias.subview(output_size, output_size), input, state, reset);
|
||||
|
||||
// Output gate.
|
||||
std::array<float, kRecurrentLayersMaxUnits> output;
|
||||
ComputeGruOutputGate(
|
||||
input_size, output_size, weights.subview(2 * stride_in, stride_in),
|
||||
recurrent_weights.subview(2 * stride_out, stride_out),
|
||||
bias.subview(2 * output_size, output_size), input, state, reset, output);
|
||||
|
||||
// Update output through the update gates and update the state.
|
||||
for (size_t o = 0; o < output_size; ++o) {
|
||||
output[o] = update[o] * state[o] + (1.f - update[o]) * output[o];
|
||||
state[o] = output[o];
|
||||
}
|
||||
}
|
||||
|
||||
// Fully connected layer un-optimized implementation.
|
||||
void ComputeFullyConnectedLayerOutput(
|
||||
size_t input_size,
|
||||
size_t output_size,
|
||||
rtc::ArrayView<const float> input,
|
||||
rtc::ArrayView<const float> bias,
|
||||
rtc::ArrayView<const float> weights,
|
||||
rtc::FunctionView<float(float)> activation_function,
|
||||
rtc::ArrayView<float> output) {
|
||||
RTC_DCHECK_EQ(input.size(), input_size);
|
||||
RTC_DCHECK_EQ(bias.size(), output_size);
|
||||
RTC_DCHECK_EQ(weights.size(), input_size * output_size);
|
||||
for (size_t o = 0; o < output_size; ++o) {
|
||||
output[o] = bias[o];
|
||||
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
|
||||
// |weights_| change the performance across different platforms.
|
||||
for (size_t i = 0; i < input_size; ++i) {
|
||||
output[o] += input[i] * weights[o * input_size + i];
|
||||
}
|
||||
output[o] = activation_function(output[o]);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Fully connected layer SSE2 implementation.
// Same contract as ComputeFullyConnectedLayerOutput(): for each output unit
// |o|, |output[o]| is |activation_function| applied to |bias[o]| plus the dot
// product of |input| and row |o| of |weights|. The dot product is computed
// four floats at a time with unaligned loads; the elements left over when
// |input_size| is not a multiple of 4 are handled with scalar code.
void ComputeFullyConnectedLayerOutputSse2(
    size_t input_size,
    size_t output_size,
    rtc::ArrayView<const float> input,
    rtc::ArrayView<const float> bias,
    rtc::ArrayView<const float> weights,
    rtc::FunctionView<float(float)> activation_function,
    rtc::ArrayView<float> output) {
  RTC_DCHECK_EQ(input.size(), input_size);
  RTC_DCHECK_EQ(bias.size(), output_size);
  RTC_DCHECK_EQ(weights.size(), input_size * output_size);
  // Number of complete groups of 4 floats and index of the first element that
  // is not covered by them.
  const size_t num_blocks = input_size / 4;
  const size_t tail_begin = num_blocks * 4;
  for (size_t o = 0; o < output_size; ++o) {
    const size_t weights_row = o * input_size;
    const float* const w_row = weights.data() + weights_row;
    // Perform 128 bit vector operations.
    __m128 acc = _mm_setzero_ps();
    for (size_t b = 0; b < num_blocks; ++b) {
      const __m128 x = _mm_loadu_ps(input.data() + 4 * b);
      const __m128 w = _mm_loadu_ps(w_row + 4 * b);
      acc = _mm_add_ps(acc, _mm_mul_ps(x, w));
    }
    // Read back the four partial sums and add them to the bias term, lane by
    // lane from lane 0 to lane 3.
    float lanes[4];
    _mm_storeu_ps(lanes, acc);
    const float partial = bias[o] + lanes[0] + lanes[1] + lanes[2] + lanes[3];
    // Perform non-vector operations for any remaining items and apply the
    // activation function.
    output[o] = activation_function(
        std::inner_product(input.begin() + tail_begin, input.end(),
                           weights.begin() + weights_row + tail_begin,
                           partial));
  }
}
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
// Builds a fully connected layer with |input_size| inputs and |output_size|
// outputs. The 8-bit |bias| and |weights| are converted once, here, to scaled
// floats (the weights also get their layout re-arranged by
// GetPreprocessedFcWeights()). |activation_function| and |optimization| are
// stored for later use in ComputeOutput().
FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    rtc::FunctionView<float(float)> activation_function,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetScaledParams(bias)),
      weights_(GetPreprocessedFcWeights(weights, output_size)),
      activation_function_(activation_function),
      optimization_(optimization) {
  // The output is stored in a statically sized buffer.
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  // One bias term per output unit.
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  // One weight per (input, output) pair.
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}
|
||||
|
||||
// Compiler-generated destructor; this file adds no explicit clean-up.
FullyConnectedLayer::~FullyConnectedLayer() = default;
|
||||
|
||||
// Returns a read-only view on the first |output_size_| elements of the output
// buffer, i.e., the values written by the last ComputeOutput() call.
rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
  const float* const first = output_.data();
  return rtc::ArrayView<const float>(first, output_size_);
}
|
||||
|
||||
// Computes the layer output for |input| and stores it in |output_|. The SSE2
// implementation is used when requested and available at build time; every
// other case runs the un-optimized implementation.
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      ComputeFullyConnectedLayerOutputSse2(input_size_, output_size_, input,
                                           bias_, weights_,
                                           activation_function_, output_);
      break;
#endif
#if defined(WEBRTC_HAS_NEON)
    // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
    // Until then NEON shares the un-optimized code path below.
    case Optimization::kNeon:
#endif
    default:
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
      break;
  }
}
|
||||
|
||||
// Builds a GRU layer with |input_size| inputs and |output_size| outputs. The
// 8-bit |bias|, |weights| and |recurrent_weights| are converted once, here, to
// scaled floats with a re-arranged layout (see GetPreprocessedGruTensor()).
// The state is zeroed before the constructor returns.
GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetPreprocessedGruTensor(bias, output_size)),
      weights_(GetPreprocessedGruTensor(weights, output_size)),
      recurrent_weights_(
          GetPreprocessedGruTensor(recurrent_weights, output_size)),
      optimization_(optimization) {
  // The state is stored in a statically sized buffer.
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
         "sufficient.";
  // Each parameter array holds the coefficients of all the gates.
  RTC_DCHECK_EQ(kNumGruGates * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(kNumGruGates * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  RTC_DCHECK_EQ(kNumGruGates * output_size_ * output_size_,
                recurrent_weights_.size())
      << "Mismatching input-output size and recurrent weight coefficients array"
         " size.";
  Reset();
}
|
||||
|
||||
// Compiler-generated destructor; this file adds no explicit clean-up.
GatedRecurrentLayer::~GatedRecurrentLayer() = default;
|
||||
|
||||
// Returns a read-only view on the first |output_size_| elements of the state,
// which is also the output of the layer.
rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
  const float* const first = state_.data();
  return rtc::ArrayView<const float>(first, output_size_);
}
|
||||
|
||||
// Sets every element of the state to zero.
void GatedRecurrentLayer::Reset() {
  std::fill(state_.begin(), state_.end(), 0.f);
}
|
||||
|
||||
// Updates the state (i.e., the layer output) given |input|. No optimized
// implementation exists yet, hence every optimization value currently runs
// the un-optimized code.
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2.
    case Optimization::kSse2:
#endif
#if defined(WEBRTC_HAS_NEON)
    // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
    case Optimization::kNeon:
#endif
    default:
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      break;
  }
}
|
||||
|
||||
// Builds the three layers of the network from the rnnoise parameters: a fully
// connected input layer (TansigApproximated activation), a GRU hidden layer
// and a fully connected output layer (SigmoidApproximated activation). Each
// layer receives the result of its own DetectOptimization() call.
RnnBasedVad::RnnBasedVad()
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated,
                   DetectOptimization()),
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    DetectOptimization()),
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated,
                    DetectOptimization()) {
  // Each layer must produce exactly as many values as the next one consumes.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layers sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layers sizes do not match.";
}
|
||||
|
||||
// Compiler-generated destructor; this file adds no explicit clean-up.
RnnBasedVad::~RnnBasedVad() = default;
|
||||
|
||||
// Resets the hidden (recurrent) layer; it is the only layer that is reset
// here.
void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}
|
||||
|
||||
// Runs the network on |feature_vector| and returns the first value produced
// by the output layer. When |is_silence| is true, the network is not run:
// the recurrent state is reset and 0 is returned.
float RnnBasedVad::ComputeVadProbability(
    rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
    bool is_silence) {
  if (!is_silence) {
    // Feed each layer with the output of the previous one.
    input_layer_.ComputeOutput(feature_vector);
    hidden_layer_.ComputeOutput(input_layer_.GetOutput());
    output_layer_.ComputeOutput(hidden_layer_.GetOutput());
    return output_layer_.GetOutput()[0];
  }
  Reset();
  return 0.f;
}
|
||||
|
||||
} // namespace rnn_vad
|
||||
} // namespace webrtc
|
Reference in New Issue
Block a user