This is from the upstream library commit id 3326535126e435f1ba647885ce43a8f0f3d317eb, corresponding to Chromium 88.0.4290.1.
/*
 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/rnn_vad/rnn.h"

// Defines WEBRTC_ARCH_X86_FAMILY, used below.
#include "rtc_base/system/arch.h"

#if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h>
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
#endif
#include <algorithm>
#include <array>
#include <cmath>
#include <numeric>

#include "rtc_base/checks.h"
#include "rtc_base/logging.h"
#include "third_party/rnnoise/src/rnn_activations.h"
#include "third_party/rnnoise/src/rnn_vad_weights.h"
namespace webrtc {
namespace rnn_vad {
namespace {

using rnnoise::kWeightsScale;

using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseBias;
using rnnoise::kInputDenseWeights;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
              "Increase kRecurrentLayersMaxUnits.");

using rnnoise::kOutputDenseBias;
using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;

inline float RectifiedLinearUnit(float x) {
  return x < 0.f ? 0.f : x;
}

std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
  std::vector<float> scaled_params(params.size());
  std::transform(params.begin(), params.end(), scaled_params.begin(),
                 [](int8_t x) -> float {
                   return rnnoise::kWeightsScale * static_cast<float>(x);
                 });
  return scaled_params;
}

// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
// function to improve setup time.
// Casts and scales |weights| and re-arranges the layout.
std::vector<float> GetPreprocessedFcWeights(
    rtc::ArrayView<const int8_t> weights,
    size_t output_size) {
  if (output_size == 1) {
    return GetScaledParams(weights);
  }
  // Transpose, scale and cast.
  const size_t input_size = rtc::CheckedDivExact(weights.size(), output_size);
  std::vector<float> w(weights.size());
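  // The source layout has the output index as the fastest varying dimension
  // (shape [input][output]); the destination is laid out as [output][input],
  // so that each output node reads its weights from a contiguous row.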
  for (size_t o = 0; o < output_size; ++o) {
    for (size_t i = 0; i < input_size; ++i) {
      w[o * input_size + i] = rnnoise::kWeightsScale *
                              static_cast<float>(weights[i * output_size + o]);
    }
  }
  return w;
}

constexpr size_t kNumGruGates = 3;  // Update, reset, output.

// TODO(bugs.chromium.org/10480): Hard-code the optimized layout and remove
// this function to improve setup time.
// Casts and scales |tensor_src| for a GRU layer and re-arranges the layout.
// It works for weights, recurrent weights and biases.
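// The source tensor is laid out as [n][gate][output] (the gate index varies
// faster than the input index); the destination layout is [gate][output][n],
// so that within each gate every output node reads a contiguous row.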
std::vector<float> GetPreprocessedGruTensor(
    rtc::ArrayView<const int8_t> tensor_src,
    size_t output_size) {
  // Transpose, cast and scale.
  // |n| is the size of the first dimension of the 3-dim tensor |tensor_src|.
  const size_t n =
      rtc::CheckedDivExact(tensor_src.size(), output_size * kNumGruGates);
  const size_t stride_src = kNumGruGates * output_size;
  const size_t stride_dst = n * output_size;
  std::vector<float> tensor_dst(tensor_src.size());
  for (size_t g = 0; g < kNumGruGates; ++g) {
    for (size_t o = 0; o < output_size; ++o) {
      for (size_t i = 0; i < n; ++i) {
        tensor_dst[g * stride_dst + o * n + i] =
            rnnoise::kWeightsScale *
            static_cast<float>(
                tensor_src[i * stride_src + g * output_size + o]);
      }
    }
  }
  return tensor_dst;
}

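// Computes a GRU update or reset gate as sigmoid(W x + R h + b), where |x| is
// the layer input and |h| the recurrent state.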
void ComputeGruUpdateResetGates(size_t input_size,
                                size_t output_size,
                                rtc::ArrayView<const float> weights,
                                rtc::ArrayView<const float> recurrent_weights,
                                rtc::ArrayView<const float> bias,
                                rtc::ArrayView<const float> input,
                                rtc::ArrayView<const float> state,
                                rtc::ArrayView<float> gate) {
  for (size_t o = 0; o < output_size; ++o) {
    gate[o] = bias[o];
    for (size_t i = 0; i < input_size; ++i) {
      gate[o] += input[i] * weights[o * input_size + i];
    }
    for (size_t s = 0; s < output_size; ++s) {
      gate[o] += state[s] * recurrent_weights[o * output_size + s];
    }
    gate[o] = SigmoidApproximated(gate[o]);
  }
}

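// Computes the GRU candidate ("output") gate, with the reset gate scaling the
// recurrent contribution element-wise: ReLU(W x + R (r * h) + b). Note that
// this network uses a rectifier where a conventional GRU uses tanh.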
void ComputeGruOutputGate(size_t input_size,
                          size_t output_size,
                          rtc::ArrayView<const float> weights,
                          rtc::ArrayView<const float> recurrent_weights,
                          rtc::ArrayView<const float> bias,
                          rtc::ArrayView<const float> input,
                          rtc::ArrayView<const float> state,
                          rtc::ArrayView<const float> reset,
                          rtc::ArrayView<float> gate) {
  for (size_t o = 0; o < output_size; ++o) {
    gate[o] = bias[o];
    for (size_t i = 0; i < input_size; ++i) {
      gate[o] += input[i] * weights[o * input_size + i];
    }
    for (size_t s = 0; s < output_size; ++s) {
      gate[o] += state[s] * recurrent_weights[o * output_size + s] * reset[s];
    }
    gate[o] = RectifiedLinearUnit(gate[o]);
  }
}

// Gated recurrent unit (GRU) layer un-optimized implementation.
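// With u = update gate, r = reset gate and c = candidate state, the new state
// is computed as h' = u * h + (1 - u) * c (element-wise).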
void ComputeGruLayerOutput(size_t input_size,
                           size_t output_size,
                           rtc::ArrayView<const float> input,
                           rtc::ArrayView<const float> weights,
                           rtc::ArrayView<const float> recurrent_weights,
                           rtc::ArrayView<const float> bias,
                           rtc::ArrayView<float> state) {
  RTC_DCHECK_EQ(input_size, input.size());
  // Stride and offset used to read parameter arrays.
  const size_t stride_in = input_size * output_size;
  const size_t stride_out = output_size * output_size;

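  // |weights|, |recurrent_weights| and |bias| concatenate the parameters of
  // the three gates in the order update, reset, output; hence the subview
  // offsets below.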
  // Update gate.
  std::array<float, kRecurrentLayersMaxUnits> update;
  ComputeGruUpdateResetGates(
      input_size, output_size, weights.subview(0, stride_in),
      recurrent_weights.subview(0, stride_out), bias.subview(0, output_size),
      input, state, update);

  // Reset gate.
  std::array<float, kRecurrentLayersMaxUnits> reset;
  ComputeGruUpdateResetGates(
      input_size, output_size, weights.subview(stride_in, stride_in),
      recurrent_weights.subview(stride_out, stride_out),
      bias.subview(output_size, output_size), input, state, reset);

  // Output gate.
  std::array<float, kRecurrentLayersMaxUnits> output;
  ComputeGruOutputGate(
      input_size, output_size, weights.subview(2 * stride_in, stride_in),
      recurrent_weights.subview(2 * stride_out, stride_out),
      bias.subview(2 * output_size, output_size), input, state, reset, output);

  // Update output through the update gates and update the state.
  for (size_t o = 0; o < output_size; ++o) {
    output[o] = update[o] * state[o] + (1.f - update[o]) * output[o];
    state[o] = output[o];
  }
}

// Fully connected layer un-optimized implementation.
void ComputeFullyConnectedLayerOutput(
    size_t input_size,
    size_t output_size,
    rtc::ArrayView<const float> input,
    rtc::ArrayView<const float> bias,
    rtc::ArrayView<const float> weights,
    rtc::FunctionView<float(float)> activation_function,
    rtc::ArrayView<float> output) {
  RTC_DCHECK_EQ(input.size(), input_size);
  RTC_DCHECK_EQ(bias.size(), output_size);
  RTC_DCHECK_EQ(weights.size(), input_size * output_size);
  for (size_t o = 0; o < output_size; ++o) {
    output[o] = bias[o];
    // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
    // |weights_| change the performance across different platforms.
    for (size_t i = 0; i < input_size; ++i) {
      output[o] += input[i] * weights[o * input_size + i];
    }
    output[o] = activation_function(output[o]);
  }
}

#if defined(WEBRTC_ARCH_X86_FAMILY)
// Fully connected layer SSE2 implementation.
void ComputeFullyConnectedLayerOutputSse2(
    size_t input_size,
    size_t output_size,
    rtc::ArrayView<const float> input,
    rtc::ArrayView<const float> bias,
    rtc::ArrayView<const float> weights,
    rtc::FunctionView<float(float)> activation_function,
    rtc::ArrayView<float> output) {
  RTC_DCHECK_EQ(input.size(), input_size);
  RTC_DCHECK_EQ(bias.size(), output_size);
  RTC_DCHECK_EQ(weights.size(), input_size * output_size);
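  // Process the input in blocks of four floats; |offset| marks the start of
  // the tail that is handled with scalar code below.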
  const size_t input_size_by_4 = input_size >> 2;
  const size_t offset = input_size & ~3;
  __m128 sum_wx_128;
  const float* v = reinterpret_cast<const float*>(&sum_wx_128);
  for (size_t o = 0; o < output_size; ++o) {
    // Perform 128 bit vector operations.
    sum_wx_128 = _mm_set1_ps(0);
    const float* x_p = input.data();
    const float* w_p = weights.data() + o * input_size;
    for (size_t i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
      sum_wx_128 = _mm_add_ps(sum_wx_128,
                              _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
    }
    // Perform non-vector operations for any remaining items, sum up bias term
    // and results from the vectorized code, and apply the activation function.
    output[o] = activation_function(
        std::inner_product(input.begin() + offset, input.end(),
                           weights.begin() + o * input_size + offset,
                           bias[o] + v[0] + v[1] + v[2] + v[3]));
  }
}
#endif

}  // namespace

FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    rtc::FunctionView<float(float)> activation_function,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetScaledParams(bias)),
      weights_(GetPreprocessedFcWeights(weights, output_size)),
      activation_function_(activation_function),
      optimization_(optimization) {
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}

FullyConnectedLayer::~FullyConnectedLayer() = default;

rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
  return rtc::ArrayView<const float>(output_.data(), output_size_);
}

void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      ComputeFullyConnectedLayerOutputSse2(input_size_, output_size_, input,
                                           bias_, weights_,
                                           activation_function_, output_);
      break;
#endif
#if defined(WEBRTC_HAS_NEON)
    case Optimization::kNeon:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
      break;
#endif
    default:
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
  }
}

GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetPreprocessedGruTensor(bias, output_size)),
      weights_(GetPreprocessedGruTensor(weights, output_size)),
      recurrent_weights_(
          GetPreprocessedGruTensor(recurrent_weights, output_size)),
      optimization_(optimization) {
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
         "sufficient.";
  RTC_DCHECK_EQ(kNumGruGates * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(kNumGruGates * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  RTC_DCHECK_EQ(kNumGruGates * output_size_ * output_size_,
                recurrent_weights_.size())
      << "Mismatching input-output size and recurrent weight coefficients array"
         " size.";
  Reset();
}

GatedRecurrentLayer::~GatedRecurrentLayer() = default;

rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
  return rtc::ArrayView<const float>(state_.data(), output_size_);
}

void GatedRecurrentLayer::Reset() {
  state_.fill(0.f);
}

void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2.
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      break;
#endif
#if defined(WEBRTC_HAS_NEON)
    case Optimization::kNeon:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      break;
#endif
    default:
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
  }
}

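// The VAD network chains three layers: a fully connected input layer with a
// tanh-like activation, a GRU hidden layer and a fully connected output layer
// with a sigmoid-like activation whose first unit is the voice probability.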
RnnBasedVad::RnnBasedVad()
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated,
                   DetectOptimization()),
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    DetectOptimization()),
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated,
                    DetectOptimization()) {
  // Input-output chaining size checks.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layers sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layers sizes do not match.";
}

RnnBasedVad::~RnnBasedVad() = default;

void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}

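// Minimal usage sketch (hypothetical caller; assumes a feature extractor that
// fills a kFeatureVectorSize-long vector for each analysis frame):
//
//   RnnBasedVad vad;
//   std::array<float, kFeatureVectorSize> features{};
//   // ... fill |features| and set |is_silence| via an energy threshold ...
//   const float p = vad.ComputeVadProbability(features, /*is_silence=*/false);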
float RnnBasedVad::ComputeVadProbability(
    rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
    bool is_silence) {
  if (is_silence) {
    Reset();
    return 0.f;
  }
  input_layer_.ComputeOutput(feature_vector);
  hidden_layer_.ComputeOutput(input_layer_.GetOutput());
  output_layer_.ComputeOutput(hidden_layer_.GetOutput());
  const auto vad_output = output_layer_.GetOutput();
  return vad_output[0];
}

}  // namespace rnn_vad
}  // namespace webrtc