/*
 *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/rnn_vad/rnn.h"

// Defines WEBRTC_ARCH_X86_FAMILY, used below.
#include "rtc_base/system/arch.h"

#if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h>
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
#endif
#include <algorithm>
#include <array>
#include <cmath>
#include <numeric>

#include "rtc_base/checks.h"
#include "rtc_base/logging.h"
#include "third_party/rnnoise/src/rnn_activations.h"
#include "third_party/rnnoise/src/rnn_vad_weights.h"

namespace webrtc {
namespace rnn_vad {
namespace {

using rnnoise::kWeightsScale;

using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseBias;
using rnnoise::kInputDenseWeights;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
              "Increase kRecurrentLayersMaxUnits.");

using rnnoise::kOutputDenseBias;
using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;

// Returns 0 if |x| is negative and |x| otherwise.
inline float RectifiedLinearUnit(float x) {
  return x < 0.f ? 0.f : x;
}

// Returns a copy of |params| in which every item is cast to float and
// multiplied by rnnoise::kWeightsScale. The order of the items is unchanged.
std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
  std::vector<float> scaled_params(params.size());
  std::transform(params.begin(), params.end(), scaled_params.begin(),
                 [](int8_t x) -> float {
                   return rnnoise::kWeightsScale * static_cast<float>(x);
                 });
  return scaled_params;
}

// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
// function to improve setup time.
// Casts and scales |weights| and re-arranges the layout.
// |weights| is read as an (input, output) matrix, i.e. the item for input |i|
// and output |o| is at index |i * output_size + o|. The returned vector is the
// transposed (output, input) matrix, so that the weights of each output unit
// are contiguous. |weights.size()| must be a multiple of |output_size|.
std::vector<float> GetPreprocessedFcWeights(
    rtc::ArrayView<const int8_t> weights,
    size_t output_size) {
  // With a single output unit the transposed layout equals the source layout.
  if (output_size == 1) {
    return GetScaledParams(weights);
  }
  // Transpose, scale and cast.
  const size_t input_size = rtc::CheckedDivExact(weights.size(), output_size);
  std::vector<float> w(weights.size());
  for (size_t o = 0; o < output_size; ++o) {
    for (size_t i = 0; i < input_size; ++i) {
      w[o * input_size + i] = rnnoise::kWeightsScale *
                              static_cast<float>(weights[i * output_size + o]);
    }
  }
  return w;
}

constexpr size_t kNumGruGates = 3;  // Update, reset, output.

// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
// function to improve setup time.
// Casts and scales |tensor_src| for a GRU layer and re-arranges the layout.
// It works both for weights, recurrent weights and bias.
// |tensor_src| is read as an (n, gate, output) tensor and the returned vector
// is laid out as (gate, output, n), so that the parameters of each gate are
// contiguous. |tensor_src.size()| must be a multiple of
// |kNumGruGates * output_size|.
std::vector<float> GetPreprocessedGruTensor(
    rtc::ArrayView<const int8_t> tensor_src,
    size_t output_size) {
  // Transpose, cast and scale.
  // |n| is the size of the first dimension of the 3-dim tensor |tensor_src|.
  const size_t n =
      rtc::CheckedDivExact(tensor_src.size(), output_size * kNumGruGates);
  const size_t stride_src = kNumGruGates * output_size;
  const size_t stride_dst = n * output_size;
  std::vector<float> tensor_dst(tensor_src.size());
  for (size_t g = 0; g < kNumGruGates; ++g) {
    for (size_t o = 0; o < output_size; ++o) {
      for (size_t i = 0; i < n; ++i) {
        tensor_dst[g * stride_dst + o * n + i] =
            rnnoise::kWeightsScale *
            static_cast<float>(
                tensor_src[i * stride_src + g * output_size + o]);
      }
    }
  }
  return tensor_dst;
}

// Computes a GRU update or reset gate. For each output unit |o|, writes into
// |gate[o]| the value of SigmoidApproximated() applied to
//   bias[o] + sum_i(input[i] * weights[o * input_size + i])
//           + sum_s(state[s] * recurrent_weights[o * output_size + s]).
// The first |output_size| items of |gate| are overwritten.
void ComputeGruUpdateResetGates(size_t input_size,
                                size_t output_size,
                                rtc::ArrayView<const float> weights,
                                rtc::ArrayView<const float> recurrent_weights,
                                rtc::ArrayView<const float> bias,
                                rtc::ArrayView<const float> input,
                                rtc::ArrayView<const float> state,
                                rtc::ArrayView<float> gate) {
  for (size_t o = 0; o < output_size; ++o) {
    gate[o] = bias[o];
    for (size_t i = 0; i < input_size; ++i) {
      gate[o] += input[i] * weights[o * input_size + i];
    }
    for (size_t s = 0; s < output_size; ++s) {
      gate[o] += state[s] * recurrent_weights[o * output_size + s];
    }
    gate[o] = SigmoidApproximated(gate[o]);
  }
}

// Computes the GRU output gate. Same as ComputeGruUpdateResetGates() except
// that each recurrent term is also multiplied by |reset[s]| and that
// RectifiedLinearUnit() is used as activation function.
// The first |output_size| items of |gate| are overwritten.
void ComputeGruOutputGate(size_t input_size,
                          size_t output_size,
                          rtc::ArrayView<const float> weights,
                          rtc::ArrayView<const float> recurrent_weights,
                          rtc::ArrayView<const float> bias,
                          rtc::ArrayView<const float> input,
                          rtc::ArrayView<const float> state,
                          rtc::ArrayView<const float> reset,
                          rtc::ArrayView<float> gate) {
  for (size_t o = 0; o < output_size; ++o) {
    gate[o] = bias[o];
    for (size_t i = 0; i < input_size; ++i) {
      gate[o] += input[i] * weights[o * input_size + i];
    }
    for (size_t s = 0; s < output_size; ++s) {
      gate[o] +=
          state[s] * recurrent_weights[o * output_size + s] * reset[s];
    }
    gate[o] = RectifiedLinearUnit(gate[o]);
  }
}

// Gated recurrent unit (GRU) layer un-optimized implementation.
// |weights|, |recurrent_weights| and |bias| hold the parameters of the update,
// reset and output gates, in this order, each gate being contiguous (i.e., the
// layout produced by GetPreprocessedGruTensor()). |state| is read as the
// previous layer output and its first |output_size| items are overwritten with
// the new output.
void ComputeGruLayerOutput(size_t input_size,
                           size_t output_size,
                           rtc::ArrayView<const float> input,
                           rtc::ArrayView<const float> weights,
                           rtc::ArrayView<const float> recurrent_weights,
                           rtc::ArrayView<const float> bias,
                           rtc::ArrayView<float> state) {
  RTC_DCHECK_EQ(input_size, input.size());
  // Stride and offset used to read parameter arrays.
  const size_t stride_in = input_size * output_size;
  const size_t stride_out = output_size * output_size;

  // Update gate.
  std::array<float, kRecurrentLayersMaxUnits> update;
  ComputeGruUpdateResetGates(
      input_size, output_size, weights.subview(0, stride_in),
      recurrent_weights.subview(0, stride_out), bias.subview(0, output_size),
      input, state, update);

  // Reset gate.
  std::array<float, kRecurrentLayersMaxUnits> reset;
  ComputeGruUpdateResetGates(
      input_size, output_size, weights.subview(stride_in, stride_in),
      recurrent_weights.subview(stride_out, stride_out),
      bias.subview(output_size, output_size), input, state, reset);

  // Output gate.
  std::array<float, kRecurrentLayersMaxUnits> output;
  ComputeGruOutputGate(
      input_size, output_size, weights.subview(2 * stride_in, stride_in),
      recurrent_weights.subview(2 * stride_out, stride_out),
      bias.subview(2 * output_size, output_size), input, state, reset, output);

  // Update output through the update gates and update the state.
  // All the gates have been computed from the previous state above, hence
  // |state| can be safely overwritten in place here.
  for (size_t o = 0; o < output_size; ++o) {
    output[o] = update[o] * state[o] + (1.f - update[o]) * output[o];
    state[o] = output[o];
  }
}

// Fully connected layer un-optimized implementation.
// For each output unit |o|, writes into |output[o]| the value of
// |activation_function| applied to
//   bias[o] + sum_i(input[i] * weights[o * input_size + i]).
// |weights| must be in the (output, input) layout produced by
// GetPreprocessedFcWeights().
void ComputeFullyConnectedLayerOutput(
    size_t input_size,
    size_t output_size,
    rtc::ArrayView<const float> input,
    rtc::ArrayView<const float> bias,
    rtc::ArrayView<const float> weights,
    rtc::FunctionView<float(float)> activation_function,
    rtc::ArrayView<float> output) {
  RTC_DCHECK_EQ(input.size(), input_size);
  RTC_DCHECK_EQ(bias.size(), output_size);
  RTC_DCHECK_EQ(weights.size(), input_size * output_size);
  for (size_t o = 0; o < output_size; ++o) {
    output[o] = bias[o];
    // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
    // |weights_| change the performance across different platforms.
    for (size_t i = 0; i < input_size; ++i) {
      output[o] += input[i] * weights[o * input_size + i];
    }
    output[o] = activation_function(output[o]);
  }
}

#if defined(WEBRTC_ARCH_X86_FAMILY)
// Fully connected layer SSE2 implementation.
// Same contract as ComputeFullyConnectedLayerOutput(). The dot product between
// |input| and the weights of each output unit is computed 4 floats at a time;
// the items left when |input_size| is not a multiple of 4 are handled with
// scalar operations. The terms are summed in a different order than in the
// un-optimized implementation.
void ComputeFullyConnectedLayerOutputSse2(
    size_t input_size,
    size_t output_size,
    rtc::ArrayView<const float> input,
    rtc::ArrayView<const float> bias,
    rtc::ArrayView<const float> weights,
    rtc::FunctionView<float(float)> activation_function,
    rtc::ArrayView<float> output) {
  RTC_DCHECK_EQ(input.size(), input_size);
  RTC_DCHECK_EQ(bias.size(), output_size);
  RTC_DCHECK_EQ(weights.size(), input_size * output_size);
  // Number of 4-float groups in |input|.
  const size_t input_size_by_4 = input_size >> 2;
  // Index of the first input item that is not covered by the vectorized loop
  // (|input_size| rounded down to a multiple of 4).
  const size_t offset = input_size & ~3;
  __m128 sum_wx_128;
  // Used to read the 4 partial sums held in |sum_wx_128| as floats.
  const float* v = reinterpret_cast<const float*>(&sum_wx_128);
  for (size_t o = 0; o < output_size; ++o) {
    // Perform 128 bit vector operations.
    sum_wx_128 = _mm_set1_ps(0);
    const float* x_p = input.data();
    const float* w_p = weights.data() + o * input_size;
    for (size_t i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
      sum_wx_128 = _mm_add_ps(
          sum_wx_128, _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
    }
    // Perform non-vector operations for any remaining items, sum up bias term
    // and results from the vectorized code, and apply the activation function.
    output[o] = activation_function(
        std::inner_product(input.begin() + offset, input.end(),
                           weights.begin() + o * input_size + offset,
                           bias[o] + v[0] + v[1] + v[2] + v[3]));
  }
}
#endif

}  // namespace

// Stores scaled float copies of |bias| and |weights|; the weights are also
// re-arranged by GetPreprocessedFcWeights(). |optimization| selects the
// implementation used by ComputeOutput().
FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    rtc::FunctionView<float(float)> activation_function,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetScaledParams(bias)),
      weights_(GetPreprocessedFcWeights(weights, output_size)),
      activation_function_(activation_function),
      optimization_(optimization) {
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}

FullyConnectedLayer::~FullyConnectedLayer() = default;

// Returns a view on the first |output_size_| items of the output buffer, i.e.,
// the values written by the last ComputeOutput() call.
rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
  return rtc::ArrayView<const float>(output_.data(), output_size_);
}

// Computes the layer output for |input| and stores it; read it via
// GetOutput(). SSE2 is the only optimized implementation available; any other
// optimization falls back to the un-optimized one.
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      ComputeFullyConnectedLayerOutputSse2(input_size_, output_size_, input,
                                           bias_, weights_,
                                           activation_function_, output_);
      break;
#endif
#if defined(WEBRTC_HAS_NEON)
    case Optimization::kNeon:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
      break;
#endif
    default:
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
  }
}

// Stores scaled and re-arranged float copies of |bias|, |weights| and
// |recurrent_weights| (see GetPreprocessedGruTensor()) and zeroes the state.
GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetPreprocessedGruTensor(bias, output_size)),
      weights_(GetPreprocessedGruTensor(weights, output_size)),
      recurrent_weights_(
          GetPreprocessedGruTensor(recurrent_weights, output_size)),
      optimization_(optimization) {
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
         "sufficient.";
  RTC_DCHECK_EQ(kNumGruGates * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(kNumGruGates * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  RTC_DCHECK_EQ(kNumGruGates * output_size_ * output_size_,
                recurrent_weights_.size())
      << "Mismatching input-output size and recurrent weight coefficients array"
         " size.";
  Reset();
}

GatedRecurrentLayer::~GatedRecurrentLayer() = default;

// Returns a view on the first |output_size_| items of the state, which is also
// the layer output.
rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
  return rtc::ArrayView<const float>(state_.data(), output_size_);
}

// Sets all the items of the state to zero.
void GatedRecurrentLayer::Reset() {
  state_.fill(0.f);
}

// Updates the state given |input|; read the result via GetOutput(). No
// optimized implementation is available yet, hence every case uses the
// un-optimized one.
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2.
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      break;
#endif
#if defined(WEBRTC_HAS_NEON)
    case Optimization::kNeon:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      break;
#endif
    default:
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
  }
}

// Builds the network: fully connected input layer (TansigApproximated), GRU
// hidden layer, fully connected output layer (SigmoidApproximated). The
// parameters come from the rnnoise weights header.
RnnBasedVad::RnnBasedVad()
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated,
                   DetectOptimization()),
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    DetectOptimization()),
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated,
                    DetectOptimization()) {
  // Input-output chaining size checks.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layers sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layers sizes do not match.";
}

RnnBasedVad::~RnnBasedVad() = default;

// Resets the state of the hidden (recurrent) layer, the only stateful one.
void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}

// Feeds |feature_vector| through the three layers and returns the first item
// of the output layer. If |is_silence| is true, the network is not run: the
// hidden state is reset and 0 is returned.
float RnnBasedVad::ComputeVadProbability(
    rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
    bool is_silence) {
  if (is_silence) {
    Reset();
    return 0.f;
  }
  input_layer_.ComputeOutput(feature_vector);
  hidden_layer_.ComputeOutput(input_layer_.GetOutput());
  output_layer_.ComputeOutput(hidden_layer_.GetOutput());
  const auto vad_output = output_layer_.GetOutput();
  return vad_output[0];
}

}  // namespace rnn_vad
}  // namespace webrtc