webrtc-audio-processing/webrtc/modules/audio_processing/agc2/clipping_predictor.cc

/*
 *  Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/clipping_predictor.h"

#include <algorithm>
#include <memory>

#include "common_audio/include/audio_util.h"
#include "modules/audio_processing/agc2/clipping_predictor_level_buffer.h"
#include "modules/audio_processing/agc2/gain_map_internal.h"
#include "rtc_base/checks.h"
#include "rtc_base/logging.h"
#include "rtc_base/numerics/safe_minmax.h"

namespace webrtc {
namespace {

constexpr int kClippingPredictorMaxGainChange = 15;

// Returns an input volume in the [`min_input_volume`, `max_input_volume`] range
// that reduces `gain_error_db`, which is a gain error estimated when
// `input_volume` was applied, according to a fixed gain map.
int ComputeVolumeUpdate(int gain_error_db,
                        int input_volume,
                        int min_input_volume,
                        int max_input_volume) {
  RTC_DCHECK_GE(input_volume, 0);
  RTC_DCHECK_LE(input_volume, max_input_volume);
  if (gain_error_db == 0) {
    return input_volume;
  }
  int new_volume = input_volume;
  if (gain_error_db > 0) {
    while (kGainMap[new_volume] - kGainMap[input_volume] < gain_error_db &&
           new_volume < max_input_volume) {
      ++new_volume;
    }
  } else {
    while (kGainMap[new_volume] - kGainMap[input_volume] > gain_error_db &&
           new_volume > min_input_volume) {
      --new_volume;
    }
  }
  return new_volume;
}

float ComputeCrestFactor(const ClippingPredictorLevelBuffer::Level& level) {
  const float crest_factor =
      FloatS16ToDbfs(level.max) - FloatS16ToDbfs(std::sqrt(level.average));
  return crest_factor;
}

// Crest factor-based clipping prediction and clipped level step estimation.
class ClippingEventPredictor : public ClippingPredictor {
 public:
  // ClippingEventPredictor with `num_channels` channels (limited to values
  // higher than zero); window size `window_length` and reference window size
  // `reference_window_length` (both referring to the number of frames in the
  // respective sliding windows and limited to values higher than zero);
  // reference window delay `reference_window_delay` (delay in frames, limited
  // to values zero and higher with an additional requirement of
  // `window_length` < `reference_window_length` + reference_window_delay`);
  // and an estimation peak threshold `clipping_threshold` and a crest factor
  // drop threshold `crest_factor_margin` (both in dB).
  ClippingEventPredictor(int num_channels,
                         int window_length,
                         int reference_window_length,
                         int reference_window_delay,
                         float clipping_threshold,
                         float crest_factor_margin)
      : window_length_(window_length),
        reference_window_length_(reference_window_length),
        reference_window_delay_(reference_window_delay),
        clipping_threshold_(clipping_threshold),
        crest_factor_margin_(crest_factor_margin) {
    RTC_DCHECK_GT(num_channels, 0);
    RTC_DCHECK_GT(window_length, 0);
    RTC_DCHECK_GT(reference_window_length, 0);
    RTC_DCHECK_GE(reference_window_delay, 0);
    RTC_DCHECK_GT(reference_window_length + reference_window_delay,
                  window_length);
    const int buffer_length = GetMinFramesProcessed();
    RTC_DCHECK_GT(buffer_length, 0);
    for (int i = 0; i < num_channels; ++i) {
      ch_buffers_.push_back(
          std::make_unique<ClippingPredictorLevelBuffer>(buffer_length));
    }
  }

  ClippingEventPredictor(const ClippingEventPredictor&) = delete;
  ClippingEventPredictor& operator=(const ClippingEventPredictor&) = delete;
  ~ClippingEventPredictor() {}

  void Reset() {
    const int num_channels = ch_buffers_.size();
    for (int i = 0; i < num_channels; ++i) {
      ch_buffers_[i]->Reset();
    }
  }

  // Analyzes a frame of audio and stores the framewise metrics in
  // `ch_buffers_`.
  void Analyze(const AudioFrameView<const float>& frame) {
    const int num_channels = frame.num_channels();
    RTC_DCHECK_EQ(num_channels, ch_buffers_.size());
    const int samples_per_channel = frame.samples_per_channel();
    RTC_DCHECK_GT(samples_per_channel, 0);
    for (int channel = 0; channel < num_channels; ++channel) {
      float sum_squares = 0.0f;
      float peak = 0.0f;
      for (const auto& sample : frame.channel(channel)) {
        sum_squares += sample * sample;
        peak = std::max(std::fabs(sample), peak);
      }
      ch_buffers_[channel]->Push(
          {sum_squares / static_cast<float>(samples_per_channel), peak});
    }
  }

  // Estimates the analog gain adjustment for channel `channel` using a
  // sliding window over the frame-wise metrics in `ch_buffers_`. Returns an
  // estimate for the clipped level step equal to `default_clipped_level_step_`
  // if at least `GetMinFramesProcessed()` frames have been processed since the
  // last reset and a clipping event is predicted. `level`, `min_mic_level`, and
  // `max_mic_level` are limited to [0, 255] and `default_step` to [1, 255].
  std::optional<int> EstimateClippedLevelStep(int channel,
                                              int level,
                                              int default_step,
                                              int min_mic_level,
                                              int max_mic_level) const {
    RTC_CHECK_GE(channel, 0);
    RTC_CHECK_LT(channel, ch_buffers_.size());
    RTC_DCHECK_GE(level, 0);
    RTC_DCHECK_LE(level, 255);
    RTC_DCHECK_GT(default_step, 0);
    RTC_DCHECK_LE(default_step, 255);
    RTC_DCHECK_GE(min_mic_level, 0);
    RTC_DCHECK_LE(min_mic_level, 255);
    RTC_DCHECK_GE(max_mic_level, 0);
    RTC_DCHECK_LE(max_mic_level, 255);
    if (level <= min_mic_level) {
      return std::nullopt;
    }
    if (PredictClippingEvent(channel)) {
      const int new_level =
          rtc::SafeClamp(level - default_step, min_mic_level, max_mic_level);
      const int step = level - new_level;
      if (step > 0) {
        return step;
      }
    }
    return std::nullopt;
  }

 private:
  int GetMinFramesProcessed() const {
    return reference_window_delay_ + reference_window_length_;
  }

  // Predicts clipping events based on the processed audio frames. Returns
  // true if a clipping event is likely.
  bool PredictClippingEvent(int channel) const {
    const auto metrics =
        ch_buffers_[channel]->ComputePartialMetrics(0, window_length_);
    if (!metrics.has_value() ||
        !(FloatS16ToDbfs(metrics.value().max) > clipping_threshold_)) {
      return false;
    }
    const auto reference_metrics = ch_buffers_[channel]->ComputePartialMetrics(
        reference_window_delay_, reference_window_length_);
    if (!reference_metrics.has_value()) {
      return false;
    }
    const float crest_factor = ComputeCrestFactor(metrics.value());
    const float reference_crest_factor =
        ComputeCrestFactor(reference_metrics.value());
    if (crest_factor < reference_crest_factor - crest_factor_margin_) {
      return true;
    }
    return false;
  }

  std::vector<std::unique_ptr<ClippingPredictorLevelBuffer>> ch_buffers_;
  const int window_length_;
  const int reference_window_length_;
  const int reference_window_delay_;
  const float clipping_threshold_;
  const float crest_factor_margin_;
};

// Performs crest factor-based clipping peak prediction.
class ClippingPeakPredictor : public ClippingPredictor {
 public:
  // Ctor. ClippingPeakPredictor with `num_channels` channels (limited to values
  // higher than zero); window size `window_length` and reference window size
  // `reference_window_length` (both referring to the number of frames in the
  // respective sliding windows and limited to values higher than zero);
  // reference window delay `reference_window_delay` (delay in frames, limited
  // to values zero and higher with an additional requirement of
  // `window_length` < `reference_window_length` + reference_window_delay`);
  // and a clipping prediction threshold `clipping_threshold` (in dB). Adaptive
  // clipped level step estimation is used if `adaptive_step_estimation` is
  // true.
  explicit ClippingPeakPredictor(int num_channels,
                                 int window_length,
                                 int reference_window_length,
                                 int reference_window_delay,
                                 int clipping_threshold,
                                 bool adaptive_step_estimation)
      : window_length_(window_length),
        reference_window_length_(reference_window_length),
        reference_window_delay_(reference_window_delay),
        clipping_threshold_(clipping_threshold),
        adaptive_step_estimation_(adaptive_step_estimation) {
    RTC_DCHECK_GT(num_channels, 0);
    RTC_DCHECK_GT(window_length, 0);
    RTC_DCHECK_GT(reference_window_length, 0);
    RTC_DCHECK_GE(reference_window_delay, 0);
    RTC_DCHECK_GT(reference_window_length + reference_window_delay,
                  window_length);
    const int buffer_length = GetMinFramesProcessed();
    RTC_DCHECK_GT(buffer_length, 0);
    for (int i = 0; i < num_channels; ++i) {
      ch_buffers_.push_back(
          std::make_unique<ClippingPredictorLevelBuffer>(buffer_length));
    }
  }

  ClippingPeakPredictor(const ClippingPeakPredictor&) = delete;
  ClippingPeakPredictor& operator=(const ClippingPeakPredictor&) = delete;
  ~ClippingPeakPredictor() {}

  void Reset() {
    const int num_channels = ch_buffers_.size();
    for (int i = 0; i < num_channels; ++i) {
      ch_buffers_[i]->Reset();
    }
  }

  // Analyzes a frame of audio and stores the framewise metrics in
  // `ch_buffers_`.
  void Analyze(const AudioFrameView<const float>& frame) {
    const int num_channels = frame.num_channels();
    RTC_DCHECK_EQ(num_channels, ch_buffers_.size());
    const int samples_per_channel = frame.samples_per_channel();
    RTC_DCHECK_GT(samples_per_channel, 0);
    for (int channel = 0; channel < num_channels; ++channel) {
      float sum_squares = 0.0f;
      float peak = 0.0f;
      for (const auto& sample : frame.channel(channel)) {
        sum_squares += sample * sample;
        peak = std::max(std::fabs(sample), peak);
      }
      ch_buffers_[channel]->Push(
          {sum_squares / static_cast<float>(samples_per_channel), peak});
    }
  }

  // Estimates the analog gain adjustment for channel `channel` using a
  // sliding window over the frame-wise metrics in `ch_buffers_`. Returns an
  // estimate for the clipped level step (equal to
  // `default_clipped_level_step_` if `adaptive_estimation_` is false) if at
  // least `GetMinFramesProcessed()` frames have been processed since the last
  // reset and a clipping event is predicted. `level`, `min_mic_level`, and
  // `max_mic_level` are limited to [0, 255] and `default_step` to [1, 255].
  std::optional<int> EstimateClippedLevelStep(int channel,
                                              int level,
                                              int default_step,
                                              int min_mic_level,
                                              int max_mic_level) const {
    RTC_DCHECK_GE(channel, 0);
    RTC_DCHECK_LT(channel, ch_buffers_.size());
    RTC_DCHECK_GE(level, 0);
    RTC_DCHECK_LE(level, 255);
    RTC_DCHECK_GT(default_step, 0);
    RTC_DCHECK_LE(default_step, 255);
    RTC_DCHECK_GE(min_mic_level, 0);
    RTC_DCHECK_LE(min_mic_level, 255);
    RTC_DCHECK_GE(max_mic_level, 0);
    RTC_DCHECK_LE(max_mic_level, 255);
    if (level <= min_mic_level) {
      return std::nullopt;
    }
    std::optional<float> estimate_db = EstimatePeakValue(channel);
    if (estimate_db.has_value() && estimate_db.value() > clipping_threshold_) {
      int step = 0;
      if (!adaptive_step_estimation_) {
        step = default_step;
      } else {
        const int estimated_gain_change =
            rtc::SafeClamp(-static_cast<int>(std::ceil(estimate_db.value())),
                           -kClippingPredictorMaxGainChange, 0);
        step =
            std::max(level - ComputeVolumeUpdate(estimated_gain_change, level,
                                                 min_mic_level, max_mic_level),
                     default_step);
      }
      const int new_level =
          rtc::SafeClamp(level - step, min_mic_level, max_mic_level);
      if (level > new_level) {
        return level - new_level;
      }
    }
    return std::nullopt;
  }

 private:
  int GetMinFramesProcessed() {
    return reference_window_delay_ + reference_window_length_;
  }

  // Predicts clipping sample peaks based on the processed audio frames.
  // Returns the estimated peak value if clipping is predicted. Otherwise
  // returns std::nullopt.
  std::optional<float> EstimatePeakValue(int channel) const {
    const auto reference_metrics = ch_buffers_[channel]->ComputePartialMetrics(
        reference_window_delay_, reference_window_length_);
    if (!reference_metrics.has_value()) {
      return std::nullopt;
    }
    const auto metrics =
        ch_buffers_[channel]->ComputePartialMetrics(0, window_length_);
    if (!metrics.has_value() ||
        !(FloatS16ToDbfs(metrics.value().max) > clipping_threshold_)) {
      return std::nullopt;
    }
    const float reference_crest_factor =
        ComputeCrestFactor(reference_metrics.value());
    const float& mean_squares = metrics.value().average;
    const float projected_peak =
        reference_crest_factor + FloatS16ToDbfs(std::sqrt(mean_squares));
    return projected_peak;
  }

  std::vector<std::unique_ptr<ClippingPredictorLevelBuffer>> ch_buffers_;
  const int window_length_;
  const int reference_window_length_;
  const int reference_window_delay_;
  const int clipping_threshold_;
  const bool adaptive_step_estimation_;
};

}  // namespace

std::unique_ptr<ClippingPredictor> CreateClippingPredictor(
    int num_channels,
    const AudioProcessing::Config::GainController1::AnalogGainController::
        ClippingPredictor& config) {
  if (!config.enabled) {
    RTC_LOG(LS_INFO) << "[AGC2] Clipping prediction disabled.";
    return nullptr;
  }
  RTC_LOG(LS_INFO) << "[AGC2] Clipping prediction enabled.";
  using ClippingPredictorMode = AudioProcessing::Config::GainController1::
      AnalogGainController::ClippingPredictor::Mode;
  switch (config.mode) {
    case ClippingPredictorMode::kClippingEventPrediction:
      return std::make_unique<ClippingEventPredictor>(
          num_channels, config.window_length, config.reference_window_length,
          config.reference_window_delay, config.clipping_threshold,
          config.crest_factor_margin);
    case ClippingPredictorMode::kAdaptiveStepClippingPeakPrediction:
      return std::make_unique<ClippingPeakPredictor>(
          num_channels, config.window_length, config.reference_window_length,
          config.reference_window_delay, config.clipping_threshold,
          /*adaptive_step_estimation=*/true);
    case ClippingPredictorMode::kFixedStepClippingPeakPrediction:
      return std::make_unique<ClippingPeakPredictor>(
          num_channels, config.window_length, config.reference_window_length,
          config.reference_window_delay, config.clipping_threshold,
          /*adaptive_step_estimation=*/false);
  }
  RTC_DCHECK_NOTREACHED();
}

}  // namespace webrtc