Update audio_processing module
Corresponds to upstream commit 524e9b043e7e86fd72353b987c9d5f6a1ebf83e1 Update notes: * Pull in third party license file * Replace .gypi files with BUILD.gn to keep track of what changes upstream * Bunch of new filse pulled in as dependencies * Won't build yet due to changes needed on top of these
This commit is contained in:
@ -1,16 +0,0 @@
|
||||
noinst_LTLIBRARIES = libaec.la
|
||||
|
||||
libaec_la_SOURCES = interface/echo_cancellation.h \
|
||||
echo_cancellation.c \
|
||||
aec_core.h \
|
||||
aec_core.c \
|
||||
aec_core_sse2.c \
|
||||
aec_rdft.h \
|
||||
aec_rdft.c \
|
||||
aec_rdft_sse2.c \
|
||||
resampler.h \
|
||||
resampler.c
|
||||
libaec_la_CFLAGS = $(AM_CFLAGS) $(COMMON_CFLAGS) \
|
||||
-I$(top_srcdir)/src/common_audio/signal_processing_library/main/interface \
|
||||
-I$(top_srcdir)/src/system_wrappers/interface \
|
||||
-I$(top_srcdir)/src/modules/audio_processing/utility
|
@ -1,40 +0,0 @@
|
||||
# Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
#
|
||||
# Use of this source code is governed by a BSD-style license
|
||||
# that can be found in the LICENSE file in the root of the source
|
||||
# tree. An additional intellectual property rights grant can be found
|
||||
# in the file PATENTS. All contributing project authors may
|
||||
# be found in the AUTHORS file in the root of the source tree.
|
||||
|
||||
{
|
||||
'targets': [
|
||||
{
|
||||
'target_name': 'aec',
|
||||
'type': '<(library)',
|
||||
'dependencies': [
|
||||
'<(webrtc_root)/common_audio/common_audio.gyp:spl',
|
||||
'apm_util'
|
||||
],
|
||||
'include_dirs': [
|
||||
'interface',
|
||||
],
|
||||
'direct_dependent_settings': {
|
||||
'include_dirs': [
|
||||
'interface',
|
||||
],
|
||||
},
|
||||
'sources': [
|
||||
'interface/echo_cancellation.h',
|
||||
'echo_cancellation.c',
|
||||
'aec_core.h',
|
||||
'aec_core.c',
|
||||
'aec_core_sse2.c',
|
||||
'aec_rdft.h',
|
||||
'aec_rdft.c',
|
||||
'aec_rdft_sse2.c',
|
||||
'resampler.h',
|
||||
'resampler.c',
|
||||
],
|
||||
},
|
||||
],
|
||||
}
|
32
webrtc/modules/audio_processing/aec/aec_common.h
Normal file
32
webrtc/modules/audio_processing/aec/aec_common.h
Normal file
@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
|
||||
|
||||
#include "webrtc/typedefs.h"
|
||||
|
||||
#ifdef _MSC_VER /* visual c++ */
|
||||
#define ALIGN16_BEG __declspec(align(16))
|
||||
#define ALIGN16_END
|
||||
#else /* gcc or icc */
|
||||
#define ALIGN16_BEG
|
||||
#define ALIGN16_END __attribute__((aligned(16)))
|
||||
#endif
|
||||
|
||||
extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_sqrtHanning[65];
|
||||
extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_weightCurve[65];
|
||||
extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_overDriveCurve[65];
|
||||
extern const float WebRtcAec_kExtendedSmoothingCoefficients[2][2];
|
||||
extern const float WebRtcAec_kNormalSmoothingCoefficients[2][2];
|
||||
extern const float WebRtcAec_kMinFarendPSD;
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
@ -12,29 +12,18 @@
|
||||
* Specifies the interface for the AEC core.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include "signal_processing_library.h"
|
||||
#include "typedefs.h"
|
||||
|
||||
//#define AEC_DEBUG // for recording files
|
||||
#include "webrtc/typedefs.h"
|
||||
|
||||
#define FRAME_LEN 80
|
||||
#define PART_LEN 64 // Length of partition
|
||||
#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
|
||||
#define PART_LEN2 (PART_LEN * 2) // Length of partition * 2
|
||||
#define NR_PART 12 // Number of partitions
|
||||
#define FILT_LEN (PART_LEN * NR_PART) // Filter length
|
||||
#define FILT_LEN2 (FILT_LEN * 2) // Double filter length
|
||||
#define FAR_BUF_LEN (FILT_LEN2 * 2)
|
||||
#define PREF_BAND_SIZE 24
|
||||
|
||||
#define BLOCKL_MAX FRAME_LEN
|
||||
// Maximum delay in fixed point delay estimator, used for logging
|
||||
enum {kMaxDelay = 100};
|
||||
#define PART_LEN 64 // Length of partition
|
||||
#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
|
||||
#define PART_LEN2 (PART_LEN * 2) // Length of partition * 2
|
||||
#define NUM_HIGH_BANDS_MAX 2 // Max number of high bands
|
||||
|
||||
typedef float complex_t[2];
|
||||
// For performance reasons, some arrays of complex numbers are replaced by twice
|
||||
@ -46,136 +35,95 @@ typedef float complex_t[2];
|
||||
// compile time.
|
||||
|
||||
// Metrics
|
||||
enum {offsetLevel = -100};
|
||||
enum {
|
||||
kOffsetLevel = -100
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
float sfrsum;
|
||||
int sfrcounter;
|
||||
float framelevel;
|
||||
float frsum;
|
||||
int frcounter;
|
||||
float minlevel;
|
||||
float averagelevel;
|
||||
} power_level_t;
|
||||
typedef struct Stats {
|
||||
float instant;
|
||||
float average;
|
||||
float min;
|
||||
float max;
|
||||
float sum;
|
||||
float hisum;
|
||||
float himean;
|
||||
int counter;
|
||||
int hicounter;
|
||||
} Stats;
|
||||
|
||||
typedef struct {
|
||||
float instant;
|
||||
float average;
|
||||
float min;
|
||||
float max;
|
||||
float sum;
|
||||
float hisum;
|
||||
float himean;
|
||||
int counter;
|
||||
int hicounter;
|
||||
} stats_t;
|
||||
typedef struct AecCore AecCore;
|
||||
|
||||
typedef struct {
|
||||
int farBufWritePos, farBufReadPos;
|
||||
|
||||
int knownDelay;
|
||||
int inSamples, outSamples;
|
||||
int delayEstCtr;
|
||||
|
||||
void *farFrBuf, *nearFrBuf, *outFrBuf;
|
||||
|
||||
void *nearFrBufH;
|
||||
void *outFrBufH;
|
||||
|
||||
float xBuf[PART_LEN2]; // farend
|
||||
float dBuf[PART_LEN2]; // nearend
|
||||
float eBuf[PART_LEN2]; // error
|
||||
|
||||
float dBufH[PART_LEN2]; // nearend
|
||||
|
||||
float xPow[PART_LEN1];
|
||||
float dPow[PART_LEN1];
|
||||
float dMinPow[PART_LEN1];
|
||||
float dInitMinPow[PART_LEN1];
|
||||
float *noisePow;
|
||||
|
||||
float xfBuf[2][NR_PART * PART_LEN1]; // farend fft buffer
|
||||
float wfBuf[2][NR_PART * PART_LEN1]; // filter fft
|
||||
complex_t sde[PART_LEN1]; // cross-psd of nearend and error
|
||||
complex_t sxd[PART_LEN1]; // cross-psd of farend and nearend
|
||||
complex_t xfwBuf[NR_PART * PART_LEN1]; // farend windowed fft buffer
|
||||
|
||||
float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1]; // far, near and error psd
|
||||
float hNs[PART_LEN1];
|
||||
float hNlFbMin, hNlFbLocalMin;
|
||||
float hNlXdAvgMin;
|
||||
int hNlNewMin, hNlMinCtr;
|
||||
float overDrive, overDriveSm;
|
||||
float targetSupp, minOverDrive;
|
||||
float outBuf[PART_LEN];
|
||||
int delayIdx;
|
||||
|
||||
short stNearState, echoState;
|
||||
short divergeState;
|
||||
|
||||
int xfBufBlockPos;
|
||||
|
||||
short farBuf[FILT_LEN2 * 2];
|
||||
|
||||
short mult; // sampling frequency multiple
|
||||
int sampFreq;
|
||||
WebRtc_UWord32 seed;
|
||||
|
||||
float mu; // stepsize
|
||||
float errThresh; // error threshold
|
||||
|
||||
int noiseEstCtr;
|
||||
|
||||
power_level_t farlevel;
|
||||
power_level_t nearlevel;
|
||||
power_level_t linoutlevel;
|
||||
power_level_t nlpoutlevel;
|
||||
|
||||
int metricsMode;
|
||||
int stateCounter;
|
||||
stats_t erl;
|
||||
stats_t erle;
|
||||
stats_t aNlp;
|
||||
stats_t rerl;
|
||||
|
||||
// Quantities to control H band scaling for SWB input
|
||||
int freq_avg_ic; //initial bin for averaging nlp gain
|
||||
int flag_Hband_cn; //for comfort noise
|
||||
float cn_scale_Hband; //scale for comfort noise in H band
|
||||
|
||||
int delay_histogram[kMaxDelay];
|
||||
int delay_logging_enabled;
|
||||
void* delay_estimator;
|
||||
|
||||
#ifdef AEC_DEBUG
|
||||
FILE *farFile;
|
||||
FILE *nearFile;
|
||||
FILE *outFile;
|
||||
FILE *outLpFile;
|
||||
#endif
|
||||
} aec_t;
|
||||
|
||||
typedef void (*WebRtcAec_FilterFar_t)(aec_t *aec, float yf[2][PART_LEN1]);
|
||||
extern WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
|
||||
typedef void (*WebRtcAec_ScaleErrorSignal_t)(aec_t *aec, float ef[2][PART_LEN1]);
|
||||
extern WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
|
||||
typedef void (*WebRtcAec_FilterAdaptation_t)
|
||||
(aec_t *aec, float *fft, float ef[2][PART_LEN1]);
|
||||
extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
|
||||
typedef void (*WebRtcAec_OverdriveAndSuppress_t)
|
||||
(aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]);
|
||||
extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
|
||||
|
||||
int WebRtcAec_CreateAec(aec_t **aec);
|
||||
int WebRtcAec_FreeAec(aec_t *aec);
|
||||
int WebRtcAec_InitAec(aec_t *aec, int sampFreq);
|
||||
AecCore* WebRtcAec_CreateAec(); // Returns NULL on error.
|
||||
void WebRtcAec_FreeAec(AecCore* aec);
|
||||
int WebRtcAec_InitAec(AecCore* aec, int sampFreq);
|
||||
void WebRtcAec_InitAec_SSE2(void);
|
||||
#if defined(MIPS_FPU_LE)
|
||||
void WebRtcAec_InitAec_mips(void);
|
||||
#endif
|
||||
#if defined(WEBRTC_DETECT_NEON) || defined(WEBRTC_HAS_NEON)
|
||||
void WebRtcAec_InitAec_neon(void);
|
||||
#endif
|
||||
|
||||
void WebRtcAec_InitMetrics(aec_t *aec);
|
||||
void WebRtcAec_ProcessFrame(aec_t *aec, const short *farend,
|
||||
const short *nearend, const short *nearendH,
|
||||
short *out, short *outH,
|
||||
int knownDelay);
|
||||
void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend);
|
||||
void WebRtcAec_ProcessFrames(AecCore* aec,
|
||||
const float* const* nearend,
|
||||
size_t num_bands,
|
||||
size_t num_samples,
|
||||
int knownDelay,
|
||||
float* const* out);
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
|
||||
// A helper function to call WebRtc_MoveReadPtr() for all far-end buffers.
|
||||
// Returns the number of elements moved, and adjusts |system_delay| by the
|
||||
// corresponding amount in ms.
|
||||
int WebRtcAec_MoveFarReadPtr(AecCore* aec, int elements);
|
||||
|
||||
// Calculates the median, standard deviation and amount of poor values among the
|
||||
// delay estimates aggregated up to the first call to the function. After that
|
||||
// first call the metrics are aggregated and updated every second. With poor
|
||||
// values we mean values that most likely will cause the AEC to perform poorly.
|
||||
// TODO(bjornv): Consider changing tests and tools to handle constant
|
||||
// constant aggregation window throughout the session instead.
|
||||
int WebRtcAec_GetDelayMetricsCore(AecCore* self, int* median, int* std,
|
||||
float* fraction_poor_delays);
|
||||
|
||||
// Returns the echo state (1: echo, 0: no echo).
|
||||
int WebRtcAec_echo_state(AecCore* self);
|
||||
|
||||
// Gets statistics of the echo metrics ERL, ERLE, A_NLP.
|
||||
void WebRtcAec_GetEchoStats(AecCore* self,
|
||||
Stats* erl,
|
||||
Stats* erle,
|
||||
Stats* a_nlp);
|
||||
#ifdef WEBRTC_AEC_DEBUG_DUMP
|
||||
void* WebRtcAec_far_time_buf(AecCore* self);
|
||||
#endif
|
||||
|
||||
// Sets local configuration modes.
|
||||
void WebRtcAec_SetConfigCore(AecCore* self,
|
||||
int nlp_mode,
|
||||
int metrics_mode,
|
||||
int delay_logging);
|
||||
|
||||
// Non-zero enables, zero disables.
|
||||
void WebRtcAec_enable_delay_agnostic(AecCore* self, int enable);
|
||||
|
||||
// Returns non-zero if delay agnostic (i.e., signal based delay estimation) is
|
||||
// enabled and zero if disabled.
|
||||
int WebRtcAec_delay_agnostic_enabled(AecCore* self);
|
||||
|
||||
// Enables or disables extended filter mode. Non-zero enables, zero disables.
|
||||
void WebRtcAec_enable_extended_filter(AecCore* self, int enable);
|
||||
|
||||
// Returns non-zero if extended filter mode is enabled and zero if disabled.
|
||||
int WebRtcAec_extended_filter_enabled(AecCore* self);
|
||||
|
||||
// Returns the current |system_delay|, i.e., the buffered difference between
|
||||
// far-end and near-end.
|
||||
int WebRtcAec_system_delay(AecCore* self);
|
||||
|
||||
// Sets the |system_delay| to |value|. Note that if the value is changed
|
||||
// improperly, there can be a performance regression. So it should be used with
|
||||
// care.
|
||||
void WebRtcAec_SetSystemDelay(AecCore* self, int delay);
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
|
||||
|
202
webrtc/modules/audio_processing/aec/aec_core_internal.h
Normal file
202
webrtc/modules/audio_processing/aec/aec_core_internal.h
Normal file
@ -0,0 +1,202 @@
|
||||
/*
|
||||
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
|
||||
|
||||
#include "webrtc/common_audio/ring_buffer.h"
|
||||
#include "webrtc/common_audio/wav_file.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_common.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_core.h"
|
||||
#include "webrtc/typedefs.h"
|
||||
|
||||
// Number of partitions for the extended filter mode. The first one is an enum
|
||||
// to be used in array declarations, as it represents the maximum filter length.
|
||||
enum {
|
||||
kExtendedNumPartitions = 32
|
||||
};
|
||||
static const int kNormalNumPartitions = 12;
|
||||
|
||||
// Delay estimator constants, used for logging and delay compensation if
|
||||
// if reported delays are disabled.
|
||||
enum {
|
||||
kLookaheadBlocks = 15
|
||||
};
|
||||
enum {
|
||||
// 500 ms for 16 kHz which is equivalent with the limit of reported delays.
|
||||
kHistorySizeBlocks = 125
|
||||
};
|
||||
|
||||
// Extended filter adaptation parameters.
|
||||
// TODO(ajm): No narrowband tuning yet.
|
||||
static const float kExtendedMu = 0.4f;
|
||||
static const float kExtendedErrorThreshold = 1.0e-6f;
|
||||
|
||||
typedef struct PowerLevel {
|
||||
float sfrsum;
|
||||
int sfrcounter;
|
||||
float framelevel;
|
||||
float frsum;
|
||||
int frcounter;
|
||||
float minlevel;
|
||||
float averagelevel;
|
||||
} PowerLevel;
|
||||
|
||||
struct AecCore {
|
||||
int farBufWritePos, farBufReadPos;
|
||||
|
||||
int knownDelay;
|
||||
int inSamples, outSamples;
|
||||
int delayEstCtr;
|
||||
|
||||
RingBuffer* nearFrBuf;
|
||||
RingBuffer* outFrBuf;
|
||||
|
||||
RingBuffer* nearFrBufH[NUM_HIGH_BANDS_MAX];
|
||||
RingBuffer* outFrBufH[NUM_HIGH_BANDS_MAX];
|
||||
|
||||
float dBuf[PART_LEN2]; // nearend
|
||||
float eBuf[PART_LEN2]; // error
|
||||
|
||||
float dBufH[NUM_HIGH_BANDS_MAX][PART_LEN2]; // nearend
|
||||
|
||||
float xPow[PART_LEN1];
|
||||
float dPow[PART_LEN1];
|
||||
float dMinPow[PART_LEN1];
|
||||
float dInitMinPow[PART_LEN1];
|
||||
float* noisePow;
|
||||
|
||||
float xfBuf[2][kExtendedNumPartitions * PART_LEN1]; // farend fft buffer
|
||||
float wfBuf[2][kExtendedNumPartitions * PART_LEN1]; // filter fft
|
||||
complex_t sde[PART_LEN1]; // cross-psd of nearend and error
|
||||
complex_t sxd[PART_LEN1]; // cross-psd of farend and nearend
|
||||
// Farend windowed fft buffer.
|
||||
complex_t xfwBuf[kExtendedNumPartitions * PART_LEN1];
|
||||
|
||||
float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1]; // far, near, error psd
|
||||
float hNs[PART_LEN1];
|
||||
float hNlFbMin, hNlFbLocalMin;
|
||||
float hNlXdAvgMin;
|
||||
int hNlNewMin, hNlMinCtr;
|
||||
float overDrive, overDriveSm;
|
||||
int nlp_mode;
|
||||
float outBuf[PART_LEN];
|
||||
int delayIdx;
|
||||
|
||||
short stNearState, echoState;
|
||||
short divergeState;
|
||||
|
||||
int xfBufBlockPos;
|
||||
|
||||
RingBuffer* far_buf;
|
||||
RingBuffer* far_buf_windowed;
|
||||
int system_delay; // Current system delay buffered in AEC.
|
||||
|
||||
int mult; // sampling frequency multiple
|
||||
int sampFreq;
|
||||
size_t num_bands;
|
||||
uint32_t seed;
|
||||
|
||||
float normal_mu; // stepsize
|
||||
float normal_error_threshold; // error threshold
|
||||
|
||||
int noiseEstCtr;
|
||||
|
||||
PowerLevel farlevel;
|
||||
PowerLevel nearlevel;
|
||||
PowerLevel linoutlevel;
|
||||
PowerLevel nlpoutlevel;
|
||||
|
||||
int metricsMode;
|
||||
int stateCounter;
|
||||
Stats erl;
|
||||
Stats erle;
|
||||
Stats aNlp;
|
||||
Stats rerl;
|
||||
|
||||
// Quantities to control H band scaling for SWB input
|
||||
int freq_avg_ic; // initial bin for averaging nlp gain
|
||||
int flag_Hband_cn; // for comfort noise
|
||||
float cn_scale_Hband; // scale for comfort noise in H band
|
||||
|
||||
int delay_metrics_delivered;
|
||||
int delay_histogram[kHistorySizeBlocks];
|
||||
int num_delay_values;
|
||||
int delay_median;
|
||||
int delay_std;
|
||||
float fraction_poor_delays;
|
||||
int delay_logging_enabled;
|
||||
void* delay_estimator_farend;
|
||||
void* delay_estimator;
|
||||
// Variables associated with delay correction through signal based delay
|
||||
// estimation feedback.
|
||||
int signal_delay_correction;
|
||||
int previous_delay;
|
||||
int delay_correction_count;
|
||||
int shift_offset;
|
||||
float delay_quality_threshold;
|
||||
int frame_count;
|
||||
|
||||
// 0 = delay agnostic mode (signal based delay correction) disabled.
|
||||
// Otherwise enabled.
|
||||
int delay_agnostic_enabled;
|
||||
// 1 = extended filter mode enabled, 0 = disabled.
|
||||
int extended_filter_enabled;
|
||||
// Runtime selection of number of filter partitions.
|
||||
int num_partitions;
|
||||
|
||||
#ifdef WEBRTC_AEC_DEBUG_DUMP
|
||||
// Sequence number of this AEC instance, so that different instances can
|
||||
// choose different dump file names.
|
||||
int instance_index;
|
||||
|
||||
// Number of times we've restarted dumping; used to pick new dump file names
|
||||
// each time.
|
||||
int debug_dump_count;
|
||||
|
||||
RingBuffer* far_time_buf;
|
||||
rtc_WavWriter* farFile;
|
||||
rtc_WavWriter* nearFile;
|
||||
rtc_WavWriter* outFile;
|
||||
rtc_WavWriter* outLinearFile;
|
||||
FILE* e_fft_file;
|
||||
#endif
|
||||
};
|
||||
|
||||
typedef void (*WebRtcAecFilterFar)(AecCore* aec, float yf[2][PART_LEN1]);
|
||||
extern WebRtcAecFilterFar WebRtcAec_FilterFar;
|
||||
typedef void (*WebRtcAecScaleErrorSignal)(AecCore* aec, float ef[2][PART_LEN1]);
|
||||
extern WebRtcAecScaleErrorSignal WebRtcAec_ScaleErrorSignal;
|
||||
typedef void (*WebRtcAecFilterAdaptation)(AecCore* aec,
|
||||
float* fft,
|
||||
float ef[2][PART_LEN1]);
|
||||
extern WebRtcAecFilterAdaptation WebRtcAec_FilterAdaptation;
|
||||
typedef void (*WebRtcAecOverdriveAndSuppress)(AecCore* aec,
|
||||
float hNl[PART_LEN1],
|
||||
const float hNlFb,
|
||||
float efw[2][PART_LEN1]);
|
||||
extern WebRtcAecOverdriveAndSuppress WebRtcAec_OverdriveAndSuppress;
|
||||
|
||||
typedef void (*WebRtcAecComfortNoise)(AecCore* aec,
|
||||
float efw[2][PART_LEN1],
|
||||
complex_t* comfortNoiseHband,
|
||||
const float* noisePow,
|
||||
const float* lambda);
|
||||
extern WebRtcAecComfortNoise WebRtcAec_ComfortNoise;
|
||||
|
||||
typedef void (*WebRtcAecSubBandCoherence)(AecCore* aec,
|
||||
float efw[2][PART_LEN1],
|
||||
float xfw[2][PART_LEN1],
|
||||
float* fft,
|
||||
float* cohde,
|
||||
float* cohxd);
|
||||
extern WebRtcAecSubBandCoherence WebRtcAec_SubbandCoherence;
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
|
774
webrtc/modules/audio_processing/aec/aec_core_mips.c
Normal file
774
webrtc/modules/audio_processing/aec/aec_core_mips.c
Normal file
@ -0,0 +1,774 @@
|
||||
/*
|
||||
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The core AEC algorithm, which is presented with time-aligned signals.
|
||||
*/
|
||||
|
||||
#include "webrtc/modules/audio_processing/aec/aec_core.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
||||
|
||||
static const int flagHbandCn = 1; // flag for adding comfort noise in H band
|
||||
extern const float WebRtcAec_weightCurve[65];
|
||||
extern const float WebRtcAec_overDriveCurve[65];
|
||||
|
||||
void WebRtcAec_ComfortNoise_mips(AecCore* aec,
|
||||
float efw[2][PART_LEN1],
|
||||
complex_t* comfortNoiseHband,
|
||||
const float* noisePow,
|
||||
const float* lambda) {
|
||||
int i, num;
|
||||
float rand[PART_LEN];
|
||||
float noise, noiseAvg, tmp, tmpAvg;
|
||||
int16_t randW16[PART_LEN];
|
||||
complex_t u[PART_LEN1];
|
||||
|
||||
const float pi2 = 6.28318530717959f;
|
||||
const float pi2t = pi2 / 32768;
|
||||
|
||||
// Generate a uniform random array on [0 1]
|
||||
WebRtcSpl_RandUArray(randW16, PART_LEN, &aec->seed);
|
||||
|
||||
int16_t* randWptr = randW16;
|
||||
float randTemp, randTemp2, randTemp3, randTemp4;
|
||||
int32_t tmp1s, tmp2s, tmp3s, tmp4s;
|
||||
|
||||
for (i = 0; i < PART_LEN; i+=4) {
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"lh %[tmp1s], 0(%[randWptr]) \n\t"
|
||||
"lh %[tmp2s], 2(%[randWptr]) \n\t"
|
||||
"lh %[tmp3s], 4(%[randWptr]) \n\t"
|
||||
"lh %[tmp4s], 6(%[randWptr]) \n\t"
|
||||
"mtc1 %[tmp1s], %[randTemp] \n\t"
|
||||
"mtc1 %[tmp2s], %[randTemp2] \n\t"
|
||||
"mtc1 %[tmp3s], %[randTemp3] \n\t"
|
||||
"mtc1 %[tmp4s], %[randTemp4] \n\t"
|
||||
"cvt.s.w %[randTemp], %[randTemp] \n\t"
|
||||
"cvt.s.w %[randTemp2], %[randTemp2] \n\t"
|
||||
"cvt.s.w %[randTemp3], %[randTemp3] \n\t"
|
||||
"cvt.s.w %[randTemp4], %[randTemp4] \n\t"
|
||||
"addiu %[randWptr], %[randWptr], 8 \n\t"
|
||||
"mul.s %[randTemp], %[randTemp], %[pi2t] \n\t"
|
||||
"mul.s %[randTemp2], %[randTemp2], %[pi2t] \n\t"
|
||||
"mul.s %[randTemp3], %[randTemp3], %[pi2t] \n\t"
|
||||
"mul.s %[randTemp4], %[randTemp4], %[pi2t] \n\t"
|
||||
".set pop \n\t"
|
||||
: [randWptr] "+r" (randWptr), [randTemp] "=&f" (randTemp),
|
||||
[randTemp2] "=&f" (randTemp2), [randTemp3] "=&f" (randTemp3),
|
||||
[randTemp4] "=&f" (randTemp4), [tmp1s] "=&r" (tmp1s),
|
||||
[tmp2s] "=&r" (tmp2s), [tmp3s] "=&r" (tmp3s),
|
||||
[tmp4s] "=&r" (tmp4s)
|
||||
: [pi2t] "f" (pi2t)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
u[i+1][0] = cosf(randTemp);
|
||||
u[i+1][1] = sinf(randTemp);
|
||||
u[i+2][0] = cosf(randTemp2);
|
||||
u[i+2][1] = sinf(randTemp2);
|
||||
u[i+3][0] = cosf(randTemp3);
|
||||
u[i+3][1] = sinf(randTemp3);
|
||||
u[i+4][0] = cosf(randTemp4);
|
||||
u[i+4][1] = sinf(randTemp4);
|
||||
}
|
||||
|
||||
// Reject LF noise
|
||||
float* u_ptr = &u[1][0];
|
||||
float noise2, noise3, noise4;
|
||||
float tmp1f, tmp2f, tmp3f, tmp4f, tmp5f, tmp6f, tmp7f, tmp8f;
|
||||
|
||||
u[0][0] = 0;
|
||||
u[0][1] = 0;
|
||||
for (i = 1; i < PART_LEN1; i+=4) {
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"lwc1 %[noise], 4(%[noisePow]) \n\t"
|
||||
"lwc1 %[noise2], 8(%[noisePow]) \n\t"
|
||||
"lwc1 %[noise3], 12(%[noisePow]) \n\t"
|
||||
"lwc1 %[noise4], 16(%[noisePow]) \n\t"
|
||||
"sqrt.s %[noise], %[noise] \n\t"
|
||||
"sqrt.s %[noise2], %[noise2] \n\t"
|
||||
"sqrt.s %[noise3], %[noise3] \n\t"
|
||||
"sqrt.s %[noise4], %[noise4] \n\t"
|
||||
"lwc1 %[tmp1f], 0(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp2f], 4(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp3f], 8(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp4f], 12(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp5f], 16(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp6f], 20(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp7f], 24(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp8f], 28(%[u_ptr]) \n\t"
|
||||
"addiu %[noisePow], %[noisePow], 16 \n\t"
|
||||
"mul.s %[tmp1f], %[tmp1f], %[noise] \n\t"
|
||||
"mul.s %[tmp2f], %[tmp2f], %[noise] \n\t"
|
||||
"mul.s %[tmp3f], %[tmp3f], %[noise2] \n\t"
|
||||
"mul.s %[tmp4f], %[tmp4f], %[noise2] \n\t"
|
||||
"mul.s %[tmp5f], %[tmp5f], %[noise3] \n\t"
|
||||
"mul.s %[tmp6f], %[tmp6f], %[noise3] \n\t"
|
||||
"swc1 %[tmp1f], 0(%[u_ptr]) \n\t"
|
||||
"swc1 %[tmp3f], 8(%[u_ptr]) \n\t"
|
||||
"mul.s %[tmp8f], %[tmp8f], %[noise4] \n\t"
|
||||
"mul.s %[tmp7f], %[tmp7f], %[noise4] \n\t"
|
||||
"neg.s %[tmp2f] \n\t"
|
||||
"neg.s %[tmp4f] \n\t"
|
||||
"neg.s %[tmp6f] \n\t"
|
||||
"neg.s %[tmp8f] \n\t"
|
||||
"swc1 %[tmp5f], 16(%[u_ptr]) \n\t"
|
||||
"swc1 %[tmp7f], 24(%[u_ptr]) \n\t"
|
||||
"swc1 %[tmp2f], 4(%[u_ptr]) \n\t"
|
||||
"swc1 %[tmp4f], 12(%[u_ptr]) \n\t"
|
||||
"swc1 %[tmp6f], 20(%[u_ptr]) \n\t"
|
||||
"swc1 %[tmp8f], 28(%[u_ptr]) \n\t"
|
||||
"addiu %[u_ptr], %[u_ptr], 32 \n\t"
|
||||
".set pop \n\t"
|
||||
: [u_ptr] "+r" (u_ptr), [noisePow] "+r" (noisePow),
|
||||
[noise] "=&f" (noise), [noise2] "=&f" (noise2),
|
||||
[noise3] "=&f" (noise3), [noise4] "=&f" (noise4),
|
||||
[tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f),
|
||||
[tmp3f] "=&f" (tmp3f), [tmp4f] "=&f" (tmp4f),
|
||||
[tmp5f] "=&f" (tmp5f), [tmp6f] "=&f" (tmp6f),
|
||||
[tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f)
|
||||
:
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
u[PART_LEN][1] = 0;
|
||||
noisePow -= PART_LEN;
|
||||
|
||||
u_ptr = &u[0][0];
|
||||
float* u_ptr_end = &u[PART_LEN][0];
|
||||
float* efw_ptr_0 = &efw[0][0];
|
||||
float* efw_ptr_1 = &efw[1][0];
|
||||
float tmp9f, tmp10f;
|
||||
const float tmp1c = 1.0;
|
||||
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"1: \n\t"
|
||||
"lwc1 %[tmp1f], 0(%[lambda]) \n\t"
|
||||
"lwc1 %[tmp6f], 4(%[lambda]) \n\t"
|
||||
"addiu %[lambda], %[lambda], 8 \n\t"
|
||||
"c.lt.s %[tmp1f], %[tmp1c] \n\t"
|
||||
"bc1f 4f \n\t"
|
||||
" nop \n\t"
|
||||
"c.lt.s %[tmp6f], %[tmp1c] \n\t"
|
||||
"bc1f 3f \n\t"
|
||||
" nop \n\t"
|
||||
"2: \n\t"
|
||||
"mul.s %[tmp1f], %[tmp1f], %[tmp1f] \n\t"
|
||||
"mul.s %[tmp6f], %[tmp6f], %[tmp6f] \n\t"
|
||||
"sub.s %[tmp1f], %[tmp1c], %[tmp1f] \n\t"
|
||||
"sub.s %[tmp6f], %[tmp1c], %[tmp6f] \n\t"
|
||||
"sqrt.s %[tmp1f], %[tmp1f] \n\t"
|
||||
"sqrt.s %[tmp6f], %[tmp6f] \n\t"
|
||||
"lwc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
|
||||
"lwc1 %[tmp3f], 0(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
|
||||
"lwc1 %[tmp8f], 8(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
|
||||
"lwc1 %[tmp5f], 4(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
|
||||
"lwc1 %[tmp10f], 12(%[u_ptr]) \n\t"
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
"mul.s %[tmp3f], %[tmp1f], %[tmp3f] \n\t"
|
||||
"add.s %[tmp2f], %[tmp2f], %[tmp3f] \n\t"
|
||||
"mul.s %[tmp3f], %[tmp1f], %[tmp5f] \n\t"
|
||||
"add.s %[tmp4f], %[tmp4f], %[tmp3f] \n\t"
|
||||
"mul.s %[tmp3f], %[tmp6f], %[tmp8f] \n\t"
|
||||
"add.s %[tmp7f], %[tmp7f], %[tmp3f] \n\t"
|
||||
"mul.s %[tmp3f], %[tmp6f], %[tmp10f] \n\t"
|
||||
"add.s %[tmp9f], %[tmp9f], %[tmp3f] \n\t"
|
||||
#else // #if !defined(MIPS32_R2_LE)
|
||||
"madd.s %[tmp2f], %[tmp2f], %[tmp1f], %[tmp3f] \n\t"
|
||||
"madd.s %[tmp4f], %[tmp4f], %[tmp1f], %[tmp5f] \n\t"
|
||||
"madd.s %[tmp7f], %[tmp7f], %[tmp6f], %[tmp8f] \n\t"
|
||||
"madd.s %[tmp9f], %[tmp9f], %[tmp6f], %[tmp10f] \n\t"
|
||||
#endif // #if !defined(MIPS32_R2_LE)
|
||||
"swc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
|
||||
"swc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
|
||||
"swc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
|
||||
"b 5f \n\t"
|
||||
" swc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
|
||||
"3: \n\t"
|
||||
"mul.s %[tmp1f], %[tmp1f], %[tmp1f] \n\t"
|
||||
"sub.s %[tmp1f], %[tmp1c], %[tmp1f] \n\t"
|
||||
"sqrt.s %[tmp1f], %[tmp1f] \n\t"
|
||||
"lwc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
|
||||
"lwc1 %[tmp3f], 0(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
|
||||
"lwc1 %[tmp5f], 4(%[u_ptr]) \n\t"
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
"mul.s %[tmp3f], %[tmp1f], %[tmp3f] \n\t"
|
||||
"add.s %[tmp2f], %[tmp2f], %[tmp3f] \n\t"
|
||||
"mul.s %[tmp3f], %[tmp1f], %[tmp5f] \n\t"
|
||||
"add.s %[tmp4f], %[tmp4f], %[tmp3f] \n\t"
|
||||
#else // #if !defined(MIPS32_R2_LE)
|
||||
"madd.s %[tmp2f], %[tmp2f], %[tmp1f], %[tmp3f] \n\t"
|
||||
"madd.s %[tmp4f], %[tmp4f], %[tmp1f], %[tmp5f] \n\t"
|
||||
#endif // #if !defined(MIPS32_R2_LE)
|
||||
"swc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
|
||||
"b 5f \n\t"
|
||||
" swc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
|
||||
"4: \n\t"
|
||||
"c.lt.s %[tmp6f], %[tmp1c] \n\t"
|
||||
"bc1f 5f \n\t"
|
||||
" nop \n\t"
|
||||
"mul.s %[tmp6f], %[tmp6f], %[tmp6f] \n\t"
|
||||
"sub.s %[tmp6f], %[tmp1c], %[tmp6f] \n\t"
|
||||
"sqrt.s %[tmp6f], %[tmp6f] \n\t"
|
||||
"lwc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
|
||||
"lwc1 %[tmp8f], 8(%[u_ptr]) \n\t"
|
||||
"lwc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
|
||||
"lwc1 %[tmp10f], 12(%[u_ptr]) \n\t"
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
"mul.s %[tmp3f], %[tmp6f], %[tmp8f] \n\t"
|
||||
"add.s %[tmp7f], %[tmp7f], %[tmp3f] \n\t"
|
||||
"mul.s %[tmp3f], %[tmp6f], %[tmp10f] \n\t"
|
||||
"add.s %[tmp9f], %[tmp9f], %[tmp3f] \n\t"
|
||||
#else // #if !defined(MIPS32_R2_LE)
|
||||
"madd.s %[tmp7f], %[tmp7f], %[tmp6f], %[tmp8f] \n\t"
|
||||
"madd.s %[tmp9f], %[tmp9f], %[tmp6f], %[tmp10f] \n\t"
|
||||
#endif // #if !defined(MIPS32_R2_LE)
|
||||
"swc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
|
||||
"swc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
|
||||
"5: \n\t"
|
||||
"addiu %[u_ptr], %[u_ptr], 16 \n\t"
|
||||
"addiu %[efw_ptr_0], %[efw_ptr_0], 8 \n\t"
|
||||
"bne %[u_ptr], %[u_ptr_end], 1b \n\t"
|
||||
" addiu %[efw_ptr_1], %[efw_ptr_1], 8 \n\t"
|
||||
".set pop \n\t"
|
||||
: [lambda] "+r" (lambda), [u_ptr] "+r" (u_ptr),
|
||||
[efw_ptr_0] "+r" (efw_ptr_0), [efw_ptr_1] "+r" (efw_ptr_1),
|
||||
[tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f), [tmp3f] "=&f" (tmp3f),
|
||||
[tmp4f] "=&f" (tmp4f), [tmp5f] "=&f" (tmp5f),
|
||||
[tmp6f] "=&f" (tmp6f), [tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f),
|
||||
[tmp9f] "=&f" (tmp9f), [tmp10f] "=&f" (tmp10f)
|
||||
: [tmp1c] "f" (tmp1c), [u_ptr_end] "r" (u_ptr_end)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
lambda -= PART_LEN;
|
||||
tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[PART_LEN] * lambda[PART_LEN], 0));
|
||||
//tmp = 1 - lambda[i];
|
||||
efw[0][PART_LEN] += tmp * u[PART_LEN][0];
|
||||
efw[1][PART_LEN] += tmp * u[PART_LEN][1];
|
||||
|
||||
// For H band comfort noise
|
||||
// TODO: don't compute noise and "tmp" twice. Use the previous results.
|
||||
noiseAvg = 0.0;
|
||||
tmpAvg = 0.0;
|
||||
num = 0;
|
||||
if ((aec->sampFreq == 32000 || aec->sampFreq == 48000) && flagHbandCn == 1) {
|
||||
for (i = 0; i < PART_LEN; i++) {
|
||||
rand[i] = ((float)randW16[i]) / 32768;
|
||||
}
|
||||
|
||||
// average noise scale
|
||||
// average over second half of freq spectrum (i.e., 4->8khz)
|
||||
// TODO: we shouldn't need num. We know how many elements we're summing.
|
||||
for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
|
||||
num++;
|
||||
noiseAvg += sqrtf(noisePow[i]);
|
||||
}
|
||||
noiseAvg /= (float)num;
|
||||
|
||||
// average nlp scale
|
||||
// average over second half of freq spectrum (i.e., 4->8khz)
|
||||
// TODO: we shouldn't need num. We know how many elements we're summing.
|
||||
num = 0;
|
||||
for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
|
||||
num++;
|
||||
tmpAvg += sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
|
||||
}
|
||||
tmpAvg /= (float)num;
|
||||
|
||||
// Use average noise for H band
|
||||
// TODO: we should probably have a new random vector here.
|
||||
// Reject LF noise
|
||||
u[0][0] = 0;
|
||||
u[0][1] = 0;
|
||||
for (i = 1; i < PART_LEN1; i++) {
|
||||
tmp = pi2 * rand[i - 1];
|
||||
|
||||
// Use average noise for H band
|
||||
u[i][0] = noiseAvg * (float)cos(tmp);
|
||||
u[i][1] = -noiseAvg * (float)sin(tmp);
|
||||
}
|
||||
u[PART_LEN][1] = 0;
|
||||
|
||||
for (i = 0; i < PART_LEN1; i++) {
|
||||
// Use average NLP weight for H band
|
||||
comfortNoiseHband[i][0] = tmpAvg * u[i][0];
|
||||
comfortNoiseHband[i][1] = tmpAvg * u[i][1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAec_FilterFar_mips(AecCore* aec, float yf[2][PART_LEN1]) {
|
||||
int i;
|
||||
for (i = 0; i < aec->num_partitions; i++) {
|
||||
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
|
||||
int pos = i * PART_LEN1;
|
||||
// Check for wrap
|
||||
if (i + aec->xfBufBlockPos >= aec->num_partitions) {
|
||||
xPos -= aec->num_partitions * (PART_LEN1);
|
||||
}
|
||||
float* yf0 = yf[0];
|
||||
float* yf1 = yf[1];
|
||||
float* aRe = aec->xfBuf[0] + xPos;
|
||||
float* aIm = aec->xfBuf[1] + xPos;
|
||||
float* bRe = aec->wfBuf[0] + pos;
|
||||
float* bIm = aec->wfBuf[1] + pos;
|
||||
float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13;
|
||||
int len = PART_LEN1 >> 1;
|
||||
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"1: \n\t"
|
||||
"lwc1 %[f0], 0(%[aRe]) \n\t"
|
||||
"lwc1 %[f1], 0(%[bRe]) \n\t"
|
||||
"lwc1 %[f2], 0(%[bIm]) \n\t"
|
||||
"lwc1 %[f3], 0(%[aIm]) \n\t"
|
||||
"lwc1 %[f4], 4(%[aRe]) \n\t"
|
||||
"lwc1 %[f5], 4(%[bRe]) \n\t"
|
||||
"lwc1 %[f6], 4(%[bIm]) \n\t"
|
||||
"mul.s %[f8], %[f0], %[f1] \n\t"
|
||||
"mul.s %[f0], %[f0], %[f2] \n\t"
|
||||
"mul.s %[f9], %[f4], %[f5] \n\t"
|
||||
"mul.s %[f4], %[f4], %[f6] \n\t"
|
||||
"lwc1 %[f7], 4(%[aIm]) \n\t"
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
"mul.s %[f12], %[f2], %[f3] \n\t"
|
||||
"mul.s %[f1], %[f3], %[f1] \n\t"
|
||||
"mul.s %[f11], %[f6], %[f7] \n\t"
|
||||
"addiu %[aRe], %[aRe], 8 \n\t"
|
||||
"addiu %[aIm], %[aIm], 8 \n\t"
|
||||
"addiu %[len], %[len], -1 \n\t"
|
||||
"sub.s %[f8], %[f8], %[f12] \n\t"
|
||||
"mul.s %[f12], %[f7], %[f5] \n\t"
|
||||
"lwc1 %[f2], 0(%[yf0]) \n\t"
|
||||
"add.s %[f1], %[f0], %[f1] \n\t"
|
||||
"lwc1 %[f3], 0(%[yf1]) \n\t"
|
||||
"sub.s %[f9], %[f9], %[f11] \n\t"
|
||||
"lwc1 %[f6], 4(%[yf0]) \n\t"
|
||||
"add.s %[f4], %[f4], %[f12] \n\t"
|
||||
#else // #if !defined(MIPS32_R2_LE)
|
||||
"addiu %[aRe], %[aRe], 8 \n\t"
|
||||
"addiu %[aIm], %[aIm], 8 \n\t"
|
||||
"addiu %[len], %[len], -1 \n\t"
|
||||
"nmsub.s %[f8], %[f8], %[f2], %[f3] \n\t"
|
||||
"lwc1 %[f2], 0(%[yf0]) \n\t"
|
||||
"madd.s %[f1], %[f0], %[f3], %[f1] \n\t"
|
||||
"lwc1 %[f3], 0(%[yf1]) \n\t"
|
||||
"nmsub.s %[f9], %[f9], %[f6], %[f7] \n\t"
|
||||
"lwc1 %[f6], 4(%[yf0]) \n\t"
|
||||
"madd.s %[f4], %[f4], %[f7], %[f5] \n\t"
|
||||
#endif // #if !defined(MIPS32_R2_LE)
|
||||
"lwc1 %[f5], 4(%[yf1]) \n\t"
|
||||
"add.s %[f2], %[f2], %[f8] \n\t"
|
||||
"addiu %[bRe], %[bRe], 8 \n\t"
|
||||
"addiu %[bIm], %[bIm], 8 \n\t"
|
||||
"add.s %[f3], %[f3], %[f1] \n\t"
|
||||
"add.s %[f6], %[f6], %[f9] \n\t"
|
||||
"add.s %[f5], %[f5], %[f4] \n\t"
|
||||
"swc1 %[f2], 0(%[yf0]) \n\t"
|
||||
"swc1 %[f3], 0(%[yf1]) \n\t"
|
||||
"swc1 %[f6], 4(%[yf0]) \n\t"
|
||||
"swc1 %[f5], 4(%[yf1]) \n\t"
|
||||
"addiu %[yf0], %[yf0], 8 \n\t"
|
||||
"bgtz %[len], 1b \n\t"
|
||||
" addiu %[yf1], %[yf1], 8 \n\t"
|
||||
"lwc1 %[f0], 0(%[aRe]) \n\t"
|
||||
"lwc1 %[f1], 0(%[bRe]) \n\t"
|
||||
"lwc1 %[f2], 0(%[bIm]) \n\t"
|
||||
"lwc1 %[f3], 0(%[aIm]) \n\t"
|
||||
"mul.s %[f8], %[f0], %[f1] \n\t"
|
||||
"mul.s %[f0], %[f0], %[f2] \n\t"
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
"mul.s %[f12], %[f2], %[f3] \n\t"
|
||||
"mul.s %[f1], %[f3], %[f1] \n\t"
|
||||
"sub.s %[f8], %[f8], %[f12] \n\t"
|
||||
"lwc1 %[f2], 0(%[yf0]) \n\t"
|
||||
"add.s %[f1], %[f0], %[f1] \n\t"
|
||||
"lwc1 %[f3], 0(%[yf1]) \n\t"
|
||||
#else // #if !defined(MIPS32_R2_LE)
|
||||
"nmsub.s %[f8], %[f8], %[f2], %[f3] \n\t"
|
||||
"lwc1 %[f2], 0(%[yf0]) \n\t"
|
||||
"madd.s %[f1], %[f0], %[f3], %[f1] \n\t"
|
||||
"lwc1 %[f3], 0(%[yf1]) \n\t"
|
||||
#endif // #if !defined(MIPS32_R2_LE)
|
||||
"add.s %[f2], %[f2], %[f8] \n\t"
|
||||
"add.s %[f3], %[f3], %[f1] \n\t"
|
||||
"swc1 %[f2], 0(%[yf0]) \n\t"
|
||||
"swc1 %[f3], 0(%[yf1]) \n\t"
|
||||
".set pop \n\t"
|
||||
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
|
||||
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
|
||||
[f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
|
||||
[f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
|
||||
[f12] "=&f" (f12), [f13] "=&f" (f13), [aRe] "+r" (aRe),
|
||||
[aIm] "+r" (aIm), [bRe] "+r" (bRe), [bIm] "+r" (bIm),
|
||||
[yf0] "+r" (yf0), [yf1] "+r" (yf1), [len] "+r" (len)
|
||||
:
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAec_FilterAdaptation_mips(AecCore* aec,
|
||||
float* fft,
|
||||
float ef[2][PART_LEN1]) {
|
||||
int i;
|
||||
for (i = 0; i < aec->num_partitions; i++) {
|
||||
int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
|
||||
int pos;
|
||||
// Check for wrap
|
||||
if (i + aec->xfBufBlockPos >= aec->num_partitions) {
|
||||
xPos -= aec->num_partitions * PART_LEN1;
|
||||
}
|
||||
|
||||
pos = i * PART_LEN1;
|
||||
float* aRe = aec->xfBuf[0] + xPos;
|
||||
float* aIm = aec->xfBuf[1] + xPos;
|
||||
float* bRe = ef[0];
|
||||
float* bIm = ef[1];
|
||||
float* fft_tmp;
|
||||
|
||||
float f0, f1, f2, f3, f4, f5, f6 ,f7, f8, f9, f10, f11, f12;
|
||||
int len = PART_LEN >> 1;
|
||||
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"addiu %[fft_tmp], %[fft], 0 \n\t"
|
||||
"1: \n\t"
|
||||
"lwc1 %[f0], 0(%[aRe]) \n\t"
|
||||
"lwc1 %[f1], 0(%[bRe]) \n\t"
|
||||
"lwc1 %[f2], 0(%[bIm]) \n\t"
|
||||
"lwc1 %[f4], 4(%[aRe]) \n\t"
|
||||
"lwc1 %[f5], 4(%[bRe]) \n\t"
|
||||
"lwc1 %[f6], 4(%[bIm]) \n\t"
|
||||
"addiu %[aRe], %[aRe], 8 \n\t"
|
||||
"addiu %[bRe], %[bRe], 8 \n\t"
|
||||
"mul.s %[f8], %[f0], %[f1] \n\t"
|
||||
"mul.s %[f0], %[f0], %[f2] \n\t"
|
||||
"lwc1 %[f3], 0(%[aIm]) \n\t"
|
||||
"mul.s %[f9], %[f4], %[f5] \n\t"
|
||||
"lwc1 %[f7], 4(%[aIm]) \n\t"
|
||||
"mul.s %[f4], %[f4], %[f6] \n\t"
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
"mul.s %[f10], %[f3], %[f2] \n\t"
|
||||
"mul.s %[f1], %[f3], %[f1] \n\t"
|
||||
"mul.s %[f11], %[f7], %[f6] \n\t"
|
||||
"mul.s %[f5], %[f7], %[f5] \n\t"
|
||||
"addiu %[aIm], %[aIm], 8 \n\t"
|
||||
"addiu %[bIm], %[bIm], 8 \n\t"
|
||||
"addiu %[len], %[len], -1 \n\t"
|
||||
"add.s %[f8], %[f8], %[f10] \n\t"
|
||||
"sub.s %[f1], %[f0], %[f1] \n\t"
|
||||
"add.s %[f9], %[f9], %[f11] \n\t"
|
||||
"sub.s %[f5], %[f4], %[f5] \n\t"
|
||||
#else // #if !defined(MIPS32_R2_LE)
|
||||
"addiu %[aIm], %[aIm], 8 \n\t"
|
||||
"addiu %[bIm], %[bIm], 8 \n\t"
|
||||
"addiu %[len], %[len], -1 \n\t"
|
||||
"madd.s %[f8], %[f8], %[f3], %[f2] \n\t"
|
||||
"nmsub.s %[f1], %[f0], %[f3], %[f1] \n\t"
|
||||
"madd.s %[f9], %[f9], %[f7], %[f6] \n\t"
|
||||
"nmsub.s %[f5], %[f4], %[f7], %[f5] \n\t"
|
||||
#endif // #if !defined(MIPS32_R2_LE)
|
||||
"swc1 %[f8], 0(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f1], 4(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f9], 8(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f5], 12(%[fft_tmp]) \n\t"
|
||||
"bgtz %[len], 1b \n\t"
|
||||
" addiu %[fft_tmp], %[fft_tmp], 16 \n\t"
|
||||
"lwc1 %[f0], 0(%[aRe]) \n\t"
|
||||
"lwc1 %[f1], 0(%[bRe]) \n\t"
|
||||
"lwc1 %[f2], 0(%[bIm]) \n\t"
|
||||
"lwc1 %[f3], 0(%[aIm]) \n\t"
|
||||
"mul.s %[f8], %[f0], %[f1] \n\t"
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
"mul.s %[f10], %[f3], %[f2] \n\t"
|
||||
"add.s %[f8], %[f8], %[f10] \n\t"
|
||||
#else // #if !defined(MIPS32_R2_LE)
|
||||
"madd.s %[f8], %[f8], %[f3], %[f2] \n\t"
|
||||
#endif // #if !defined(MIPS32_R2_LE)
|
||||
"swc1 %[f8], 4(%[fft]) \n\t"
|
||||
".set pop \n\t"
|
||||
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
|
||||
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
|
||||
[f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
|
||||
[f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
|
||||
[f12] "=&f" (f12), [aRe] "+r" (aRe), [aIm] "+r" (aIm),
|
||||
[bRe] "+r" (bRe), [bIm] "+r" (bIm), [fft_tmp] "=&r" (fft_tmp),
|
||||
[len] "+r" (len)
|
||||
: [fft] "r" (fft)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
aec_rdft_inverse_128(fft);
|
||||
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
|
||||
|
||||
// fft scaling
|
||||
{
|
||||
float scale = 2.0f / PART_LEN2;
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"addiu %[fft_tmp], %[fft], 0 \n\t"
|
||||
"addiu %[len], $zero, 8 \n\t"
|
||||
"1: \n\t"
|
||||
"addiu %[len], %[len], -1 \n\t"
|
||||
"lwc1 %[f0], 0(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f1], 4(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f2], 8(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f3], 12(%[fft_tmp]) \n\t"
|
||||
"mul.s %[f0], %[f0], %[scale] \n\t"
|
||||
"mul.s %[f1], %[f1], %[scale] \n\t"
|
||||
"mul.s %[f2], %[f2], %[scale] \n\t"
|
||||
"mul.s %[f3], %[f3], %[scale] \n\t"
|
||||
"lwc1 %[f4], 16(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f5], 20(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f6], 24(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f7], 28(%[fft_tmp]) \n\t"
|
||||
"mul.s %[f4], %[f4], %[scale] \n\t"
|
||||
"mul.s %[f5], %[f5], %[scale] \n\t"
|
||||
"mul.s %[f6], %[f6], %[scale] \n\t"
|
||||
"mul.s %[f7], %[f7], %[scale] \n\t"
|
||||
"swc1 %[f0], 0(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f1], 4(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f2], 8(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f3], 12(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f4], 16(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f5], 20(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f6], 24(%[fft_tmp]) \n\t"
|
||||
"swc1 %[f7], 28(%[fft_tmp]) \n\t"
|
||||
"bgtz %[len], 1b \n\t"
|
||||
" addiu %[fft_tmp], %[fft_tmp], 32 \n\t"
|
||||
".set pop \n\t"
|
||||
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
|
||||
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
|
||||
[f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
|
||||
[fft_tmp] "=&r" (fft_tmp)
|
||||
: [scale] "f" (scale), [fft] "r" (fft)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
aec_rdft_forward_128(fft);
|
||||
aRe = aec->wfBuf[0] + pos;
|
||||
aIm = aec->wfBuf[1] + pos;
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"addiu %[fft_tmp], %[fft], 0 \n\t"
|
||||
"addiu %[len], $zero, 31 \n\t"
|
||||
"lwc1 %[f0], 0(%[aRe]) \n\t"
|
||||
"lwc1 %[f1], 0(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f2], 256(%[aRe]) \n\t"
|
||||
"lwc1 %[f3], 4(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f4], 4(%[aRe]) \n\t"
|
||||
"lwc1 %[f5], 8(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f6], 4(%[aIm]) \n\t"
|
||||
"lwc1 %[f7], 12(%[fft_tmp]) \n\t"
|
||||
"add.s %[f0], %[f0], %[f1] \n\t"
|
||||
"add.s %[f2], %[f2], %[f3] \n\t"
|
||||
"add.s %[f4], %[f4], %[f5] \n\t"
|
||||
"add.s %[f6], %[f6], %[f7] \n\t"
|
||||
"addiu %[fft_tmp], %[fft_tmp], 16 \n\t"
|
||||
"swc1 %[f0], 0(%[aRe]) \n\t"
|
||||
"swc1 %[f2], 256(%[aRe]) \n\t"
|
||||
"swc1 %[f4], 4(%[aRe]) \n\t"
|
||||
"addiu %[aRe], %[aRe], 8 \n\t"
|
||||
"swc1 %[f6], 4(%[aIm]) \n\t"
|
||||
"addiu %[aIm], %[aIm], 8 \n\t"
|
||||
"1: \n\t"
|
||||
"lwc1 %[f0], 0(%[aRe]) \n\t"
|
||||
"lwc1 %[f1], 0(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f2], 0(%[aIm]) \n\t"
|
||||
"lwc1 %[f3], 4(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f4], 4(%[aRe]) \n\t"
|
||||
"lwc1 %[f5], 8(%[fft_tmp]) \n\t"
|
||||
"lwc1 %[f6], 4(%[aIm]) \n\t"
|
||||
"lwc1 %[f7], 12(%[fft_tmp]) \n\t"
|
||||
"add.s %[f0], %[f0], %[f1] \n\t"
|
||||
"add.s %[f2], %[f2], %[f3] \n\t"
|
||||
"add.s %[f4], %[f4], %[f5] \n\t"
|
||||
"add.s %[f6], %[f6], %[f7] \n\t"
|
||||
"addiu %[len], %[len], -1 \n\t"
|
||||
"addiu %[fft_tmp], %[fft_tmp], 16 \n\t"
|
||||
"swc1 %[f0], 0(%[aRe]) \n\t"
|
||||
"swc1 %[f2], 0(%[aIm]) \n\t"
|
||||
"swc1 %[f4], 4(%[aRe]) \n\t"
|
||||
"addiu %[aRe], %[aRe], 8 \n\t"
|
||||
"swc1 %[f6], 4(%[aIm]) \n\t"
|
||||
"bgtz %[len], 1b \n\t"
|
||||
" addiu %[aIm], %[aIm], 8 \n\t"
|
||||
".set pop \n\t"
|
||||
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
|
||||
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
|
||||
[f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
|
||||
[fft_tmp] "=&r" (fft_tmp), [aRe] "+r" (aRe), [aIm] "+r" (aIm)
|
||||
: [fft] "r" (fft)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAec_OverdriveAndSuppress_mips(AecCore* aec,
|
||||
float hNl[PART_LEN1],
|
||||
const float hNlFb,
|
||||
float efw[2][PART_LEN1]) {
|
||||
int i;
|
||||
const float one = 1.0;
|
||||
float* p_hNl;
|
||||
float* p_efw0;
|
||||
float* p_efw1;
|
||||
float* p_WebRtcAec_wC;
|
||||
float temp1, temp2, temp3, temp4;
|
||||
|
||||
p_hNl = &hNl[0];
|
||||
p_efw0 = &efw[0][0];
|
||||
p_efw1 = &efw[1][0];
|
||||
p_WebRtcAec_wC = (float*)&WebRtcAec_weightCurve[0];
|
||||
|
||||
for (i = 0; i < PART_LEN1; i++) {
|
||||
// Weight subbands
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"lwc1 %[temp1], 0(%[p_hNl]) \n\t"
|
||||
"lwc1 %[temp2], 0(%[p_wC]) \n\t"
|
||||
"c.lt.s %[hNlFb], %[temp1] \n\t"
|
||||
"bc1f 1f \n\t"
|
||||
" mul.s %[temp3], %[temp2], %[hNlFb] \n\t"
|
||||
"sub.s %[temp4], %[one], %[temp2] \n\t"
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
"mul.s %[temp1], %[temp1], %[temp4] \n\t"
|
||||
"add.s %[temp1], %[temp3], %[temp1] \n\t"
|
||||
#else // #if !defined(MIPS32_R2_LE)
|
||||
"madd.s %[temp1], %[temp3], %[temp1], %[temp4] \n\t"
|
||||
#endif // #if !defined(MIPS32_R2_LE)
|
||||
"swc1 %[temp1], 0(%[p_hNl]) \n\t"
|
||||
"1: \n\t"
|
||||
"addiu %[p_wC], %[p_wC], 4 \n\t"
|
||||
".set pop \n\t"
|
||||
: [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
|
||||
[temp4] "=&f" (temp4), [p_wC] "+r" (p_WebRtcAec_wC)
|
||||
: [hNlFb] "f" (hNlFb), [one] "f" (one), [p_hNl] "r" (p_hNl)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
|
||||
|
||||
__asm __volatile (
|
||||
"lwc1 %[temp1], 0(%[p_hNl]) \n\t"
|
||||
"lwc1 %[temp3], 0(%[p_efw1]) \n\t"
|
||||
"lwc1 %[temp2], 0(%[p_efw0]) \n\t"
|
||||
"addiu %[p_hNl], %[p_hNl], 4 \n\t"
|
||||
"mul.s %[temp3], %[temp3], %[temp1] \n\t"
|
||||
"mul.s %[temp2], %[temp2], %[temp1] \n\t"
|
||||
"addiu %[p_efw0], %[p_efw0], 4 \n\t"
|
||||
"addiu %[p_efw1], %[p_efw1], 4 \n\t"
|
||||
"neg.s %[temp4], %[temp3] \n\t"
|
||||
"swc1 %[temp2], -4(%[p_efw0]) \n\t"
|
||||
"swc1 %[temp4], -4(%[p_efw1]) \n\t"
|
||||
: [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
|
||||
[temp4] "=&f" (temp4), [p_efw0] "+r" (p_efw0), [p_efw1] "+r" (p_efw1),
|
||||
[p_hNl] "+r" (p_hNl)
|
||||
:
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAec_ScaleErrorSignal_mips(AecCore* aec, float ef[2][PART_LEN1]) {
|
||||
const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
|
||||
const float error_threshold = aec->extended_filter_enabled
|
||||
? kExtendedErrorThreshold
|
||||
: aec->normal_error_threshold;
|
||||
int len = (PART_LEN1);
|
||||
float* ef0 = ef[0];
|
||||
float* ef1 = ef[1];
|
||||
float* xPow = aec->xPow;
|
||||
float fac1 = 1e-10f;
|
||||
float err_th2 = error_threshold * error_threshold;
|
||||
float f0, f1, f2;
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
float f3;
|
||||
#endif
|
||||
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"1: \n\t"
|
||||
"lwc1 %[f0], 0(%[xPow]) \n\t"
|
||||
"lwc1 %[f1], 0(%[ef0]) \n\t"
|
||||
"lwc1 %[f2], 0(%[ef1]) \n\t"
|
||||
"add.s %[f0], %[f0], %[fac1] \n\t"
|
||||
"div.s %[f1], %[f1], %[f0] \n\t"
|
||||
"div.s %[f2], %[f2], %[f0] \n\t"
|
||||
"mul.s %[f0], %[f1], %[f1] \n\t"
|
||||
#if defined(MIPS32_R2_LE)
|
||||
"madd.s %[f0], %[f0], %[f2], %[f2] \n\t"
|
||||
#else
|
||||
"mul.s %[f3], %[f2], %[f2] \n\t"
|
||||
"add.s %[f0], %[f0], %[f3] \n\t"
|
||||
#endif
|
||||
"c.le.s %[f0], %[err_th2] \n\t"
|
||||
"nop \n\t"
|
||||
"bc1t 2f \n\t"
|
||||
" nop \n\t"
|
||||
"sqrt.s %[f0], %[f0] \n\t"
|
||||
"add.s %[f0], %[f0], %[fac1] \n\t"
|
||||
"div.s %[f0], %[err_th], %[f0] \n\t"
|
||||
"mul.s %[f1], %[f1], %[f0] \n\t"
|
||||
"mul.s %[f2], %[f2], %[f0] \n\t"
|
||||
"2: \n\t"
|
||||
"mul.s %[f1], %[f1], %[mu] \n\t"
|
||||
"mul.s %[f2], %[f2], %[mu] \n\t"
|
||||
"swc1 %[f1], 0(%[ef0]) \n\t"
|
||||
"swc1 %[f2], 0(%[ef1]) \n\t"
|
||||
"addiu %[len], %[len], -1 \n\t"
|
||||
"addiu %[xPow], %[xPow], 4 \n\t"
|
||||
"addiu %[ef0], %[ef0], 4 \n\t"
|
||||
"bgtz %[len], 1b \n\t"
|
||||
" addiu %[ef1], %[ef1], 4 \n\t"
|
||||
".set pop \n\t"
|
||||
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
|
||||
#if !defined(MIPS32_R2_LE)
|
||||
[f3] "=&f" (f3),
|
||||
#endif
|
||||
[xPow] "+r" (xPow), [ef0] "+r" (ef0), [ef1] "+r" (ef1),
|
||||
[len] "+r" (len)
|
||||
: [fac1] "f" (fac1), [err_th2] "f" (err_th2), [mu] "f" (mu),
|
||||
[err_th] "f" (error_threshold)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
void WebRtcAec_InitAec_mips(void) {
|
||||
WebRtcAec_FilterFar = WebRtcAec_FilterFar_mips;
|
||||
WebRtcAec_FilterAdaptation = WebRtcAec_FilterAdaptation_mips;
|
||||
WebRtcAec_ScaleErrorSignal = WebRtcAec_ScaleErrorSignal_mips;
|
||||
WebRtcAec_ComfortNoise = WebRtcAec_ComfortNoise_mips;
|
||||
WebRtcAec_OverdriveAndSuppress = WebRtcAec_OverdriveAndSuppress_mips;
|
||||
}
|
||||
|
736
webrtc/modules/audio_processing/aec/aec_core_neon.c
Normal file
736
webrtc/modules/audio_processing/aec/aec_core_neon.c
Normal file
@ -0,0 +1,736 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The core AEC algorithm, neon version of speed-critical functions.
|
||||
*
|
||||
* Based on aec_core_sse2.c.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <math.h>
|
||||
#include <string.h> // memset
|
||||
|
||||
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_common.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
||||
|
||||
enum { kShiftExponentIntoTopMantissa = 8 };
|
||||
enum { kFloatExponentShift = 23 };
|
||||
|
||||
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
|
||||
return aRe * bRe - aIm * bIm;
|
||||
}
|
||||
|
||||
__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
|
||||
return aRe * bIm + aIm * bRe;
|
||||
}
|
||||
|
||||
static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) {
|
||||
int i;
|
||||
const int num_partitions = aec->num_partitions;
|
||||
for (i = 0; i < num_partitions; i++) {
|
||||
int j;
|
||||
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
|
||||
int pos = i * PART_LEN1;
|
||||
// Check for wrap
|
||||
if (i + aec->xfBufBlockPos >= num_partitions) {
|
||||
xPos -= num_partitions * PART_LEN1;
|
||||
}
|
||||
|
||||
// vectorized code (four at once)
|
||||
for (j = 0; j + 3 < PART_LEN1; j += 4) {
|
||||
const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
|
||||
const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
|
||||
const float32x4_t wfBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
|
||||
const float32x4_t wfBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
|
||||
const float32x4_t yf_re = vld1q_f32(&yf[0][j]);
|
||||
const float32x4_t yf_im = vld1q_f32(&yf[1][j]);
|
||||
const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re);
|
||||
const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im);
|
||||
const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im);
|
||||
const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re);
|
||||
const float32x4_t g = vaddq_f32(yf_re, e);
|
||||
const float32x4_t h = vaddq_f32(yf_im, f);
|
||||
vst1q_f32(&yf[0][j], g);
|
||||
vst1q_f32(&yf[1][j], h);
|
||||
}
|
||||
// scalar code for the remaining items.
|
||||
for (; j < PART_LEN1; j++) {
|
||||
yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
|
||||
aec->xfBuf[1][xPos + j],
|
||||
aec->wfBuf[0][pos + j],
|
||||
aec->wfBuf[1][pos + j]);
|
||||
yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
|
||||
aec->xfBuf[1][xPos + j],
|
||||
aec->wfBuf[0][pos + j],
|
||||
aec->wfBuf[1][pos + j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ARM64's arm_neon.h has already defined vdivq_f32 vsqrtq_f32.
|
||||
#if !defined (WEBRTC_ARCH_ARM64)
|
||||
static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
|
||||
int i;
|
||||
float32x4_t x = vrecpeq_f32(b);
|
||||
// from arm documentation
|
||||
// The Newton-Raphson iteration:
|
||||
// x[n+1] = x[n] * (2 - d * x[n])
|
||||
// converges to (1/d) if x0 is the result of VRECPE applied to d.
|
||||
//
|
||||
// Note: The precision did not improve after 2 iterations.
|
||||
for (i = 0; i < 2; i++) {
|
||||
x = vmulq_f32(vrecpsq_f32(b, x), x);
|
||||
}
|
||||
// a/b = a*(1/b)
|
||||
return vmulq_f32(a, x);
|
||||
}
|
||||
|
||||
static float32x4_t vsqrtq_f32(float32x4_t s) {
|
||||
int i;
|
||||
float32x4_t x = vrsqrteq_f32(s);
|
||||
|
||||
// Code to handle sqrt(0).
|
||||
// If the input to sqrtf() is zero, a zero will be returned.
|
||||
// If the input to vrsqrteq_f32() is zero, positive infinity is returned.
|
||||
const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
|
||||
// check for divide by zero
|
||||
const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
|
||||
// zero out the positive infinity results
|
||||
x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero),
|
||||
vreinterpretq_u32_f32(x)));
|
||||
// from arm documentation
|
||||
// The Newton-Raphson iteration:
|
||||
// x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2)
|
||||
// converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
|
||||
//
|
||||
// Note: The precision did not improve after 2 iterations.
|
||||
for (i = 0; i < 2; i++) {
|
||||
x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
|
||||
}
|
||||
// sqrt(s) = s * 1/sqrt(s)
|
||||
return vmulq_f32(s, x);;
|
||||
}
|
||||
#endif // WEBRTC_ARCH_ARM64
|
||||
|
||||
static void ScaleErrorSignalNEON(AecCore* aec, float ef[2][PART_LEN1]) {
|
||||
const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
|
||||
const float error_threshold = aec->extended_filter_enabled ?
|
||||
kExtendedErrorThreshold : aec->normal_error_threshold;
|
||||
const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
|
||||
const float32x4_t kMu = vmovq_n_f32(mu);
|
||||
const float32x4_t kThresh = vmovq_n_f32(error_threshold);
|
||||
int i;
|
||||
// vectorized code (four at once)
|
||||
for (i = 0; i + 3 < PART_LEN1; i += 4) {
|
||||
const float32x4_t xPow = vld1q_f32(&aec->xPow[i]);
|
||||
const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
|
||||
const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
|
||||
const float32x4_t xPowPlus = vaddq_f32(xPow, k1e_10f);
|
||||
float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
|
||||
float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
|
||||
const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
|
||||
const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
|
||||
const float32x4_t absEf = vsqrtq_f32(ef_sum2);
|
||||
const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
|
||||
const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
|
||||
const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
|
||||
uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
|
||||
uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
|
||||
uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger),
|
||||
vreinterpretq_u32_f32(ef_re));
|
||||
uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger),
|
||||
vreinterpretq_u32_f32(ef_im));
|
||||
ef_re_if = vandq_u32(bigger, ef_re_if);
|
||||
ef_im_if = vandq_u32(bigger, ef_im_if);
|
||||
ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
|
||||
ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
|
||||
ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
|
||||
ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
|
||||
vst1q_f32(&ef[0][i], ef_re);
|
||||
vst1q_f32(&ef[1][i], ef_im);
|
||||
}
|
||||
// scalar code for the remaining items.
|
||||
for (; i < PART_LEN1; i++) {
|
||||
float abs_ef;
|
||||
ef[0][i] /= (aec->xPow[i] + 1e-10f);
|
||||
ef[1][i] /= (aec->xPow[i] + 1e-10f);
|
||||
abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
|
||||
|
||||
if (abs_ef > error_threshold) {
|
||||
abs_ef = error_threshold / (abs_ef + 1e-10f);
|
||||
ef[0][i] *= abs_ef;
|
||||
ef[1][i] *= abs_ef;
|
||||
}
|
||||
|
||||
// Stepsize factor
|
||||
ef[0][i] *= mu;
|
||||
ef[1][i] *= mu;
|
||||
}
|
||||
}
|
||||
|
||||
static void FilterAdaptationNEON(AecCore* aec,
|
||||
float* fft,
|
||||
float ef[2][PART_LEN1]) {
|
||||
int i;
|
||||
const int num_partitions = aec->num_partitions;
|
||||
for (i = 0; i < num_partitions; i++) {
|
||||
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
|
||||
int pos = i * PART_LEN1;
|
||||
int j;
|
||||
// Check for wrap
|
||||
if (i + aec->xfBufBlockPos >= num_partitions) {
|
||||
xPos -= num_partitions * PART_LEN1;
|
||||
}
|
||||
|
||||
// Process the whole array...
|
||||
for (j = 0; j < PART_LEN; j += 4) {
|
||||
// Load xfBuf and ef.
|
||||
const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
|
||||
const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
|
||||
const float32x4_t ef_re = vld1q_f32(&ef[0][j]);
|
||||
const float32x4_t ef_im = vld1q_f32(&ef[1][j]);
|
||||
// Calculate the product of conjugate(xfBuf) by ef.
|
||||
// re(conjugate(a) * b) = aRe * bRe + aIm * bIm
|
||||
// im(conjugate(a) * b)= aRe * bIm - aIm * bRe
|
||||
const float32x4_t a = vmulq_f32(xfBuf_re, ef_re);
|
||||
const float32x4_t e = vmlaq_f32(a, xfBuf_im, ef_im);
|
||||
const float32x4_t c = vmulq_f32(xfBuf_re, ef_im);
|
||||
const float32x4_t f = vmlsq_f32(c, xfBuf_im, ef_re);
|
||||
// Interleave real and imaginary parts.
|
||||
const float32x4x2_t g_n_h = vzipq_f32(e, f);
|
||||
// Store
|
||||
vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
|
||||
vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
|
||||
}
|
||||
// ... and fixup the first imaginary entry.
|
||||
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
|
||||
-aec->xfBuf[1][xPos + PART_LEN],
|
||||
ef[0][PART_LEN],
|
||||
ef[1][PART_LEN]);
|
||||
|
||||
aec_rdft_inverse_128(fft);
|
||||
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
|
||||
|
||||
// fft scaling
|
||||
{
|
||||
const float scale = 2.0f / PART_LEN2;
|
||||
const float32x4_t scale_ps = vmovq_n_f32(scale);
|
||||
for (j = 0; j < PART_LEN; j += 4) {
|
||||
const float32x4_t fft_ps = vld1q_f32(&fft[j]);
|
||||
const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
|
||||
vst1q_f32(&fft[j], fft_scale);
|
||||
}
|
||||
}
|
||||
aec_rdft_forward_128(fft);
|
||||
|
||||
{
|
||||
const float wt1 = aec->wfBuf[1][pos];
|
||||
aec->wfBuf[0][pos + PART_LEN] += fft[1];
|
||||
for (j = 0; j < PART_LEN; j += 4) {
|
||||
float32x4_t wtBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
|
||||
float32x4_t wtBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
|
||||
const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
|
||||
const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
|
||||
const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
|
||||
wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
|
||||
wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);
|
||||
|
||||
vst1q_f32(&aec->wfBuf[0][pos + j], wtBuf_re);
|
||||
vst1q_f32(&aec->wfBuf[1][pos + j], wtBuf_im);
|
||||
}
|
||||
aec->wfBuf[1][pos] = wt1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
|
||||
// a^b = exp2(b * log2(a))
|
||||
// exp2(x) and log2(x) are calculated using polynomial approximations.
|
||||
float32x4_t log2_a, b_log2_a, a_exp_b;
|
||||
|
||||
// Calculate log2(x), x = a.
|
||||
{
|
||||
// To calculate log2(x), we decompose x like this:
|
||||
// x = y * 2^n
|
||||
// n is an integer
|
||||
// y is in the [1.0, 2.0) range
|
||||
//
|
||||
// log2(x) = log2(y) + n
|
||||
// n can be evaluated by playing with float representation.
|
||||
// log2(y) in a small range can be approximated, this code uses an order
|
||||
// five polynomial approximation. The coefficients have been
|
||||
// estimated with the Remez algorithm and the resulting
|
||||
// polynomial has a maximum relative error of 0.00086%.
|
||||
|
||||
// Compute n.
|
||||
// This is done by masking the exponent, shifting it into the top bit of
|
||||
// the mantissa, putting eight into the biased exponent (to shift/
|
||||
// compensate the fact that the exponent has been shifted in the top/
|
||||
// fractional part and finally getting rid of the implicit leading one
|
||||
// from the mantissa by substracting it out.
|
||||
const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
|
||||
const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
|
||||
const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
|
||||
const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
|
||||
vec_float_exponent_mask);
|
||||
const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
|
||||
const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
|
||||
const float32x4_t n =
|
||||
vsubq_f32(vreinterpretq_f32_u32(n_0),
|
||||
vreinterpretq_f32_u32(vec_implicit_leading_one));
|
||||
// Compute y.
|
||||
const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
|
||||
const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
|
||||
const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
|
||||
vec_mantissa_mask);
|
||||
const float32x4_t y =
|
||||
vreinterpretq_f32_u32(vorrq_u32(mantissa,
|
||||
vec_zero_biased_exponent_is_one));
|
||||
// Approximate log2(y) ~= (y - 1) * pol5(y).
|
||||
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
|
||||
const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
|
||||
const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
|
||||
const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
|
||||
const float32x4_t C2 = vdupq_n_f32(2.5988452f);
|
||||
const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
|
||||
const float32x4_t C0 = vdupq_n_f32(3.1157899f);
|
||||
float32x4_t pol5_y = C5;
|
||||
pol5_y = vmlaq_f32(C4, y, pol5_y);
|
||||
pol5_y = vmlaq_f32(C3, y, pol5_y);
|
||||
pol5_y = vmlaq_f32(C2, y, pol5_y);
|
||||
pol5_y = vmlaq_f32(C1, y, pol5_y);
|
||||
pol5_y = vmlaq_f32(C0, y, pol5_y);
|
||||
const float32x4_t y_minus_one =
|
||||
vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
|
||||
const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);
|
||||
|
||||
// Combine parts.
|
||||
log2_a = vaddq_f32(n, log2_y);
|
||||
}
|
||||
|
||||
// b * log2(a)
|
||||
b_log2_a = vmulq_f32(b, log2_a);
|
||||
|
||||
// Calculate exp2(x), x = b * log2(a).
|
||||
{
|
||||
// To calculate 2^x, we decompose x like this:
|
||||
// x = n + y
|
||||
// n is an integer, the value of x - 0.5 rounded down, therefore
|
||||
// y is in the [0.5, 1.5) range
|
||||
//
|
||||
// 2^x = 2^n * 2^y
|
||||
// 2^n can be evaluated by playing with float representation.
|
||||
// 2^y in a small range can be approximated, this code uses an order two
|
||||
// polynomial approximation. The coefficients have been estimated
|
||||
// with the Remez algorithm and the resulting polynomial has a
|
||||
// maximum relative error of 0.17%.
|
||||
// To avoid over/underflow, we reduce the range of input to ]-127, 129].
|
||||
const float32x4_t max_input = vdupq_n_f32(129.f);
|
||||
const float32x4_t min_input = vdupq_n_f32(-126.99999f);
|
||||
const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
|
||||
const float32x4_t x_max = vmaxq_f32(x_min, min_input);
|
||||
// Compute n.
|
||||
const float32x4_t half = vdupq_n_f32(0.5f);
|
||||
const float32x4_t x_minus_half = vsubq_f32(x_max, half);
|
||||
const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);
|
||||
|
||||
// Compute 2^n.
|
||||
const int32x4_t float_exponent_bias = vdupq_n_s32(127);
|
||||
const int32x4_t two_n_exponent =
|
||||
vaddq_s32(x_minus_half_floor, float_exponent_bias);
|
||||
const float32x4_t two_n =
|
||||
vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
|
||||
// Compute y.
|
||||
const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));
|
||||
|
||||
// Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
|
||||
const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
|
||||
const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
|
||||
const float32x4_t C0 = vdupq_n_f32(1.0017247f);
|
||||
float32x4_t exp2_y = C2;
|
||||
exp2_y = vmlaq_f32(C1, y, exp2_y);
|
||||
exp2_y = vmlaq_f32(C0, y, exp2_y);
|
||||
|
||||
// Combine parts.
|
||||
a_exp_b = vmulq_f32(exp2_y, two_n);
|
||||
}
|
||||
|
||||
return a_exp_b;
|
||||
}
|
||||
|
||||
static void OverdriveAndSuppressNEON(AecCore* aec,
|
||||
float hNl[PART_LEN1],
|
||||
const float hNlFb,
|
||||
float efw[2][PART_LEN1]) {
|
||||
int i;
|
||||
const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
|
||||
const float32x4_t vec_one = vdupq_n_f32(1.0f);
|
||||
const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
|
||||
const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);
|
||||
|
||||
// vectorized code (four at once)
|
||||
for (i = 0; i + 3 < PART_LEN1; i += 4) {
|
||||
// Weight subbands
|
||||
float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
|
||||
const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
|
||||
const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
|
||||
const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve,
|
||||
vec_hNlFb);
|
||||
const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
|
||||
const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve,
|
||||
vec_hNl);
|
||||
const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger),
|
||||
vreinterpretq_u32_f32(vec_hNl));
|
||||
const float32x4_t vec_one_weightCurve_add =
|
||||
vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
|
||||
const uint32x4_t vec_if1 =
|
||||
vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));
|
||||
|
||||
vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));
|
||||
|
||||
{
|
||||
const float32x4_t vec_overDriveCurve =
|
||||
vld1q_f32(&WebRtcAec_overDriveCurve[i]);
|
||||
const float32x4_t vec_overDriveSm_overDriveCurve =
|
||||
vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
|
||||
vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
|
||||
vst1q_f32(&hNl[i], vec_hNl);
|
||||
}
|
||||
|
||||
// Suppress error signal
|
||||
{
|
||||
float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
|
||||
float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
|
||||
vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
|
||||
vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);
|
||||
|
||||
// Ooura fft returns incorrect sign on imaginary component. It matters
|
||||
// here because we are making an additive change with comfort noise.
|
||||
vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
|
||||
vst1q_f32(&efw[0][i], vec_efw_re);
|
||||
vst1q_f32(&efw[1][i], vec_efw_im);
|
||||
}
|
||||
}
|
||||
|
||||
// scalar code for the remaining items.
|
||||
for (; i < PART_LEN1; i++) {
|
||||
// Weight subbands
|
||||
if (hNl[i] > hNlFb) {
|
||||
hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
|
||||
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
|
||||
}
|
||||
|
||||
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
|
||||
|
||||
// Suppress error signal
|
||||
efw[0][i] *= hNl[i];
|
||||
efw[1][i] *= hNl[i];
|
||||
|
||||
// Ooura fft returns incorrect sign on imaginary component. It matters
|
||||
// here because we are making an additive change with comfort noise.
|
||||
efw[1][i] *= -1;
|
||||
}
|
||||
}
|
||||
|
||||
static int PartitionDelay(const AecCore* aec) {
|
||||
// Measures the energy in each filter partition and returns the partition with
|
||||
// highest energy.
|
||||
// TODO(bjornv): Spread computational cost by computing one partition per
|
||||
// block?
|
||||
float wfEnMax = 0;
|
||||
int i;
|
||||
int delay = 0;
|
||||
|
||||
for (i = 0; i < aec->num_partitions; i++) {
|
||||
int j;
|
||||
int pos = i * PART_LEN1;
|
||||
float wfEn = 0;
|
||||
float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
|
||||
// vectorized code (four at once)
|
||||
for (j = 0; j + 3 < PART_LEN1; j += 4) {
|
||||
const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
|
||||
const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
|
||||
vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
|
||||
vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
|
||||
}
|
||||
{
|
||||
float32x2_t vec_total;
|
||||
// A B C D
|
||||
vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
|
||||
// A+B C+D
|
||||
vec_total = vpadd_f32(vec_total, vec_total);
|
||||
// A+B+C+D A+B+C+D
|
||||
wfEn = vget_lane_f32(vec_total, 0);
|
||||
}
|
||||
|
||||
// scalar code for the remaining items.
|
||||
for (; j < PART_LEN1; j++) {
|
||||
wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
|
||||
aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
|
||||
}
|
||||
|
||||
if (wfEn > wfEnMax) {
|
||||
wfEnMax = wfEn;
|
||||
delay = i;
|
||||
}
|
||||
}
|
||||
return delay;
|
||||
}
|
||||
|
||||
// Updates the following smoothed Power Spectral Densities (PSD):
|
||||
// - sd : near-end
|
||||
// - se : residual echo
|
||||
// - sx : far-end
|
||||
// - sde : cross-PSD of near-end and residual echo
|
||||
// - sxd : cross-PSD of near-end and far-end
|
||||
//
|
||||
// In addition to updating the PSDs, also the filter diverge state is determined
|
||||
// upon actions are taken.
|
||||
static void SmoothedPSD(AecCore* aec,
|
||||
float efw[2][PART_LEN1],
|
||||
float dfw[2][PART_LEN1],
|
||||
float xfw[2][PART_LEN1]) {
|
||||
// Power estimate smoothing coefficients.
|
||||
const float* ptrGCoh = aec->extended_filter_enabled
|
||||
? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
|
||||
: WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
|
||||
int i;
|
||||
float sdSum = 0, seSum = 0;
|
||||
const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
|
||||
float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
|
||||
float32x4_t vec_seSum = vdupq_n_f32(0.0f);
|
||||
|
||||
for (i = 0; i + 3 < PART_LEN1; i += 4) {
|
||||
const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
|
||||
const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
|
||||
const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
|
||||
const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
|
||||
const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
|
||||
const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
|
||||
float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
|
||||
float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
|
||||
float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
|
||||
float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
|
||||
float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
|
||||
float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);
|
||||
|
||||
vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
|
||||
vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
|
||||
vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
|
||||
vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
|
||||
vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
|
||||
vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
|
||||
vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);
|
||||
|
||||
vst1q_f32(&aec->sd[i], vec_sd);
|
||||
vst1q_f32(&aec->se[i], vec_se);
|
||||
vst1q_f32(&aec->sx[i], vec_sx);
|
||||
|
||||
{
|
||||
float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
|
||||
float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
|
||||
float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
|
||||
vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
|
||||
vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
|
||||
vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
|
||||
vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
|
||||
vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
|
||||
vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
|
||||
vst2q_f32(&aec->sde[i][0], vec_sde);
|
||||
}
|
||||
|
||||
{
|
||||
float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
|
||||
float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
|
||||
float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
|
||||
vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
|
||||
vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
|
||||
vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
|
||||
vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
|
||||
vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
|
||||
vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
|
||||
vst2q_f32(&aec->sxd[i][0], vec_sxd);
|
||||
}
|
||||
|
||||
vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
|
||||
vec_seSum = vaddq_f32(vec_seSum, vec_se);
|
||||
}
|
||||
{
|
||||
float32x2_t vec_sdSum_total;
|
||||
float32x2_t vec_seSum_total;
|
||||
// A B C D
|
||||
vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
|
||||
vget_high_f32(vec_sdSum));
|
||||
vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
|
||||
vget_high_f32(vec_seSum));
|
||||
// A+B C+D
|
||||
vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
|
||||
vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
|
||||
// A+B+C+D A+B+C+D
|
||||
sdSum = vget_lane_f32(vec_sdSum_total, 0);
|
||||
seSum = vget_lane_f32(vec_seSum_total, 0);
|
||||
}
|
||||
|
||||
// scalar code for the remaining items.
|
||||
for (; i < PART_LEN1; i++) {
|
||||
aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
|
||||
ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
|
||||
aec->se[i] = ptrGCoh[0] * aec->se[i] +
|
||||
ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
|
||||
// We threshold here to protect against the ill-effects of a zero farend.
|
||||
// The threshold is not arbitrarily chosen, but balances protection and
|
||||
// adverse interaction with the algorithm's tuning.
|
||||
// TODO(bjornv): investigate further why this is so sensitive.
|
||||
aec->sx[i] =
|
||||
ptrGCoh[0] * aec->sx[i] +
|
||||
ptrGCoh[1] * WEBRTC_SPL_MAX(
|
||||
xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
|
||||
WebRtcAec_kMinFarendPSD);
|
||||
|
||||
aec->sde[i][0] =
|
||||
ptrGCoh[0] * aec->sde[i][0] +
|
||||
ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
|
||||
aec->sde[i][1] =
|
||||
ptrGCoh[0] * aec->sde[i][1] +
|
||||
ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
|
||||
|
||||
aec->sxd[i][0] =
|
||||
ptrGCoh[0] * aec->sxd[i][0] +
|
||||
ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
|
||||
aec->sxd[i][1] =
|
||||
ptrGCoh[0] * aec->sxd[i][1] +
|
||||
ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
|
||||
|
||||
sdSum += aec->sd[i];
|
||||
seSum += aec->se[i];
|
||||
}
|
||||
|
||||
// Divergent filter safeguard.
|
||||
aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
|
||||
|
||||
if (aec->divergeState)
|
||||
memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
|
||||
|
||||
// Reset if error is significantly larger than nearend (13 dB).
|
||||
if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
|
||||
memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
|
||||
}
|
||||
|
||||
// Window time domain data to be used by the fft.
|
||||
__inline static void WindowData(float* x_windowed, const float* x) {
|
||||
int i;
|
||||
for (i = 0; i < PART_LEN; i += 4) {
|
||||
const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
|
||||
const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
|
||||
const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
|
||||
// A B C D
|
||||
float32x4_t vec_sqrtHanning_rev =
|
||||
vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
|
||||
// B A D C
|
||||
vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
|
||||
// D C B A
|
||||
vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
|
||||
vget_low_f32(vec_sqrtHanning_rev));
|
||||
vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
|
||||
vst1q_f32(&x_windowed[PART_LEN + i],
|
||||
vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
|
||||
}
|
||||
}
|
||||
|
||||
// Puts fft output data into a complex valued array.
|
||||
__inline static void StoreAsComplex(const float* data,
|
||||
float data_complex[2][PART_LEN1]) {
|
||||
int i;
|
||||
for (i = 0; i < PART_LEN; i += 4) {
|
||||
const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]);
|
||||
vst1q_f32(&data_complex[0][i], vec_data.val[0]);
|
||||
vst1q_f32(&data_complex[1][i], vec_data.val[1]);
|
||||
}
|
||||
// fix beginning/end values
|
||||
data_complex[1][0] = 0;
|
||||
data_complex[1][PART_LEN] = 0;
|
||||
data_complex[0][0] = data[0];
|
||||
data_complex[0][PART_LEN] = data[1];
|
||||
}
|
||||
|
||||
static void SubbandCoherenceNEON(AecCore* aec,
|
||||
float efw[2][PART_LEN1],
|
||||
float xfw[2][PART_LEN1],
|
||||
float* fft,
|
||||
float* cohde,
|
||||
float* cohxd) {
|
||||
float dfw[2][PART_LEN1];
|
||||
int i;
|
||||
|
||||
if (aec->delayEstCtr == 0)
|
||||
aec->delayIdx = PartitionDelay(aec);
|
||||
|
||||
// Use delayed far.
|
||||
memcpy(xfw,
|
||||
aec->xfwBuf + aec->delayIdx * PART_LEN1,
|
||||
sizeof(xfw[0][0]) * 2 * PART_LEN1);
|
||||
|
||||
// Windowed near fft
|
||||
WindowData(fft, aec->dBuf);
|
||||
aec_rdft_forward_128(fft);
|
||||
StoreAsComplex(fft, dfw);
|
||||
|
||||
// Windowed error fft
|
||||
WindowData(fft, aec->eBuf);
|
||||
aec_rdft_forward_128(fft);
|
||||
StoreAsComplex(fft, efw);
|
||||
|
||||
SmoothedPSD(aec, efw, dfw, xfw);
|
||||
|
||||
{
|
||||
const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f);
|
||||
|
||||
// Subband coherence
|
||||
for (i = 0; i + 3 < PART_LEN1; i += 4) {
|
||||
const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
|
||||
const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
|
||||
const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
|
||||
const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
|
||||
const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
|
||||
float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
|
||||
float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
|
||||
float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
|
||||
float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
|
||||
vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
|
||||
vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
|
||||
vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
|
||||
vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);
|
||||
|
||||
vst1q_f32(&cohde[i], vec_cohde);
|
||||
vst1q_f32(&cohxd[i], vec_cohxd);
|
||||
}
|
||||
}
|
||||
// scalar code for the remaining items.
|
||||
for (; i < PART_LEN1; i++) {
|
||||
cohde[i] =
|
||||
(aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
|
||||
(aec->sd[i] * aec->se[i] + 1e-10f);
|
||||
cohxd[i] =
|
||||
(aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
|
||||
(aec->sx[i] * aec->sd[i] + 1e-10f);
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAec_InitAec_neon(void) {
|
||||
WebRtcAec_FilterFar = FilterFarNEON;
|
||||
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON;
|
||||
WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
|
||||
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
|
||||
WebRtcAec_SubbandCoherence = SubbandCoherenceNEON;
|
||||
}
|
||||
|
@ -12,35 +12,33 @@
|
||||
* The core AEC algorithm, SSE2 version of speed-critical functions.
|
||||
*/
|
||||
|
||||
#include "typedefs.h"
|
||||
|
||||
#if defined(WEBRTC_USE_SSE2)
|
||||
#include <emmintrin.h>
|
||||
#include <math.h>
|
||||
#include <string.h> // memset
|
||||
|
||||
#include "aec_core.h"
|
||||
#include "aec_rdft.h"
|
||||
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_common.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
||||
|
||||
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm)
|
||||
{
|
||||
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
|
||||
return aRe * bRe - aIm * bIm;
|
||||
}
|
||||
|
||||
__inline static float MulIm(float aRe, float aIm, float bRe, float bIm)
|
||||
{
|
||||
__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
|
||||
return aRe * bIm + aIm * bRe;
|
||||
}
|
||||
|
||||
static void FilterFarSSE2(aec_t *aec, float yf[2][PART_LEN1])
|
||||
{
|
||||
static void FilterFarSSE2(AecCore* aec, float yf[2][PART_LEN1]) {
|
||||
int i;
|
||||
for (i = 0; i < NR_PART; i++) {
|
||||
const int num_partitions = aec->num_partitions;
|
||||
for (i = 0; i < num_partitions; i++) {
|
||||
int j;
|
||||
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
|
||||
int pos = i * PART_LEN1;
|
||||
// Check for wrap
|
||||
if (i + aec->xfBufBlockPos >= NR_PART) {
|
||||
xPos -= NR_PART*(PART_LEN1);
|
||||
if (i + aec->xfBufBlockPos >= num_partitions) {
|
||||
xPos -= num_partitions * (PART_LEN1);
|
||||
}
|
||||
|
||||
// vectorized code (four at once)
|
||||
@ -64,19 +62,25 @@ static void FilterFarSSE2(aec_t *aec, float yf[2][PART_LEN1])
|
||||
}
|
||||
// scalar code for the remaining items.
|
||||
for (; j < PART_LEN1; j++) {
|
||||
yf[0][j] += MulRe(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j],
|
||||
aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]);
|
||||
yf[1][j] += MulIm(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j],
|
||||
aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]);
|
||||
yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
|
||||
aec->xfBuf[1][xPos + j],
|
||||
aec->wfBuf[0][pos + j],
|
||||
aec->wfBuf[1][pos + j]);
|
||||
yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
|
||||
aec->xfBuf[1][xPos + j],
|
||||
aec->wfBuf[0][pos + j],
|
||||
aec->wfBuf[1][pos + j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
|
||||
{
|
||||
static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1]) {
|
||||
const __m128 k1e_10f = _mm_set1_ps(1e-10f);
|
||||
const __m128 kThresh = _mm_set1_ps(aec->errThresh);
|
||||
const __m128 kMu = _mm_set1_ps(aec->mu);
|
||||
const __m128 kMu = aec->extended_filter_enabled ? _mm_set1_ps(kExtendedMu)
|
||||
: _mm_set1_ps(aec->normal_mu);
|
||||
const __m128 kThresh = aec->extended_filter_enabled
|
||||
? _mm_set1_ps(kExtendedErrorThreshold)
|
||||
: _mm_set1_ps(aec->normal_error_threshold);
|
||||
|
||||
int i;
|
||||
// vectorized code (four at once)
|
||||
@ -110,36 +114,46 @@ static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
|
||||
_mm_storeu_ps(&ef[1][i], ef_im);
|
||||
}
|
||||
// scalar code for the remaining items.
|
||||
for (; i < (PART_LEN1); i++) {
|
||||
float absEf;
|
||||
ef[0][i] /= (aec->xPow[i] + 1e-10f);
|
||||
ef[1][i] /= (aec->xPow[i] + 1e-10f);
|
||||
absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
|
||||
{
|
||||
const float mu =
|
||||
aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
|
||||
const float error_threshold = aec->extended_filter_enabled
|
||||
? kExtendedErrorThreshold
|
||||
: aec->normal_error_threshold;
|
||||
for (; i < (PART_LEN1); i++) {
|
||||
float abs_ef;
|
||||
ef[0][i] /= (aec->xPow[i] + 1e-10f);
|
||||
ef[1][i] /= (aec->xPow[i] + 1e-10f);
|
||||
abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
|
||||
|
||||
if (absEf > aec->errThresh) {
|
||||
absEf = aec->errThresh / (absEf + 1e-10f);
|
||||
ef[0][i] *= absEf;
|
||||
ef[1][i] *= absEf;
|
||||
if (abs_ef > error_threshold) {
|
||||
abs_ef = error_threshold / (abs_ef + 1e-10f);
|
||||
ef[0][i] *= abs_ef;
|
||||
ef[1][i] *= abs_ef;
|
||||
}
|
||||
|
||||
// Stepsize factor
|
||||
ef[0][i] *= mu;
|
||||
ef[1][i] *= mu;
|
||||
}
|
||||
|
||||
// Stepsize factor
|
||||
ef[0][i] *= aec->mu;
|
||||
ef[1][i] *= aec->mu;
|
||||
}
|
||||
}
|
||||
|
||||
static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
|
||||
static void FilterAdaptationSSE2(AecCore* aec,
|
||||
float* fft,
|
||||
float ef[2][PART_LEN1]) {
|
||||
int i, j;
|
||||
for (i = 0; i < NR_PART; i++) {
|
||||
int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
|
||||
const int num_partitions = aec->num_partitions;
|
||||
for (i = 0; i < num_partitions; i++) {
|
||||
int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
|
||||
int pos = i * PART_LEN1;
|
||||
// Check for wrap
|
||||
if (i + aec->xfBufBlockPos >= NR_PART) {
|
||||
xPos -= NR_PART * PART_LEN1;
|
||||
if (i + aec->xfBufBlockPos >= num_partitions) {
|
||||
xPos -= num_partitions * PART_LEN1;
|
||||
}
|
||||
|
||||
// Process the whole array...
|
||||
for (j = 0; j < PART_LEN; j+= 4) {
|
||||
for (j = 0; j < PART_LEN; j += 4) {
|
||||
// Load xfBuf and ef.
|
||||
const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
|
||||
const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
|
||||
@ -158,22 +172,23 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1])
|
||||
const __m128 g = _mm_unpacklo_ps(e, f);
|
||||
const __m128 h = _mm_unpackhi_ps(e, f);
|
||||
// Store
|
||||
_mm_storeu_ps(&fft[2*j + 0], g);
|
||||
_mm_storeu_ps(&fft[2*j + 4], h);
|
||||
_mm_storeu_ps(&fft[2 * j + 0], g);
|
||||
_mm_storeu_ps(&fft[2 * j + 4], h);
|
||||
}
|
||||
// ... and fixup the first imaginary entry.
|
||||
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
|
||||
-aec->xfBuf[1][xPos + PART_LEN],
|
||||
ef[0][PART_LEN], ef[1][PART_LEN]);
|
||||
ef[0][PART_LEN],
|
||||
ef[1][PART_LEN]);
|
||||
|
||||
aec_rdft_inverse_128(fft);
|
||||
memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN);
|
||||
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
|
||||
|
||||
// fft scaling
|
||||
{
|
||||
float scale = 2.0f / PART_LEN2;
|
||||
const __m128 scale_ps = _mm_load_ps1(&scale);
|
||||
for (j = 0; j < PART_LEN; j+=4) {
|
||||
for (j = 0; j < PART_LEN; j += 4) {
|
||||
const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
|
||||
const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
|
||||
_mm_storeu_ps(&fft[j], fft_scale);
|
||||
@ -184,13 +199,15 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1])
|
||||
{
|
||||
float wt1 = aec->wfBuf[1][pos];
|
||||
aec->wfBuf[0][pos + PART_LEN] += fft[1];
|
||||
for (j = 0; j < PART_LEN; j+= 4) {
|
||||
for (j = 0; j < PART_LEN; j += 4) {
|
||||
__m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
|
||||
__m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
|
||||
const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
|
||||
const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
|
||||
const __m128 fft_re = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2 ,0));
|
||||
const __m128 fft_im = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3 ,1));
|
||||
const __m128 fft_re =
|
||||
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
|
||||
const __m128 fft_im =
|
||||
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
|
||||
wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
|
||||
wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
|
||||
_mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
|
||||
@ -201,8 +218,7 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1])
|
||||
}
|
||||
}
|
||||
|
||||
static __m128 mm_pow_ps(__m128 a, __m128 b)
|
||||
{
|
||||
static __m128 mm_pow_ps(__m128 a, __m128 b) {
|
||||
// a^b = exp2(b * log2(a))
|
||||
// exp2(x) and log2(x) are calculated using polynomial approximations.
|
||||
__m128 log2_a, b_log2_a, a_exp_b;
|
||||
@ -227,55 +243,55 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
|
||||
// compensate the fact that the exponent has been shifted in the top/
|
||||
// fractional part and finally getting rid of the implicit leading one
|
||||
// from the mantissa by substracting it out.
|
||||
static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END =
|
||||
{0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
|
||||
static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END =
|
||||
{0x43800000, 0x43800000, 0x43800000, 0x43800000};
|
||||
static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END =
|
||||
{0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
|
||||
static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {
|
||||
0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
|
||||
static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {
|
||||
0x43800000, 0x43800000, 0x43800000, 0x43800000};
|
||||
static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {
|
||||
0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
|
||||
static const int shift_exponent_into_top_mantissa = 8;
|
||||
const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
|
||||
const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(two_n),
|
||||
shift_exponent_into_top_mantissa));
|
||||
const __m128 n_0 = _mm_or_ps(n_1, *((__m128 *)eight_biased_exponent));
|
||||
const __m128 n = _mm_sub_ps(n_0, *((__m128 *)implicit_leading_one));
|
||||
const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask));
|
||||
const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
|
||||
_mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
|
||||
const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent));
|
||||
const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one));
|
||||
|
||||
// Compute y.
|
||||
static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END =
|
||||
{0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
|
||||
static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END =
|
||||
{0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
|
||||
const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask));
|
||||
const __m128 y = _mm_or_ps(
|
||||
mantissa, *((__m128 *)zero_biased_exponent_is_one));
|
||||
static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {
|
||||
0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
|
||||
static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {
|
||||
0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
|
||||
const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask));
|
||||
const __m128 y =
|
||||
_mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one));
|
||||
|
||||
// Approximate log2(y) ~= (y - 1) * pol5(y).
|
||||
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
|
||||
static const ALIGN16_BEG float ALIGN16_END C5[4] =
|
||||
{-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
|
||||
static const ALIGN16_BEG float ALIGN16_END C4[4] =
|
||||
{3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
|
||||
static const ALIGN16_BEG float ALIGN16_END C3[4] =
|
||||
{-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
|
||||
static const ALIGN16_BEG float ALIGN16_END C2[4] =
|
||||
{2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
|
||||
static const ALIGN16_BEG float ALIGN16_END C1[4] =
|
||||
{-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
|
||||
static const ALIGN16_BEG float ALIGN16_END C0[4] =
|
||||
{3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
|
||||
const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128 *)C5));
|
||||
const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4));
|
||||
static const ALIGN16_BEG float ALIGN16_END C5[4] = {
|
||||
-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
|
||||
static const ALIGN16_BEG float ALIGN16_END
|
||||
C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
|
||||
static const ALIGN16_BEG float ALIGN16_END
|
||||
C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
|
||||
static const ALIGN16_BEG float ALIGN16_END
|
||||
C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
|
||||
static const ALIGN16_BEG float ALIGN16_END
|
||||
C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
|
||||
static const ALIGN16_BEG float ALIGN16_END
|
||||
C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
|
||||
const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5));
|
||||
const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4));
|
||||
const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
|
||||
const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3));
|
||||
const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3));
|
||||
const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
|
||||
const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2));
|
||||
const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2));
|
||||
const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
|
||||
const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1));
|
||||
const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1));
|
||||
const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
|
||||
const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128 *)C0));
|
||||
const __m128 y_minus_one = _mm_sub_ps(
|
||||
y, *((__m128 *)zero_biased_exponent_is_one));
|
||||
const __m128 log2_y = _mm_mul_ps(y_minus_one , pol5_y);
|
||||
const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0));
|
||||
const __m128 y_minus_one =
|
||||
_mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one));
|
||||
const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);
|
||||
|
||||
// Combine parts.
|
||||
log2_a = _mm_add_ps(n, log2_y);
|
||||
@ -299,38 +315,38 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
|
||||
// maximum relative error of 0.17%.
|
||||
|
||||
// To avoid over/underflow, we reduce the range of input to ]-127, 129].
|
||||
static const ALIGN16_BEG float max_input[4] ALIGN16_END =
|
||||
{129.f, 129.f, 129.f, 129.f};
|
||||
static const ALIGN16_BEG float min_input[4] ALIGN16_END =
|
||||
{-126.99999f, -126.99999f, -126.99999f, -126.99999f};
|
||||
const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input));
|
||||
const __m128 x_max = _mm_max_ps(x_min, *((__m128 *)min_input));
|
||||
static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f,
|
||||
129.f, 129.f};
|
||||
static const ALIGN16_BEG float min_input[4] ALIGN16_END = {
|
||||
-126.99999f, -126.99999f, -126.99999f, -126.99999f};
|
||||
const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input));
|
||||
const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input));
|
||||
// Compute n.
|
||||
static const ALIGN16_BEG float half[4] ALIGN16_END =
|
||||
{0.5f, 0.5f, 0.5f, 0.5f};
|
||||
const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half));
|
||||
static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f,
|
||||
0.5f, 0.5f};
|
||||
const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half));
|
||||
const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
|
||||
// Compute 2^n.
|
||||
static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END =
|
||||
{127, 127, 127, 127};
|
||||
static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {
|
||||
127, 127, 127, 127};
|
||||
static const int float_exponent_shift = 23;
|
||||
const __m128i two_n_exponent = _mm_add_epi32(
|
||||
x_minus_half_floor, *((__m128i *)float_exponent_bias));
|
||||
const __m128 two_n = _mm_castsi128_ps(_mm_slli_epi32(
|
||||
two_n_exponent, float_exponent_shift));
|
||||
const __m128i two_n_exponent =
|
||||
_mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias));
|
||||
const __m128 two_n =
|
||||
_mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift));
|
||||
// Compute y.
|
||||
const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
|
||||
// Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
|
||||
static const ALIGN16_BEG float C2[4] ALIGN16_END =
|
||||
{3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
|
||||
static const ALIGN16_BEG float C1[4] ALIGN16_END =
|
||||
{6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
|
||||
static const ALIGN16_BEG float C0[4] ALIGN16_END =
|
||||
{1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f};
|
||||
const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128 *)C2));
|
||||
const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1));
|
||||
static const ALIGN16_BEG float C2[4] ALIGN16_END = {
|
||||
3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
|
||||
static const ALIGN16_BEG float C1[4] ALIGN16_END = {
|
||||
6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
|
||||
static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f,
|
||||
1.0017247f, 1.0017247f};
|
||||
const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2));
|
||||
const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1));
|
||||
const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
|
||||
const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128 *)C0));
|
||||
const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0));
|
||||
|
||||
// Combine parts.
|
||||
a_exp_b = _mm_mul_ps(exp2_y, two_n);
|
||||
@ -338,10 +354,8 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
|
||||
return a_exp_b;
|
||||
}
|
||||
|
||||
extern const float WebRtcAec_weightCurve[65];
|
||||
extern const float WebRtcAec_overDriveCurve[65];
|
||||
|
||||
static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
|
||||
static void OverdriveAndSuppressSSE2(AecCore* aec,
|
||||
float hNl[PART_LEN1],
|
||||
const float hNlFb,
|
||||
float efw[2][PART_LEN1]) {
|
||||
int i;
|
||||
@ -350,26 +364,25 @@ static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
|
||||
const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
|
||||
const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
|
||||
// vectorized code (four at once)
|
||||
for (i = 0; i + 3 < PART_LEN1; i+=4) {
|
||||
for (i = 0; i + 3 < PART_LEN1; i += 4) {
|
||||
// Weight subbands
|
||||
__m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
|
||||
const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
|
||||
const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
|
||||
const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(
|
||||
vec_weightCurve, vec_hNlFb);
|
||||
const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb);
|
||||
const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
|
||||
const __m128 vec_one_weightCurve_hNl = _mm_mul_ps(
|
||||
vec_one_weightCurve, vec_hNl);
|
||||
const __m128 vec_one_weightCurve_hNl =
|
||||
_mm_mul_ps(vec_one_weightCurve, vec_hNl);
|
||||
const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
|
||||
const __m128 vec_if1 = _mm_and_ps(
|
||||
bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
|
||||
vec_hNl = _mm_or_ps(vec_if0, vec_if1);
|
||||
|
||||
{
|
||||
const __m128 vec_overDriveCurve = _mm_loadu_ps(
|
||||
&WebRtcAec_overDriveCurve[i]);
|
||||
const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps(
|
||||
vec_overDriveSm, vec_overDriveCurve);
|
||||
const __m128 vec_overDriveCurve =
|
||||
_mm_loadu_ps(&WebRtcAec_overDriveCurve[i]);
|
||||
const __m128 vec_overDriveSm_overDriveCurve =
|
||||
_mm_mul_ps(vec_overDriveSm, vec_overDriveCurve);
|
||||
vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
|
||||
_mm_storeu_ps(&hNl[i], vec_hNl);
|
||||
}
|
||||
@ -393,7 +406,7 @@ static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
|
||||
// Weight subbands
|
||||
if (hNl[i] > hNlFb) {
|
||||
hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
|
||||
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
|
||||
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
|
||||
}
|
||||
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
|
||||
|
||||
@ -407,11 +420,312 @@ static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
|
||||
}
|
||||
}
|
||||
|
||||
__inline static void _mm_add_ps_4x1(__m128 sum, float *dst) {
|
||||
// A+B C+D
|
||||
sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
// A+B+C+D A+B+C+D
|
||||
sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
_mm_store_ss(dst, sum);
|
||||
}
|
||||
static int PartitionDelay(const AecCore* aec) {
|
||||
// Measures the energy in each filter partition and returns the partition with
|
||||
// highest energy.
|
||||
// TODO(bjornv): Spread computational cost by computing one partition per
|
||||
// block?
|
||||
float wfEnMax = 0;
|
||||
int i;
|
||||
int delay = 0;
|
||||
|
||||
for (i = 0; i < aec->num_partitions; i++) {
|
||||
int j;
|
||||
int pos = i * PART_LEN1;
|
||||
float wfEn = 0;
|
||||
__m128 vec_wfEn = _mm_set1_ps(0.0f);
|
||||
// vectorized code (four at once)
|
||||
for (j = 0; j + 3 < PART_LEN1; j += 4) {
|
||||
const __m128 vec_wfBuf0 = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
|
||||
const __m128 vec_wfBuf1 = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
|
||||
vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0));
|
||||
vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1));
|
||||
}
|
||||
_mm_add_ps_4x1(vec_wfEn, &wfEn);
|
||||
|
||||
// scalar code for the remaining items.
|
||||
for (; j < PART_LEN1; j++) {
|
||||
wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
|
||||
aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
|
||||
}
|
||||
|
||||
if (wfEn > wfEnMax) {
|
||||
wfEnMax = wfEn;
|
||||
delay = i;
|
||||
}
|
||||
}
|
||||
return delay;
|
||||
}
|
||||
|
||||
// Updates the following smoothed Power Spectral Densities (PSD):
|
||||
// - sd : near-end
|
||||
// - se : residual echo
|
||||
// - sx : far-end
|
||||
// - sde : cross-PSD of near-end and residual echo
|
||||
// - sxd : cross-PSD of near-end and far-end
|
||||
//
|
||||
// In addition to updating the PSDs, also the filter diverge state is determined
|
||||
// upon actions are taken.
|
||||
static void SmoothedPSD(AecCore* aec,
|
||||
float efw[2][PART_LEN1],
|
||||
float dfw[2][PART_LEN1],
|
||||
float xfw[2][PART_LEN1]) {
|
||||
// Power estimate smoothing coefficients.
|
||||
const float* ptrGCoh = aec->extended_filter_enabled
|
||||
? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
|
||||
: WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
|
||||
int i;
|
||||
float sdSum = 0, seSum = 0;
|
||||
const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD);
|
||||
const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]);
|
||||
const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]);
|
||||
__m128 vec_sdSum = _mm_set1_ps(0.0f);
|
||||
__m128 vec_seSum = _mm_set1_ps(0.0f);
|
||||
|
||||
for (i = 0; i + 3 < PART_LEN1; i += 4) {
|
||||
const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]);
|
||||
const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]);
|
||||
const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]);
|
||||
const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]);
|
||||
const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]);
|
||||
const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]);
|
||||
__m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0);
|
||||
__m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0);
|
||||
__m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0);
|
||||
__m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0);
|
||||
__m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0);
|
||||
__m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0);
|
||||
vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1));
|
||||
vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1));
|
||||
vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1));
|
||||
vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15);
|
||||
vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1));
|
||||
vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1));
|
||||
vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1));
|
||||
_mm_storeu_ps(&aec->sd[i], vec_sd);
|
||||
_mm_storeu_ps(&aec->se[i], vec_se);
|
||||
_mm_storeu_ps(&aec->sx[i], vec_sx);
|
||||
|
||||
{
|
||||
const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]);
|
||||
const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
|
||||
__m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
|
||||
_MM_SHUFFLE(2, 0, 2, 0));
|
||||
__m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
|
||||
_MM_SHUFFLE(3, 1, 3, 1));
|
||||
__m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0);
|
||||
__m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1);
|
||||
vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
|
||||
vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
|
||||
vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011,
|
||||
_mm_mul_ps(vec_dfw1, vec_efw1));
|
||||
vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110,
|
||||
_mm_mul_ps(vec_dfw1, vec_efw0));
|
||||
vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1));
|
||||
vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1));
|
||||
_mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b));
|
||||
_mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
|
||||
}
|
||||
|
||||
{
|
||||
const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
|
||||
const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
|
||||
__m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
|
||||
_MM_SHUFFLE(2, 0, 2, 0));
|
||||
__m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
|
||||
_MM_SHUFFLE(3, 1, 3, 1));
|
||||
__m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0);
|
||||
__m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1);
|
||||
vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
|
||||
vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
|
||||
vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011,
|
||||
_mm_mul_ps(vec_dfw1, vec_xfw1));
|
||||
vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110,
|
||||
_mm_mul_ps(vec_dfw1, vec_xfw0));
|
||||
vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1));
|
||||
vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1));
|
||||
_mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b));
|
||||
_mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
|
||||
}
|
||||
|
||||
vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd);
|
||||
vec_seSum = _mm_add_ps(vec_seSum, vec_se);
|
||||
}
|
||||
|
||||
_mm_add_ps_4x1(vec_sdSum, &sdSum);
|
||||
_mm_add_ps_4x1(vec_seSum, &seSum);
|
||||
|
||||
for (; i < PART_LEN1; i++) {
|
||||
aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
|
||||
ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
|
||||
aec->se[i] = ptrGCoh[0] * aec->se[i] +
|
||||
ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
|
||||
// We threshold here to protect against the ill-effects of a zero farend.
|
||||
// The threshold is not arbitrarily chosen, but balances protection and
|
||||
// adverse interaction with the algorithm's tuning.
|
||||
// TODO(bjornv): investigate further why this is so sensitive.
|
||||
aec->sx[i] =
|
||||
ptrGCoh[0] * aec->sx[i] +
|
||||
ptrGCoh[1] * WEBRTC_SPL_MAX(
|
||||
xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
|
||||
WebRtcAec_kMinFarendPSD);
|
||||
|
||||
aec->sde[i][0] =
|
||||
ptrGCoh[0] * aec->sde[i][0] +
|
||||
ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
|
||||
aec->sde[i][1] =
|
||||
ptrGCoh[0] * aec->sde[i][1] +
|
||||
ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
|
||||
|
||||
aec->sxd[i][0] =
|
||||
ptrGCoh[0] * aec->sxd[i][0] +
|
||||
ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
|
||||
aec->sxd[i][1] =
|
||||
ptrGCoh[0] * aec->sxd[i][1] +
|
||||
ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
|
||||
|
||||
sdSum += aec->sd[i];
|
||||
seSum += aec->se[i];
|
||||
}
|
||||
|
||||
// Divergent filter safeguard.
|
||||
aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
|
||||
|
||||
if (aec->divergeState)
|
||||
memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
|
||||
|
||||
// Reset if error is significantly larger than nearend (13 dB).
|
||||
if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
|
||||
memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
|
||||
}
|
||||
|
||||
// Window time domain data to be used by the fft.
|
||||
__inline static void WindowData(float* x_windowed, const float* x) {
|
||||
int i;
|
||||
for (i = 0; i < PART_LEN; i += 4) {
|
||||
const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]);
|
||||
const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]);
|
||||
const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]);
|
||||
// A B C D
|
||||
__m128 vec_sqrtHanning_rev =
|
||||
_mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
|
||||
// D C B A
|
||||
vec_sqrtHanning_rev =
|
||||
_mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev,
|
||||
_MM_SHUFFLE(0, 1, 2, 3));
|
||||
_mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning));
|
||||
_mm_storeu_ps(&x_windowed[PART_LEN + i],
|
||||
_mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev));
|
||||
}
|
||||
}
|
||||
|
||||
// Puts fft output data into a complex valued array.
|
||||
__inline static void StoreAsComplex(const float* data,
|
||||
float data_complex[2][PART_LEN1]) {
|
||||
int i;
|
||||
for (i = 0; i < PART_LEN; i += 4) {
|
||||
const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]);
|
||||
const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]);
|
||||
const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4,
|
||||
_MM_SHUFFLE(2, 0, 2, 0));
|
||||
const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4,
|
||||
_MM_SHUFFLE(3, 1, 3, 1));
|
||||
_mm_storeu_ps(&data_complex[0][i], vec_a);
|
||||
_mm_storeu_ps(&data_complex[1][i], vec_b);
|
||||
}
|
||||
// fix beginning/end values
|
||||
data_complex[1][0] = 0;
|
||||
data_complex[1][PART_LEN] = 0;
|
||||
data_complex[0][0] = data[0];
|
||||
data_complex[0][PART_LEN] = data[1];
|
||||
}
|
||||
|
||||
static void SubbandCoherenceSSE2(AecCore* aec,
|
||||
float efw[2][PART_LEN1],
|
||||
float xfw[2][PART_LEN1],
|
||||
float* fft,
|
||||
float* cohde,
|
||||
float* cohxd) {
|
||||
float dfw[2][PART_LEN1];
|
||||
int i;
|
||||
|
||||
if (aec->delayEstCtr == 0)
|
||||
aec->delayIdx = PartitionDelay(aec);
|
||||
|
||||
// Use delayed far.
|
||||
memcpy(xfw,
|
||||
aec->xfwBuf + aec->delayIdx * PART_LEN1,
|
||||
sizeof(xfw[0][0]) * 2 * PART_LEN1);
|
||||
|
||||
// Windowed near fft
|
||||
WindowData(fft, aec->dBuf);
|
||||
aec_rdft_forward_128(fft);
|
||||
StoreAsComplex(fft, dfw);
|
||||
|
||||
// Windowed error fft
|
||||
WindowData(fft, aec->eBuf);
|
||||
aec_rdft_forward_128(fft);
|
||||
StoreAsComplex(fft, efw);
|
||||
|
||||
SmoothedPSD(aec, efw, dfw, xfw);
|
||||
|
||||
{
|
||||
const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f);
|
||||
|
||||
// Subband coherence
|
||||
for (i = 0; i + 3 < PART_LEN1; i += 4) {
|
||||
const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]);
|
||||
const __m128 vec_se = _mm_loadu_ps(&aec->se[i]);
|
||||
const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]);
|
||||
const __m128 vec_sdse = _mm_add_ps(vec_1eminus10,
|
||||
_mm_mul_ps(vec_sd, vec_se));
|
||||
const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10,
|
||||
_mm_mul_ps(vec_sd, vec_sx));
|
||||
const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]);
|
||||
const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
|
||||
const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
|
||||
const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
|
||||
const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
|
||||
_MM_SHUFFLE(2, 0, 2, 0));
|
||||
const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
|
||||
_MM_SHUFFLE(3, 1, 3, 1));
|
||||
const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
|
||||
_MM_SHUFFLE(2, 0, 2, 0));
|
||||
const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
|
||||
_MM_SHUFFLE(3, 1, 3, 1));
|
||||
__m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0);
|
||||
__m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0);
|
||||
vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1));
|
||||
vec_cohde = _mm_div_ps(vec_cohde, vec_sdse);
|
||||
vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1));
|
||||
vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx);
|
||||
_mm_storeu_ps(&cohde[i], vec_cohde);
|
||||
_mm_storeu_ps(&cohxd[i], vec_cohxd);
|
||||
}
|
||||
|
||||
// scalar code for the remaining items.
|
||||
for (; i < PART_LEN1; i++) {
|
||||
cohde[i] =
|
||||
(aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
|
||||
(aec->sd[i] * aec->se[i] + 1e-10f);
|
||||
cohxd[i] =
|
||||
(aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
|
||||
(aec->sx[i] * aec->sd[i] + 1e-10f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAec_InitAec_SSE2(void) {
|
||||
WebRtcAec_FilterFar = FilterFarSSE2;
|
||||
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
|
||||
WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
|
||||
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
|
||||
WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2;
|
||||
}
|
||||
|
||||
#endif // WEBRTC_USE_SSE2
|
||||
|
@ -19,200 +19,193 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "aec_rdft.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "system_wrappers/interface/cpu_features_wrapper.h"
|
||||
#include "typedefs.h"
|
||||
#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
|
||||
#include "webrtc/typedefs.h"
|
||||
|
||||
// constants shared by all paths (C, SSE2).
|
||||
float rdft_w[64];
|
||||
// constants used by the C path.
|
||||
float rdft_wk3ri_first[32];
|
||||
float rdft_wk3ri_second[32];
|
||||
// constants used by SSE2 but initialized in C path.
|
||||
ALIGN16_BEG float ALIGN16_END rdft_wk1r[32];
|
||||
ALIGN16_BEG float ALIGN16_END rdft_wk2r[32];
|
||||
ALIGN16_BEG float ALIGN16_END rdft_wk3r[32];
|
||||
ALIGN16_BEG float ALIGN16_END rdft_wk1i[32];
|
||||
ALIGN16_BEG float ALIGN16_END rdft_wk2i[32];
|
||||
ALIGN16_BEG float ALIGN16_END rdft_wk3i[32];
|
||||
ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4];
|
||||
// These tables used to be computed at run-time. For example, refer to:
|
||||
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
|
||||
// to see the initialization code.
|
||||
const float rdft_w[64] = {
|
||||
1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
|
||||
0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
|
||||
0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
|
||||
0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
|
||||
0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
|
||||
0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
|
||||
0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
|
||||
0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
|
||||
0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
|
||||
0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
|
||||
0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
|
||||
0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
|
||||
0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
|
||||
0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
|
||||
0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
|
||||
0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
|
||||
};
|
||||
const float rdft_wk3ri_first[16] = {
|
||||
1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
|
||||
0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
|
||||
0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
|
||||
0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
|
||||
};
|
||||
const float rdft_wk3ri_second[16] = {
|
||||
-0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
|
||||
-0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
|
||||
-0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
|
||||
-0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
|
||||
};
|
||||
ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
|
||||
1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
|
||||
0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
|
||||
0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
|
||||
0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
|
||||
0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
|
||||
0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
|
||||
0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
|
||||
0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
|
||||
};
|
||||
ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
|
||||
1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
|
||||
0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
|
||||
0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
|
||||
0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
|
||||
0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
|
||||
0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
|
||||
0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
|
||||
0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
|
||||
};
|
||||
ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
|
||||
1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
|
||||
0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
|
||||
0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
|
||||
-0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
|
||||
0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
|
||||
0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
|
||||
0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
|
||||
-0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
|
||||
};
|
||||
ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
|
||||
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
|
||||
-0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
|
||||
-0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
|
||||
-0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
|
||||
-0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
|
||||
-0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
|
||||
-0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
|
||||
-0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
|
||||
};
|
||||
ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
|
||||
-0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
|
||||
-0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
|
||||
-0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
|
||||
-0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
|
||||
-0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
|
||||
-0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
|
||||
-0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
|
||||
-0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
|
||||
};
|
||||
ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
|
||||
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
|
||||
-0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
|
||||
-0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
|
||||
-0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
|
||||
-0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
|
||||
-0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
|
||||
-0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
|
||||
-0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
|
||||
};
|
||||
ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
|
||||
0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
|
||||
};
|
||||
|
||||
static int ip[16];
|
||||
static void bitrv2_128_C(float* a) {
|
||||
/*
|
||||
Following things have been attempted but are no faster:
|
||||
(a) Storing the swap indexes in a LUT (index calculations are done
|
||||
for 'free' while waiting on memory/L1).
|
||||
(b) Consolidate the load/store of two consecutive floats by a 64 bit
|
||||
integer (execution is memory/L1 bound).
|
||||
(c) Do a mix of floats and 64 bit integer to maximize register
|
||||
utilization (execution is memory/L1 bound).
|
||||
(d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
|
||||
(e) Hard-coding of the offsets to completely eliminates index
|
||||
calculations.
|
||||
*/
|
||||
|
||||
static void bitrv2_32or128(int n, int *ip, float *a) {
|
||||
// n is 32 or 128
|
||||
int j, j1, k, k1, m, m2;
|
||||
unsigned int j, j1, k, k1;
|
||||
float xr, xi, yr, yi;
|
||||
|
||||
ip[0] = 0;
|
||||
{
|
||||
int l = n;
|
||||
m = 1;
|
||||
while ((m << 3) < l) {
|
||||
l >>= 1;
|
||||
for (j = 0; j < m; j++) {
|
||||
ip[m + j] = ip[j] + l;
|
||||
}
|
||||
m <<= 1;
|
||||
}
|
||||
}
|
||||
m2 = 2 * m;
|
||||
for (k = 0; k < m; k++) {
|
||||
static const int ip[4] = {0, 64, 32, 96};
|
||||
for (k = 0; k < 4; k++) {
|
||||
for (j = 0; j < k; j++) {
|
||||
j1 = 2 * j + ip[k];
|
||||
k1 = 2 * k + ip[j];
|
||||
xr = a[j1];
|
||||
xr = a[j1 + 0];
|
||||
xi = a[j1 + 1];
|
||||
yr = a[k1];
|
||||
yr = a[k1 + 0];
|
||||
yi = a[k1 + 1];
|
||||
a[j1] = yr;
|
||||
a[j1 + 0] = yr;
|
||||
a[j1 + 1] = yi;
|
||||
a[k1] = xr;
|
||||
a[k1 + 0] = xr;
|
||||
a[k1 + 1] = xi;
|
||||
j1 += m2;
|
||||
k1 += 2 * m2;
|
||||
xr = a[j1];
|
||||
j1 += 8;
|
||||
k1 += 16;
|
||||
xr = a[j1 + 0];
|
||||
xi = a[j1 + 1];
|
||||
yr = a[k1];
|
||||
yr = a[k1 + 0];
|
||||
yi = a[k1 + 1];
|
||||
a[j1] = yr;
|
||||
a[j1 + 0] = yr;
|
||||
a[j1 + 1] = yi;
|
||||
a[k1] = xr;
|
||||
a[k1 + 0] = xr;
|
||||
a[k1 + 1] = xi;
|
||||
j1 += m2;
|
||||
k1 -= m2;
|
||||
xr = a[j1];
|
||||
j1 += 8;
|
||||
k1 -= 8;
|
||||
xr = a[j1 + 0];
|
||||
xi = a[j1 + 1];
|
||||
yr = a[k1];
|
||||
yr = a[k1 + 0];
|
||||
yi = a[k1 + 1];
|
||||
a[j1] = yr;
|
||||
a[j1 + 0] = yr;
|
||||
a[j1 + 1] = yi;
|
||||
a[k1] = xr;
|
||||
a[k1 + 0] = xr;
|
||||
a[k1 + 1] = xi;
|
||||
j1 += m2;
|
||||
k1 += 2 * m2;
|
||||
xr = a[j1];
|
||||
j1 += 8;
|
||||
k1 += 16;
|
||||
xr = a[j1 + 0];
|
||||
xi = a[j1 + 1];
|
||||
yr = a[k1];
|
||||
yr = a[k1 + 0];
|
||||
yi = a[k1 + 1];
|
||||
a[j1] = yr;
|
||||
a[j1 + 0] = yr;
|
||||
a[j1 + 1] = yi;
|
||||
a[k1] = xr;
|
||||
a[k1 + 0] = xr;
|
||||
a[k1 + 1] = xi;
|
||||
}
|
||||
j1 = 2 * k + m2 + ip[k];
|
||||
k1 = j1 + m2;
|
||||
xr = a[j1];
|
||||
j1 = 2 * k + 8 + ip[k];
|
||||
k1 = j1 + 8;
|
||||
xr = a[j1 + 0];
|
||||
xi = a[j1 + 1];
|
||||
yr = a[k1];
|
||||
yr = a[k1 + 0];
|
||||
yi = a[k1 + 1];
|
||||
a[j1] = yr;
|
||||
a[j1 + 0] = yr;
|
||||
a[j1 + 1] = yi;
|
||||
a[k1] = xr;
|
||||
a[k1 + 0] = xr;
|
||||
a[k1 + 1] = xi;
|
||||
}
|
||||
}
|
||||
|
||||
static void makewt_32(void) {
|
||||
const int nw = 32;
|
||||
int j, nwh;
|
||||
float delta, x, y;
|
||||
|
||||
ip[0] = nw;
|
||||
ip[1] = 1;
|
||||
nwh = nw >> 1;
|
||||
delta = atanf(1.0f) / nwh;
|
||||
rdft_w[0] = 1;
|
||||
rdft_w[1] = 0;
|
||||
rdft_w[nwh] = cosf(delta * nwh);
|
||||
rdft_w[nwh + 1] = rdft_w[nwh];
|
||||
for (j = 2; j < nwh; j += 2) {
|
||||
x = cosf(delta * j);
|
||||
y = sinf(delta * j);
|
||||
rdft_w[j] = x;
|
||||
rdft_w[j + 1] = y;
|
||||
rdft_w[nw - j] = y;
|
||||
rdft_w[nw - j + 1] = x;
|
||||
}
|
||||
bitrv2_32or128(nw, ip + 2, rdft_w);
|
||||
|
||||
// pre-calculate constants used by cft1st_128 and cftmdl_128...
|
||||
cftmdl_wk1r[0] = rdft_w[2];
|
||||
cftmdl_wk1r[1] = rdft_w[2];
|
||||
cftmdl_wk1r[2] = rdft_w[2];
|
||||
cftmdl_wk1r[3] = -rdft_w[2];
|
||||
{
|
||||
int k1;
|
||||
|
||||
for (k1 = 0, j = 0; j < 128; j += 16, k1 += 2) {
|
||||
const int k2 = 2 * k1;
|
||||
const float wk2r = rdft_w[k1 + 0];
|
||||
const float wk2i = rdft_w[k1 + 1];
|
||||
float wk1r, wk1i;
|
||||
// ... scalar version.
|
||||
wk1r = rdft_w[k2 + 0];
|
||||
wk1i = rdft_w[k2 + 1];
|
||||
rdft_wk3ri_first[k1 + 0] = wk1r - 2 * wk2i * wk1i;
|
||||
rdft_wk3ri_first[k1 + 1] = 2 * wk2i * wk1r - wk1i;
|
||||
wk1r = rdft_w[k2 + 2];
|
||||
wk1i = rdft_w[k2 + 3];
|
||||
rdft_wk3ri_second[k1 + 0] = wk1r - 2 * wk2r * wk1i;
|
||||
rdft_wk3ri_second[k1 + 1] = 2 * wk2r * wk1r - wk1i;
|
||||
// ... vector version.
|
||||
rdft_wk1r[k2 + 0] = rdft_w[k2 + 0];
|
||||
rdft_wk1r[k2 + 1] = rdft_w[k2 + 0];
|
||||
rdft_wk1r[k2 + 2] = rdft_w[k2 + 2];
|
||||
rdft_wk1r[k2 + 3] = rdft_w[k2 + 2];
|
||||
rdft_wk2r[k2 + 0] = rdft_w[k1 + 0];
|
||||
rdft_wk2r[k2 + 1] = rdft_w[k1 + 0];
|
||||
rdft_wk2r[k2 + 2] = -rdft_w[k1 + 1];
|
||||
rdft_wk2r[k2 + 3] = -rdft_w[k1 + 1];
|
||||
rdft_wk3r[k2 + 0] = rdft_wk3ri_first[k1 + 0];
|
||||
rdft_wk3r[k2 + 1] = rdft_wk3ri_first[k1 + 0];
|
||||
rdft_wk3r[k2 + 2] = rdft_wk3ri_second[k1 + 0];
|
||||
rdft_wk3r[k2 + 3] = rdft_wk3ri_second[k1 + 0];
|
||||
rdft_wk1i[k2 + 0] = -rdft_w[k2 + 1];
|
||||
rdft_wk1i[k2 + 1] = rdft_w[k2 + 1];
|
||||
rdft_wk1i[k2 + 2] = -rdft_w[k2 + 3];
|
||||
rdft_wk1i[k2 + 3] = rdft_w[k2 + 3];
|
||||
rdft_wk2i[k2 + 0] = -rdft_w[k1 + 1];
|
||||
rdft_wk2i[k2 + 1] = rdft_w[k1 + 1];
|
||||
rdft_wk2i[k2 + 2] = -rdft_w[k1 + 0];
|
||||
rdft_wk2i[k2 + 3] = rdft_w[k1 + 0];
|
||||
rdft_wk3i[k2 + 0] = -rdft_wk3ri_first[k1 + 1];
|
||||
rdft_wk3i[k2 + 1] = rdft_wk3ri_first[k1 + 1];
|
||||
rdft_wk3i[k2 + 2] = -rdft_wk3ri_second[k1 + 1];
|
||||
rdft_wk3i[k2 + 3] = rdft_wk3ri_second[k1 + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void makect_32(void) {
|
||||
float *c = rdft_w + 32;
|
||||
const int nc = 32;
|
||||
int j, nch;
|
||||
float delta;
|
||||
|
||||
ip[1] = nc;
|
||||
nch = nc >> 1;
|
||||
delta = atanf(1.0f) / nch;
|
||||
c[0] = cosf(delta * nch);
|
||||
c[nch] = 0.5f * c[0];
|
||||
for (j = 1; j < nch; j++) {
|
||||
c[j] = 0.5f * cosf(delta * j);
|
||||
c[nc - j] = 0.5f * sinf(delta * j);
|
||||
}
|
||||
}
|
||||
|
||||
static void cft1st_128_C(float *a) {
|
||||
static void cft1st_128_C(float* a) {
|
||||
const int n = 128;
|
||||
int j, k1, k2;
|
||||
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
|
||||
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
||||
|
||||
// The processing of the first set of elements was simplified in C to avoid
|
||||
// some operations (multiplication by zero or one, addition of two elements
|
||||
// multiplied by the same weight, ...).
|
||||
x0r = a[0] + a[2];
|
||||
x0i = a[1] + a[3];
|
||||
x1r = a[0] - a[2];
|
||||
@ -311,7 +304,7 @@ static void cft1st_128_C(float *a) {
|
||||
}
|
||||
}
|
||||
|
||||
static void cftmdl_128_C(float *a) {
|
||||
static void cftmdl_128_C(float* a) {
|
||||
const int l = 8;
|
||||
const int n = 128;
|
||||
const int m = 32;
|
||||
@ -320,7 +313,7 @@ static void cftmdl_128_C(float *a) {
|
||||
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
||||
|
||||
for (j0 = 0; j0 < l; j0 += 2) {
|
||||
j1 = j0 + 8;
|
||||
j1 = j0 + 8;
|
||||
j2 = j0 + 16;
|
||||
j3 = j0 + 24;
|
||||
x0r = a[j0 + 0] + a[j1 + 0];
|
||||
@ -342,7 +335,7 @@ static void cftmdl_128_C(float *a) {
|
||||
}
|
||||
wk1r = rdft_w[2];
|
||||
for (j0 = m; j0 < l + m; j0 += 2) {
|
||||
j1 = j0 + 8;
|
||||
j1 = j0 + 8;
|
||||
j2 = j0 + 16;
|
||||
j3 = j0 + 24;
|
||||
x0r = a[j0 + 0] + a[j1 + 0];
|
||||
@ -378,7 +371,7 @@ static void cftmdl_128_C(float *a) {
|
||||
wk3r = rdft_wk3ri_first[k1 + 0];
|
||||
wk3i = rdft_wk3ri_first[k1 + 1];
|
||||
for (j0 = k; j0 < l + k; j0 += 2) {
|
||||
j1 = j0 + 8;
|
||||
j1 = j0 + 8;
|
||||
j2 = j0 + 16;
|
||||
j3 = j0 + 24;
|
||||
x0r = a[j0 + 0] + a[j1 + 0];
|
||||
@ -409,7 +402,7 @@ static void cftmdl_128_C(float *a) {
|
||||
wk3r = rdft_wk3ri_second[k1 + 0];
|
||||
wk3i = rdft_wk3ri_second[k1 + 1];
|
||||
for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
|
||||
j1 = j0 + 8;
|
||||
j1 = j0 + 8;
|
||||
j2 = j0 + 16;
|
||||
j3 = j0 + 24;
|
||||
x0r = a[j0 + 0] + a[j1 + 0];
|
||||
@ -438,7 +431,7 @@ static void cftmdl_128_C(float *a) {
|
||||
}
|
||||
}
|
||||
|
||||
static void cftfsub_128(float *a) {
|
||||
static void cftfsub_128_C(float* a) {
|
||||
int j, j1, j2, j3, l;
|
||||
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
||||
|
||||
@ -468,7 +461,7 @@ static void cftfsub_128(float *a) {
|
||||
}
|
||||
}
|
||||
|
||||
static void cftbsub_128(float *a) {
|
||||
static void cftbsub_128_C(float* a) {
|
||||
int j, j1, j2, j3, l;
|
||||
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
||||
|
||||
@ -499,14 +492,14 @@ static void cftbsub_128(float *a) {
|
||||
}
|
||||
}
|
||||
|
||||
static void rftfsub_128_C(float *a) {
|
||||
const float *c = rdft_w + 32;
|
||||
static void rftfsub_128_C(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2, k1, k2;
|
||||
float wkr, wki, xr, xi, yr, yi;
|
||||
|
||||
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
|
||||
k2 = 128 - j2;
|
||||
k1 = 32 - j1;
|
||||
k1 = 32 - j1;
|
||||
wkr = 0.5f - c[k1];
|
||||
wki = c[j1];
|
||||
xr = a[j2 + 0] - a[k2 + 0];
|
||||
@ -520,15 +513,15 @@ static void rftfsub_128_C(float *a) {
|
||||
}
|
||||
}
|
||||
|
||||
static void rftbsub_128_C(float *a) {
|
||||
const float *c = rdft_w + 32;
|
||||
static void rftbsub_128_C(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2, k1, k2;
|
||||
float wkr, wki, xr, xi, yr, yi;
|
||||
|
||||
a[1] = -a[1];
|
||||
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
|
||||
k2 = 128 - j2;
|
||||
k1 = 32 - j1;
|
||||
k1 = 32 - j1;
|
||||
wkr = 0.5f - c[k1];
|
||||
wki = c[j1];
|
||||
xr = a[j2 + 0] - a[k2 + 0];
|
||||
@ -543,11 +536,9 @@ static void rftbsub_128_C(float *a) {
|
||||
a[65] = -a[65];
|
||||
}
|
||||
|
||||
void aec_rdft_forward_128(float *a) {
|
||||
const int n = 128;
|
||||
void aec_rdft_forward_128(float* a) {
|
||||
float xi;
|
||||
|
||||
bitrv2_32or128(n, ip + 2, a);
|
||||
bitrv2_128(a);
|
||||
cftfsub_128(a);
|
||||
rftfsub_128(a);
|
||||
xi = a[0] - a[1];
|
||||
@ -555,33 +546,44 @@ void aec_rdft_forward_128(float *a) {
|
||||
a[1] = xi;
|
||||
}
|
||||
|
||||
void aec_rdft_inverse_128(float *a) {
|
||||
const int n = 128;
|
||||
|
||||
void aec_rdft_inverse_128(float* a) {
|
||||
a[1] = 0.5f * (a[0] - a[1]);
|
||||
a[0] -= a[1];
|
||||
rftbsub_128(a);
|
||||
bitrv2_32or128(n, ip + 2, a);
|
||||
bitrv2_128(a);
|
||||
cftbsub_128(a);
|
||||
}
|
||||
|
||||
// code path selection
|
||||
rft_sub_128_t cft1st_128;
|
||||
rft_sub_128_t cftmdl_128;
|
||||
rft_sub_128_t rftfsub_128;
|
||||
rft_sub_128_t rftbsub_128;
|
||||
RftSub128 cft1st_128;
|
||||
RftSub128 cftmdl_128;
|
||||
RftSub128 rftfsub_128;
|
||||
RftSub128 rftbsub_128;
|
||||
RftSub128 cftfsub_128;
|
||||
RftSub128 cftbsub_128;
|
||||
RftSub128 bitrv2_128;
|
||||
|
||||
void aec_rdft_init(void) {
|
||||
cft1st_128 = cft1st_128_C;
|
||||
cftmdl_128 = cftmdl_128_C;
|
||||
rftfsub_128 = rftfsub_128_C;
|
||||
rftbsub_128 = rftbsub_128_C;
|
||||
cftfsub_128 = cftfsub_128_C;
|
||||
cftbsub_128 = cftbsub_128_C;
|
||||
bitrv2_128 = bitrv2_128_C;
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
if (WebRtc_GetCPUInfo(kSSE2)) {
|
||||
#if defined(WEBRTC_USE_SSE2)
|
||||
aec_rdft_init_sse2();
|
||||
#endif
|
||||
}
|
||||
// init library constants.
|
||||
makewt_32();
|
||||
makect_32();
|
||||
#endif
|
||||
#if defined(MIPS_FPU_LE)
|
||||
aec_rdft_init_mips();
|
||||
#endif
|
||||
#if defined(WEBRTC_HAS_NEON)
|
||||
aec_rdft_init_neon();
|
||||
#elif defined(WEBRTC_DETECT_NEON)
|
||||
if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
|
||||
aec_rdft_init_neon();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -11,6 +11,8 @@
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
|
||||
|
||||
#include "webrtc/modules/audio_processing/aec/aec_common.h"
|
||||
|
||||
// These intrinsics were unavailable before VS 2008.
|
||||
// TODO(andrew): move to a common file.
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1500
|
||||
@ -19,39 +21,41 @@ static __inline __m128 _mm_castsi128_ps(__m128i a) { return *(__m128*)&a; }
|
||||
static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER /* visual c++ */
|
||||
# define ALIGN16_BEG __declspec(align(16))
|
||||
# define ALIGN16_END
|
||||
#else /* gcc or icc */
|
||||
# define ALIGN16_BEG
|
||||
# define ALIGN16_END __attribute__((aligned(16)))
|
||||
#endif
|
||||
|
||||
// constants shared by all paths (C, SSE2).
|
||||
extern float rdft_w[64];
|
||||
// constants used by the C path.
|
||||
extern float rdft_wk3ri_first[32];
|
||||
extern float rdft_wk3ri_second[32];
|
||||
// constants used by SSE2 but initialized in C path.
|
||||
extern float rdft_wk1r[32];
|
||||
extern float rdft_wk2r[32];
|
||||
extern float rdft_wk3r[32];
|
||||
extern float rdft_wk1i[32];
|
||||
extern float rdft_wk2i[32];
|
||||
extern float rdft_wk3i[32];
|
||||
extern float cftmdl_wk1r[4];
|
||||
// Constants shared by all paths (C, SSE2, NEON).
|
||||
extern const float rdft_w[64];
|
||||
// Constants used by the C path.
|
||||
extern const float rdft_wk3ri_first[16];
|
||||
extern const float rdft_wk3ri_second[16];
|
||||
// Constants used by SSE2 and NEON but initialized in the C path.
|
||||
extern ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32];
|
||||
extern ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32];
|
||||
extern ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32];
|
||||
extern ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32];
|
||||
extern ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32];
|
||||
extern ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32];
|
||||
extern ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4];
|
||||
|
||||
// code path selection function pointers
|
||||
typedef void (*rft_sub_128_t)(float *a);
|
||||
extern rft_sub_128_t rftfsub_128;
|
||||
extern rft_sub_128_t rftbsub_128;
|
||||
extern rft_sub_128_t cft1st_128;
|
||||
extern rft_sub_128_t cftmdl_128;
|
||||
typedef void (*RftSub128)(float* a);
|
||||
extern RftSub128 rftfsub_128;
|
||||
extern RftSub128 rftbsub_128;
|
||||
extern RftSub128 cft1st_128;
|
||||
extern RftSub128 cftmdl_128;
|
||||
extern RftSub128 cftfsub_128;
|
||||
extern RftSub128 cftbsub_128;
|
||||
extern RftSub128 bitrv2_128;
|
||||
|
||||
// entry points
|
||||
void aec_rdft_init(void);
|
||||
void aec_rdft_init_sse2(void);
|
||||
void aec_rdft_forward_128(float *a);
|
||||
void aec_rdft_inverse_128(float *a);
|
||||
void aec_rdft_forward_128(float* a);
|
||||
void aec_rdft_inverse_128(float* a);
|
||||
|
||||
#if defined(MIPS_FPU_LE)
|
||||
void aec_rdft_init_mips(void);
|
||||
#endif
|
||||
#if defined(WEBRTC_DETECT_NEON) || defined(WEBRTC_HAS_NEON)
|
||||
void aec_rdft_init_neon(void);
|
||||
#endif
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
|
||||
|
1187
webrtc/modules/audio_processing/aec/aec_rdft_mips.c
Normal file
1187
webrtc/modules/audio_processing/aec/aec_rdft_mips.c
Normal file
File diff suppressed because it is too large
Load Diff
355
webrtc/modules/audio_processing/aec/aec_rdft_neon.c
Normal file
355
webrtc/modules/audio_processing/aec/aec_rdft_neon.c
Normal file
@ -0,0 +1,355 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The rdft AEC algorithm, neon version of speed-critical functions.
|
||||
*
|
||||
* Based on the sse2 version.
|
||||
*/
|
||||
|
||||
|
||||
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
static const ALIGN16_BEG float ALIGN16_END
|
||||
k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
|
||||
|
||||
static void cft1st_128_neon(float* a) {
|
||||
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
|
||||
int j, k2;
|
||||
|
||||
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
|
||||
float32x4_t a00v = vld1q_f32(&a[j + 0]);
|
||||
float32x4_t a04v = vld1q_f32(&a[j + 4]);
|
||||
float32x4_t a08v = vld1q_f32(&a[j + 8]);
|
||||
float32x4_t a12v = vld1q_f32(&a[j + 12]);
|
||||
float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
|
||||
float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
|
||||
float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
|
||||
float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
|
||||
const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
|
||||
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
|
||||
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
|
||||
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
|
||||
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
|
||||
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
|
||||
float32x4_t x0v = vaddq_f32(a01v, a23v);
|
||||
const float32x4_t x1v = vsubq_f32(a01v, a23v);
|
||||
const float32x4_t x2v = vaddq_f32(a45v, a67v);
|
||||
const float32x4_t x3v = vsubq_f32(a45v, a67v);
|
||||
const float32x4_t x3w = vrev64q_f32(x3v);
|
||||
float32x4_t x0w;
|
||||
a01v = vaddq_f32(x0v, x2v);
|
||||
x0v = vsubq_f32(x0v, x2v);
|
||||
x0w = vrev64q_f32(x0v);
|
||||
a45v = vmulq_f32(wk2rv, x0v);
|
||||
a45v = vmlaq_f32(a45v, wk2iv, x0w);
|
||||
x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
|
||||
x0w = vrev64q_f32(x0v);
|
||||
a23v = vmulq_f32(wk1rv, x0v);
|
||||
a23v = vmlaq_f32(a23v, wk1iv, x0w);
|
||||
x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
|
||||
x0w = vrev64q_f32(x0v);
|
||||
a67v = vmulq_f32(wk3rv, x0v);
|
||||
a67v = vmlaq_f32(a67v, wk3iv, x0w);
|
||||
a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
|
||||
a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
|
||||
a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
|
||||
a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
|
||||
vst1q_f32(&a[j + 0], a00v);
|
||||
vst1q_f32(&a[j + 4], a04v);
|
||||
vst1q_f32(&a[j + 8], a08v);
|
||||
vst1q_f32(&a[j + 12], a12v);
|
||||
}
|
||||
}
|
||||
|
||||
static void cftmdl_128_neon(float* a) {
|
||||
int j;
|
||||
const int l = 8;
|
||||
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
|
||||
float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
|
||||
|
||||
for (j = 0; j < l; j += 2) {
|
||||
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
|
||||
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
|
||||
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
|
||||
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
|
||||
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
|
||||
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
|
||||
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
|
||||
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
|
||||
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
|
||||
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
|
||||
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
|
||||
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
|
||||
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
|
||||
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
|
||||
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
|
||||
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
|
||||
const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
|
||||
const float32x4_t x1_x3_add =
|
||||
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const float32x4_t x1_x3_sub =
|
||||
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
|
||||
const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
|
||||
const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
|
||||
const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
|
||||
const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
|
||||
const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
|
||||
const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
|
||||
const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
|
||||
const float32x4_t xx1_rev = vrev64q_f32(xx1);
|
||||
const float32x4_t yy4_rev = vrev64q_f32(yy4);
|
||||
|
||||
vst1_f32(&a[j + 0], vget_low_f32(xx0));
|
||||
vst1_f32(&a[j + 32], vget_high_f32(xx0));
|
||||
vst1_f32(&a[j + 16], vget_low_f32(xx1));
|
||||
vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
|
||||
|
||||
a[j + 48] = -a[j + 48];
|
||||
|
||||
vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
|
||||
vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
|
||||
vst1_f32(&a[j + 40], vget_low_f32(yy4));
|
||||
vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
|
||||
}
|
||||
|
||||
{
|
||||
const int k = 64;
|
||||
const int k1 = 2;
|
||||
const int k2 = 2 * k1;
|
||||
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
|
||||
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
|
||||
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
|
||||
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
|
||||
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
|
||||
wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
|
||||
for (j = k; j < l + k; j += 2) {
|
||||
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
|
||||
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
|
||||
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
|
||||
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
|
||||
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
|
||||
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
|
||||
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
|
||||
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
|
||||
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
|
||||
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
|
||||
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
|
||||
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
|
||||
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
|
||||
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
|
||||
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
|
||||
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
|
||||
const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
|
||||
const float32x4_t x1_x3_add =
|
||||
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const float32x4_t x1_x3_sub =
|
||||
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
|
||||
float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
|
||||
float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
|
||||
xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
|
||||
xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
|
||||
xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
|
||||
|
||||
vst1_f32(&a[j + 0], vget_low_f32(xx));
|
||||
vst1_f32(&a[j + 32], vget_high_f32(xx));
|
||||
vst1_f32(&a[j + 16], vget_low_f32(xx4));
|
||||
vst1_f32(&a[j + 48], vget_high_f32(xx4));
|
||||
vst1_f32(&a[j + 8], vget_low_f32(xx12));
|
||||
vst1_f32(&a[j + 40], vget_high_f32(xx12));
|
||||
vst1_f32(&a[j + 24], vget_low_f32(xx22));
|
||||
vst1_f32(&a[j + 56], vget_high_f32(xx22));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
|
||||
// A B C D -> C D A B
|
||||
const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
|
||||
// C D A B -> D C B A
|
||||
return vrev64q_f32(rev);
|
||||
}
|
||||
|
||||
static void rftfsub_128_neon(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2;
|
||||
const float32x4_t mm_half = vdupq_n_f32(0.5f);
|
||||
|
||||
// Vectorized code (four at once).
|
||||
// Note: commented number are indexes for the first iteration of the loop.
|
||||
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
|
||||
// Load 'wk'.
|
||||
const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
|
||||
const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
|
||||
const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
|
||||
const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
|
||||
const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
|
||||
// Load and shuffle 'a'.
|
||||
// 2, 4, 6, 8, 3, 5, 7, 9
|
||||
float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
|
||||
// 120, 122, 124, 126, 121, 123, 125, 127,
|
||||
const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
|
||||
// 126, 124, 122, 120
|
||||
const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
|
||||
// 127, 125, 123, 121
|
||||
const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
|
||||
// Calculate 'x'.
|
||||
const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
|
||||
// 2-126, 4-124, 6-122, 8-120,
|
||||
const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
|
||||
// 3-127, 5-125, 7-123, 9-121,
|
||||
// Calculate product into 'y'.
|
||||
// yr = wkr * xr - wki * xi;
|
||||
// yi = wkr * xi + wki * xr;
|
||||
const float32x4_t a_ = vmulq_f32(wkr_, xr_);
|
||||
const float32x4_t b_ = vmulq_f32(wki_, xi_);
|
||||
const float32x4_t c_ = vmulq_f32(wkr_, xi_);
|
||||
const float32x4_t d_ = vmulq_f32(wki_, xr_);
|
||||
const float32x4_t yr_ = vsubq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120,
|
||||
const float32x4_t yi_ = vaddq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121,
|
||||
// Update 'a'.
|
||||
// a[j2 + 0] -= yr;
|
||||
// a[j2 + 1] -= yi;
|
||||
// a[k2 + 0] += yr;
|
||||
// a[k2 + 1] -= yi;
|
||||
// 126, 124, 122, 120,
|
||||
const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
|
||||
// 127, 125, 123, 121,
|
||||
const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
|
||||
// Shuffle in right order and store.
|
||||
const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
|
||||
const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
|
||||
// 124, 125, 126, 127, 120, 121, 122, 123
|
||||
const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
|
||||
// 2, 4, 6, 8,
|
||||
a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
|
||||
// 3, 5, 7, 9,
|
||||
a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
|
||||
// 2, 3, 4, 5, 6, 7, 8, 9,
|
||||
vst2q_f32(&a[0 + j2], a_j2_p);
|
||||
|
||||
vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
|
||||
vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
|
||||
}
|
||||
|
||||
// Scalar code for the remaining items.
|
||||
for (; j2 < 64; j1 += 1, j2 += 2) {
|
||||
const int k2 = 128 - j2;
|
||||
const int k1 = 32 - j1;
|
||||
const float wkr = 0.5f - c[k1];
|
||||
const float wki = c[j1];
|
||||
const float xr = a[j2 + 0] - a[k2 + 0];
|
||||
const float xi = a[j2 + 1] + a[k2 + 1];
|
||||
const float yr = wkr * xr - wki * xi;
|
||||
const float yi = wkr * xi + wki * xr;
|
||||
a[j2 + 0] -= yr;
|
||||
a[j2 + 1] -= yi;
|
||||
a[k2 + 0] += yr;
|
||||
a[k2 + 1] -= yi;
|
||||
}
|
||||
}
|
||||
|
||||
static void rftbsub_128_neon(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2;
|
||||
const float32x4_t mm_half = vdupq_n_f32(0.5f);
|
||||
|
||||
a[1] = -a[1];
|
||||
// Vectorized code (four at once).
|
||||
// Note: commented number are indexes for the first iteration of the loop.
|
||||
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
|
||||
// Load 'wk'.
|
||||
const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
|
||||
const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
|
||||
const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
|
||||
const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
|
||||
const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
|
||||
// Load and shuffle 'a'.
|
||||
// 2, 4, 6, 8, 3, 5, 7, 9
|
||||
float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
|
||||
// 120, 122, 124, 126, 121, 123, 125, 127,
|
||||
const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
|
||||
// 126, 124, 122, 120
|
||||
const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
|
||||
// 127, 125, 123, 121
|
||||
const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
|
||||
// Calculate 'x'.
|
||||
const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
|
||||
// 2-126, 4-124, 6-122, 8-120,
|
||||
const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
|
||||
// 3-127, 5-125, 7-123, 9-121,
|
||||
// Calculate product into 'y'.
|
||||
// yr = wkr * xr - wki * xi;
|
||||
// yi = wkr * xi + wki * xr;
|
||||
const float32x4_t a_ = vmulq_f32(wkr_, xr_);
|
||||
const float32x4_t b_ = vmulq_f32(wki_, xi_);
|
||||
const float32x4_t c_ = vmulq_f32(wkr_, xi_);
|
||||
const float32x4_t d_ = vmulq_f32(wki_, xr_);
|
||||
const float32x4_t yr_ = vaddq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120,
|
||||
const float32x4_t yi_ = vsubq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121,
|
||||
// Update 'a'.
|
||||
// a[j2 + 0] -= yr;
|
||||
// a[j2 + 1] -= yi;
|
||||
// a[k2 + 0] += yr;
|
||||
// a[k2 + 1] -= yi;
|
||||
// 126, 124, 122, 120,
|
||||
const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
|
||||
// 127, 125, 123, 121,
|
||||
const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
|
||||
// Shuffle in right order and store.
|
||||
// 2, 3, 4, 5, 6, 7, 8, 9,
|
||||
const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
|
||||
const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
|
||||
// 124, 125, 126, 127, 120, 121, 122, 123
|
||||
const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
|
||||
// 2, 4, 6, 8,
|
||||
a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
|
||||
// 3, 5, 7, 9,
|
||||
a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
|
||||
// 2, 3, 4, 5, 6, 7, 8, 9,
|
||||
vst2q_f32(&a[0 + j2], a_j2_p);
|
||||
|
||||
vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
|
||||
vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
|
||||
}
|
||||
|
||||
// Scalar code for the remaining items.
|
||||
for (; j2 < 64; j1 += 1, j2 += 2) {
|
||||
const int k2 = 128 - j2;
|
||||
const int k1 = 32 - j1;
|
||||
const float wkr = 0.5f - c[k1];
|
||||
const float wki = c[j1];
|
||||
const float xr = a[j2 + 0] - a[k2 + 0];
|
||||
const float xi = a[j2 + 1] + a[k2 + 1];
|
||||
const float yr = wkr * xr + wki * xi;
|
||||
const float yi = wkr * xi - wki * xr;
|
||||
a[j2 + 0] = a[j2 + 0] - yr;
|
||||
a[j2 + 1] = yi - a[j2 + 1];
|
||||
a[k2 + 0] = yr + a[k2 + 0];
|
||||
a[k2 + 1] = yi - a[k2 + 1];
|
||||
}
|
||||
a[65] = -a[65];
|
||||
}
|
||||
|
||||
void aec_rdft_init_neon(void) {
|
||||
cft1st_128 = cft1st_128_neon;
|
||||
cftmdl_128 = cftmdl_128_neon;
|
||||
rftfsub_128 = rftfsub_128_neon;
|
||||
rftbsub_128 = rftbsub_128_neon;
|
||||
}
|
||||
|
@ -8,172 +8,168 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "typedefs.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
||||
|
||||
#if defined(WEBRTC_USE_SSE2)
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include "aec_rdft.h"
|
||||
static const ALIGN16_BEG float ALIGN16_END
|
||||
k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
|
||||
|
||||
static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] =
|
||||
{-1.f, 1.f, -1.f, 1.f};
|
||||
|
||||
static void cft1st_128_SSE2(float *a) {
|
||||
static void cft1st_128_SSE2(float* a) {
|
||||
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
|
||||
int j, k2;
|
||||
|
||||
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
|
||||
__m128 a00v = _mm_loadu_ps(&a[j + 0]);
|
||||
__m128 a04v = _mm_loadu_ps(&a[j + 4]);
|
||||
__m128 a08v = _mm_loadu_ps(&a[j + 8]);
|
||||
__m128 a12v = _mm_loadu_ps(&a[j + 12]);
|
||||
__m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1 ,0));
|
||||
__m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3 ,2));
|
||||
__m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1 ,0));
|
||||
__m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3 ,2));
|
||||
__m128 a00v = _mm_loadu_ps(&a[j + 0]);
|
||||
__m128 a04v = _mm_loadu_ps(&a[j + 4]);
|
||||
__m128 a08v = _mm_loadu_ps(&a[j + 8]);
|
||||
__m128 a12v = _mm_loadu_ps(&a[j + 12]);
|
||||
__m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
__m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
|
||||
__m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
__m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
|
||||
|
||||
const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
|
||||
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
|
||||
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
|
||||
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
|
||||
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
|
||||
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
|
||||
__m128 x0v = _mm_add_ps(a01v, a23v);
|
||||
const __m128 x1v = _mm_sub_ps(a01v, a23v);
|
||||
const __m128 x2v = _mm_add_ps(a45v, a67v);
|
||||
const __m128 x3v = _mm_sub_ps(a45v, a67v);
|
||||
__m128 x0w;
|
||||
a01v = _mm_add_ps(x0v, x2v);
|
||||
x0v = _mm_sub_ps(x0v, x2v);
|
||||
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
|
||||
const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
|
||||
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
|
||||
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
|
||||
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
|
||||
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
|
||||
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
|
||||
__m128 x0v = _mm_add_ps(a01v, a23v);
|
||||
const __m128 x1v = _mm_sub_ps(a01v, a23v);
|
||||
const __m128 x2v = _mm_add_ps(a45v, a67v);
|
||||
const __m128 x3v = _mm_sub_ps(a45v, a67v);
|
||||
__m128 x0w;
|
||||
a01v = _mm_add_ps(x0v, x2v);
|
||||
x0v = _mm_sub_ps(x0v, x2v);
|
||||
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
{
|
||||
const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
|
||||
const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
|
||||
a45v = _mm_add_ps(a45_0v, a45_1v);
|
||||
a45v = _mm_add_ps(a45_0v, a45_1v);
|
||||
}
|
||||
{
|
||||
__m128 a23_0v, a23_1v;
|
||||
const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1));
|
||||
const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
|
||||
x0v = _mm_add_ps(x1v, x3s);
|
||||
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
|
||||
a23_0v = _mm_mul_ps(wk1rv, x0v);
|
||||
a23_1v = _mm_mul_ps(wk1iv, x0w);
|
||||
a23v = _mm_add_ps(a23_0v, a23_1v);
|
||||
__m128 a23_0v, a23_1v;
|
||||
const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
|
||||
x0v = _mm_add_ps(x1v, x3s);
|
||||
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
a23_0v = _mm_mul_ps(wk1rv, x0v);
|
||||
a23_1v = _mm_mul_ps(wk1iv, x0w);
|
||||
a23v = _mm_add_ps(a23_0v, a23_1v);
|
||||
|
||||
x0v = _mm_sub_ps(x1v, x3s);
|
||||
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
|
||||
x0v = _mm_sub_ps(x1v, x3s);
|
||||
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
}
|
||||
{
|
||||
const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
|
||||
const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
|
||||
a67v = _mm_add_ps(a67_0v, a67_1v);
|
||||
a67v = _mm_add_ps(a67_0v, a67_1v);
|
||||
}
|
||||
|
||||
a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1 ,0));
|
||||
a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1 ,0));
|
||||
a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3 ,2));
|
||||
a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3 ,2));
|
||||
_mm_storeu_ps(&a[j + 0], a00v);
|
||||
_mm_storeu_ps(&a[j + 4], a04v);
|
||||
_mm_storeu_ps(&a[j + 8], a08v);
|
||||
a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
|
||||
a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
|
||||
_mm_storeu_ps(&a[j + 0], a00v);
|
||||
_mm_storeu_ps(&a[j + 4], a04v);
|
||||
_mm_storeu_ps(&a[j + 8], a08v);
|
||||
_mm_storeu_ps(&a[j + 12], a12v);
|
||||
}
|
||||
}
|
||||
|
||||
static void cftmdl_128_SSE2(float *a) {
|
||||
static void cftmdl_128_SSE2(float* a) {
|
||||
const int l = 8;
|
||||
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
|
||||
int j0;
|
||||
|
||||
__m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
|
||||
for (j0 = 0; j0 < l; j0 += 2) {
|
||||
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
|
||||
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
|
||||
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
|
||||
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
|
||||
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
|
||||
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
|
||||
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
|
||||
_mm_castsi128_ps(a_32),
|
||||
_MM_SHUFFLE(1, 0, 1 ,0));
|
||||
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
|
||||
_mm_castsi128_ps(a_40),
|
||||
_MM_SHUFFLE(1, 0, 1 ,0));
|
||||
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
|
||||
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
|
||||
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
|
||||
_mm_castsi128_ps(a_32),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
|
||||
_mm_castsi128_ps(a_40),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
|
||||
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
|
||||
|
||||
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
|
||||
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
|
||||
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
|
||||
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
|
||||
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
|
||||
_mm_castsi128_ps(a_48),
|
||||
_MM_SHUFFLE(1, 0, 1 ,0));
|
||||
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
|
||||
_mm_castsi128_ps(a_56),
|
||||
_MM_SHUFFLE(1, 0, 1 ,0));
|
||||
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
|
||||
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
|
||||
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
|
||||
_mm_castsi128_ps(a_48),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
|
||||
_mm_castsi128_ps(a_56),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
|
||||
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
|
||||
|
||||
const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
|
||||
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
|
||||
_mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
|
||||
_MM_SHUFFLE(2, 3, 0, 1)));
|
||||
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
|
||||
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
|
||||
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
|
||||
const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
|
||||
_MM_SHUFFLE(2, 2, 2 ,2));
|
||||
const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
|
||||
_MM_SHUFFLE(3, 3, 3 ,3));
|
||||
const __m128 yy0 =
|
||||
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
const __m128 yy1 =
|
||||
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
|
||||
const __m128 yy3 = _mm_add_ps(yy0, yy2);
|
||||
const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 32],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx0),
|
||||
_MM_SHUFFLE(3, 2, 3, 2)));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 32],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 48],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx1),
|
||||
_MM_SHUFFLE(2, 3, 2, 3)));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 48],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
|
||||
a[j0 + 48] = -a[j0 + 48];
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 56],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(yy4),
|
||||
_MM_SHUFFLE(2, 3, 2, 3)));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 56],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
|
||||
}
|
||||
|
||||
{
|
||||
int k = 64;
|
||||
int k1 = 2;
|
||||
int k2 = 2 * k1;
|
||||
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]);
|
||||
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]);
|
||||
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]);
|
||||
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]);
|
||||
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]);
|
||||
wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]);
|
||||
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
|
||||
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
|
||||
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
|
||||
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
|
||||
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
|
||||
wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
|
||||
for (j0 = k; j0 < l + k; j0 += 2) {
|
||||
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
|
||||
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
|
||||
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
|
||||
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
|
||||
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
|
||||
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
|
||||
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
|
||||
_mm_castsi128_ps(a_32),
|
||||
_MM_SHUFFLE(1, 0, 1 ,0));
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
|
||||
_mm_castsi128_ps(a_40),
|
||||
_MM_SHUFFLE(1, 0, 1 ,0));
|
||||
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
|
||||
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
|
||||
|
||||
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
|
||||
@ -182,100 +178,102 @@ static void cftmdl_128_SSE2(float *a) {
|
||||
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
|
||||
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
|
||||
_mm_castsi128_ps(a_48),
|
||||
_MM_SHUFFLE(1, 0, 1 ,0));
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
|
||||
_mm_castsi128_ps(a_56),
|
||||
_MM_SHUFFLE(1, 0, 1 ,0));
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
|
||||
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
|
||||
|
||||
const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv);
|
||||
const __m128 xx3 = _mm_mul_ps(wk2iv,
|
||||
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
|
||||
_MM_SHUFFLE(2, 3, 0, 1))));
|
||||
const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
|
||||
const __m128 xx3 =
|
||||
_mm_mul_ps(wk2iv,
|
||||
_mm_castsi128_ps(_mm_shuffle_epi32(
|
||||
_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1))));
|
||||
const __m128 xx4 = _mm_add_ps(xx2, xx3);
|
||||
|
||||
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
|
||||
_mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
|
||||
_MM_SHUFFLE(2, 3, 0, 1)));
|
||||
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
|
||||
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
|
||||
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
|
||||
const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
|
||||
const __m128 xx11 = _mm_mul_ps(wk1iv,
|
||||
const __m128 xx11 = _mm_mul_ps(
|
||||
wk1iv,
|
||||
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
|
||||
_MM_SHUFFLE(2, 3, 0, 1))));
|
||||
const __m128 xx12 = _mm_add_ps(xx10, xx11);
|
||||
|
||||
const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
|
||||
const __m128 xx21 = _mm_mul_ps(wk3iv,
|
||||
const __m128 xx21 = _mm_mul_ps(
|
||||
wk3iv,
|
||||
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
|
||||
_MM_SHUFFLE(2, 3, 0, 1))));
|
||||
_MM_SHUFFLE(2, 3, 0, 1))));
|
||||
const __m128 xx22 = _mm_add_ps(xx20, xx21);
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 32],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx),
|
||||
_MM_SHUFFLE(3, 2, 3, 2)));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 32],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 48],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx4),
|
||||
_MM_SHUFFLE(3, 2, 3, 2)));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 48],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 40],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx12),
|
||||
_MM_SHUFFLE(3, 2, 3, 2)));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 40],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 56],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx22),
|
||||
_MM_SHUFFLE(3, 2, 3, 2)));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 56],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void rftfsub_128_SSE2(float *a) {
|
||||
const float *c = rdft_w + 32;
|
||||
static void rftfsub_128_SSE2(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2, k1, k2;
|
||||
float wkr, wki, xr, xi, yr, yi;
|
||||
|
||||
static const ALIGN16_BEG float ALIGN16_END k_half[4] =
|
||||
{0.5f, 0.5f, 0.5f, 0.5f};
|
||||
static const ALIGN16_BEG float ALIGN16_END
|
||||
k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
|
||||
const __m128 mm_half = _mm_load_ps(k_half);
|
||||
|
||||
// Vectorized code (four at once).
|
||||
// Note: commented number are indexes for the first iteration of the loop.
|
||||
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
|
||||
// Load 'wk'.
|
||||
const __m128 c_j1 = _mm_loadu_ps(&c[ j1]); // 1, 2, 3, 4,
|
||||
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
|
||||
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
|
||||
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
|
||||
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
|
||||
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
|
||||
const __m128 wkr_ =
|
||||
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
|
||||
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
|
||||
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
|
||||
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
|
||||
// Load and shuffle 'a'.
|
||||
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
|
||||
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
|
||||
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
|
||||
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
|
||||
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
|
||||
const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
|
||||
_MM_SHUFFLE(2, 0, 2 ,0)); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
|
||||
_MM_SHUFFLE(3, 1, 3 ,1)); // 3, 5, 7, 9,
|
||||
const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
|
||||
_MM_SHUFFLE(0, 2, 0 ,2)); // 126, 124, 122, 120,
|
||||
const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
|
||||
_MM_SHUFFLE(1, 3, 1 ,3)); // 127, 125, 123, 121,
|
||||
const __m128 a_j2_p0 = _mm_shuffle_ps(
|
||||
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1 = _mm_shuffle_ps(
|
||||
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
|
||||
const __m128 a_k2_p0 = _mm_shuffle_ps(
|
||||
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
|
||||
const __m128 a_k2_p1 = _mm_shuffle_ps(
|
||||
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
|
||||
// Calculate 'x'.
|
||||
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
|
||||
// 2-126, 4-124, 6-122, 8-120,
|
||||
// 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
|
||||
// 3-127, 5-125, 7-123, 9-121,
|
||||
// 3-127, 5-125, 7-123, 9-121,
|
||||
// Calculate product into 'y'.
|
||||
// yr = wkr * xr - wki * xi;
|
||||
// yi = wkr * xi + wki * xr;
|
||||
@ -283,12 +281,12 @@ static void rftfsub_128_SSE2(float *a) {
|
||||
const __m128 b_ = _mm_mul_ps(wki_, xi_);
|
||||
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
|
||||
const __m128 d_ = _mm_mul_ps(wki_, xr_);
|
||||
const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
|
||||
// Update 'a'.
|
||||
// a[j2 + 0] -= yr;
|
||||
// a[j2 + 1] -= yi;
|
||||
// a[k2 + 0] += yr;
|
||||
const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
|
||||
// Update 'a'.
|
||||
// a[j2 + 0] -= yr;
|
||||
// a[j2 + 1] -= yi;
|
||||
// a[k2 + 0] += yr;
|
||||
// a[k2 + 1] -= yi;
|
||||
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9,
|
||||
@ -296,26 +294,26 @@ static void rftfsub_128_SSE2(float *a) {
|
||||
const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121,
|
||||
// Shuffle in right order and store.
|
||||
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
|
||||
// 2, 3, 4, 5,
|
||||
// 2, 3, 4, 5,
|
||||
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
|
||||
// 6, 7, 8, 9,
|
||||
// 6, 7, 8, 9,
|
||||
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
|
||||
// 122, 123, 120, 121,
|
||||
// 122, 123, 120, 121,
|
||||
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
|
||||
// 126, 127, 124, 125,
|
||||
const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
|
||||
_MM_SHUFFLE(1, 0, 3 ,2)); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
|
||||
_MM_SHUFFLE(1, 0, 3 ,2)); // 124, 125, 126, 127,
|
||||
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
|
||||
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
|
||||
// 126, 127, 124, 125,
|
||||
const __m128 a_k2_0n = _mm_shuffle_ps(
|
||||
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4n = _mm_shuffle_ps(
|
||||
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
|
||||
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
|
||||
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
|
||||
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
|
||||
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
|
||||
}
|
||||
// Scalar code for the remaining items.
|
||||
for (; j2 < 64; j1 += 1, j2 += 2) {
|
||||
k2 = 128 - j2;
|
||||
k1 = 32 - j1;
|
||||
k1 = 32 - j1;
|
||||
wkr = 0.5f - c[k1];
|
||||
wki = c[j1];
|
||||
xr = a[j2 + 0] - a[k2 + 0];
|
||||
@ -329,13 +327,13 @@ static void rftfsub_128_SSE2(float *a) {
|
||||
}
|
||||
}
|
||||
|
||||
static void rftbsub_128_SSE2(float *a) {
|
||||
const float *c = rdft_w + 32;
|
||||
static void rftbsub_128_SSE2(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2, k1, k2;
|
||||
float wkr, wki, xr, xi, yr, yi;
|
||||
|
||||
static const ALIGN16_BEG float ALIGN16_END k_half[4] =
|
||||
{0.5f, 0.5f, 0.5f, 0.5f};
|
||||
static const ALIGN16_BEG float ALIGN16_END
|
||||
k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
|
||||
const __m128 mm_half = _mm_load_ps(k_half);
|
||||
|
||||
a[1] = -a[1];
|
||||
@ -343,30 +341,30 @@ static void rftbsub_128_SSE2(float *a) {
|
||||
// Note: commented number are indexes for the first iteration of the loop.
|
||||
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
|
||||
// Load 'wk'.
|
||||
const __m128 c_j1 = _mm_loadu_ps(&c[ j1]); // 1, 2, 3, 4,
|
||||
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
|
||||
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
|
||||
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
|
||||
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
|
||||
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
|
||||
const __m128 wkr_ =
|
||||
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
|
||||
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
|
||||
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
|
||||
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
|
||||
// Load and shuffle 'a'.
|
||||
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
|
||||
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
|
||||
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
|
||||
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
|
||||
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
|
||||
const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
|
||||
_MM_SHUFFLE(2, 0, 2 ,0)); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
|
||||
_MM_SHUFFLE(3, 1, 3 ,1)); // 3, 5, 7, 9,
|
||||
const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
|
||||
_MM_SHUFFLE(0, 2, 0 ,2)); // 126, 124, 122, 120,
|
||||
const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
|
||||
_MM_SHUFFLE(1, 3, 1 ,3)); // 127, 125, 123, 121,
|
||||
const __m128 a_j2_p0 = _mm_shuffle_ps(
|
||||
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1 = _mm_shuffle_ps(
|
||||
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
|
||||
const __m128 a_k2_p0 = _mm_shuffle_ps(
|
||||
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
|
||||
const __m128 a_k2_p1 = _mm_shuffle_ps(
|
||||
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
|
||||
// Calculate 'x'.
|
||||
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
|
||||
// 2-126, 4-124, 6-122, 8-120,
|
||||
// 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
|
||||
// 3-127, 5-125, 7-123, 9-121,
|
||||
// 3-127, 5-125, 7-123, 9-121,
|
||||
// Calculate product into 'y'.
|
||||
// yr = wkr * xr + wki * xi;
|
||||
// yi = wkr * xi - wki * xr;
|
||||
@ -374,12 +372,12 @@ static void rftbsub_128_SSE2(float *a) {
|
||||
const __m128 b_ = _mm_mul_ps(wki_, xi_);
|
||||
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
|
||||
const __m128 d_ = _mm_mul_ps(wki_, xr_);
|
||||
const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
|
||||
// Update 'a'.
|
||||
// a[j2 + 0] = a[j2 + 0] - yr;
|
||||
// a[j2 + 1] = yi - a[j2 + 1];
|
||||
// a[k2 + 0] = yr + a[k2 + 0];
|
||||
const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
|
||||
// Update 'a'.
|
||||
// a[j2 + 0] = a[j2 + 0] - yr;
|
||||
// a[j2 + 1] = yi - a[j2 + 1];
|
||||
// a[k2 + 0] = yr + a[k2 + 0];
|
||||
// a[k2 + 1] = yi - a[k2 + 1];
|
||||
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9,
|
||||
@ -387,26 +385,26 @@ static void rftbsub_128_SSE2(float *a) {
|
||||
const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121,
|
||||
// Shuffle in right order and store.
|
||||
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
|
||||
// 2, 3, 4, 5,
|
||||
// 2, 3, 4, 5,
|
||||
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
|
||||
// 6, 7, 8, 9,
|
||||
// 6, 7, 8, 9,
|
||||
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
|
||||
// 122, 123, 120, 121,
|
||||
// 122, 123, 120, 121,
|
||||
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
|
||||
// 126, 127, 124, 125,
|
||||
const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
|
||||
_MM_SHUFFLE(1, 0, 3 ,2)); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
|
||||
_MM_SHUFFLE(1, 0, 3 ,2)); // 124, 125, 126, 127,
|
||||
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
|
||||
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
|
||||
// 126, 127, 124, 125,
|
||||
const __m128 a_k2_0n = _mm_shuffle_ps(
|
||||
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4n = _mm_shuffle_ps(
|
||||
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
|
||||
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
|
||||
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
|
||||
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
|
||||
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
|
||||
}
|
||||
// Scalar code for the remaining items.
|
||||
for (; j2 < 64; j1 += 1, j2 += 2) {
|
||||
k2 = 128 - j2;
|
||||
k1 = 32 - j1;
|
||||
k1 = 32 - j1;
|
||||
wkr = 0.5f - c[k1];
|
||||
wki = c[j1];
|
||||
xr = a[j2 + 0] - a[k2 + 0];
|
||||
@ -427,5 +425,3 @@ void aec_rdft_init_sse2(void) {
|
||||
rftfsub_128 = rftfsub_128_SSE2;
|
||||
rftbsub_128 = rftbsub_128_SSE2;
|
||||
}
|
||||
|
||||
#endif // WEBRTC_USE_SS2
|
||||
|
209
webrtc/modules/audio_processing/aec/aec_resampler.c
Normal file
209
webrtc/modules/audio_processing/aec/aec_resampler.c
Normal file
@ -0,0 +1,209 @@
|
||||
/*
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for
|
||||
* clock skew by resampling the farend signal.
|
||||
*/
|
||||
|
||||
#include "webrtc/modules/audio_processing/aec/aec_resampler.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "webrtc/modules/audio_processing/aec/aec_core.h"
|
||||
|
||||
enum {
|
||||
kEstimateLengthFrames = 400
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
float buffer[kResamplerBufferSize];
|
||||
float position;
|
||||
|
||||
int deviceSampleRateHz;
|
||||
int skewData[kEstimateLengthFrames];
|
||||
int skewDataIndex;
|
||||
float skewEstimate;
|
||||
} AecResampler;
|
||||
|
||||
static int EstimateSkew(const int* rawSkew,
|
||||
int size,
|
||||
int absLimit,
|
||||
float* skewEst);
|
||||
|
||||
void* WebRtcAec_CreateResampler() {
|
||||
return malloc(sizeof(AecResampler));
|
||||
}
|
||||
|
||||
int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz) {
|
||||
AecResampler* obj = (AecResampler*)resampInst;
|
||||
memset(obj->buffer, 0, sizeof(obj->buffer));
|
||||
obj->position = 0.0;
|
||||
|
||||
obj->deviceSampleRateHz = deviceSampleRateHz;
|
||||
memset(obj->skewData, 0, sizeof(obj->skewData));
|
||||
obj->skewDataIndex = 0;
|
||||
obj->skewEstimate = 0.0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void WebRtcAec_FreeResampler(void* resampInst) {
|
||||
AecResampler* obj = (AecResampler*)resampInst;
|
||||
free(obj);
|
||||
}
|
||||
|
||||
void WebRtcAec_ResampleLinear(void* resampInst,
|
||||
const float* inspeech,
|
||||
size_t size,
|
||||
float skew,
|
||||
float* outspeech,
|
||||
size_t* size_out) {
|
||||
AecResampler* obj = (AecResampler*)resampInst;
|
||||
|
||||
float* y;
|
||||
float be, tnew;
|
||||
size_t tn, mm;
|
||||
|
||||
assert(size <= 2 * FRAME_LEN);
|
||||
assert(resampInst != NULL);
|
||||
assert(inspeech != NULL);
|
||||
assert(outspeech != NULL);
|
||||
assert(size_out != NULL);
|
||||
|
||||
// Add new frame data in lookahead
|
||||
memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
|
||||
inspeech,
|
||||
size * sizeof(inspeech[0]));
|
||||
|
||||
// Sample rate ratio
|
||||
be = 1 + skew;
|
||||
|
||||
// Loop over input frame
|
||||
mm = 0;
|
||||
y = &obj->buffer[FRAME_LEN]; // Point at current frame
|
||||
|
||||
tnew = be * mm + obj->position;
|
||||
tn = (size_t)tnew;
|
||||
|
||||
while (tn < size) {
|
||||
|
||||
// Interpolation
|
||||
outspeech[mm] = y[tn] + (tnew - tn) * (y[tn + 1] - y[tn]);
|
||||
mm++;
|
||||
|
||||
tnew = be * mm + obj->position;
|
||||
tn = (int)tnew;
|
||||
}
|
||||
|
||||
*size_out = mm;
|
||||
obj->position += (*size_out) * be - size;
|
||||
|
||||
// Shift buffer
|
||||
memmove(obj->buffer,
|
||||
&obj->buffer[size],
|
||||
(kResamplerBufferSize - size) * sizeof(obj->buffer[0]));
|
||||
}
|
||||
|
||||
int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst) {
|
||||
AecResampler* obj = (AecResampler*)resampInst;
|
||||
int err = 0;
|
||||
|
||||
if (obj->skewDataIndex < kEstimateLengthFrames) {
|
||||
obj->skewData[obj->skewDataIndex] = rawSkew;
|
||||
obj->skewDataIndex++;
|
||||
} else if (obj->skewDataIndex == kEstimateLengthFrames) {
|
||||
err = EstimateSkew(
|
||||
obj->skewData, kEstimateLengthFrames, obj->deviceSampleRateHz, skewEst);
|
||||
obj->skewEstimate = *skewEst;
|
||||
obj->skewDataIndex++;
|
||||
} else {
|
||||
*skewEst = obj->skewEstimate;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int EstimateSkew(const int* rawSkew,
|
||||
int size,
|
||||
int deviceSampleRateHz,
|
||||
float* skewEst) {
|
||||
const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
|
||||
const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
|
||||
int i = 0;
|
||||
int n = 0;
|
||||
float rawAvg = 0;
|
||||
float err = 0;
|
||||
float rawAbsDev = 0;
|
||||
int upperLimit = 0;
|
||||
int lowerLimit = 0;
|
||||
float cumSum = 0;
|
||||
float x = 0;
|
||||
float x2 = 0;
|
||||
float y = 0;
|
||||
float xy = 0;
|
||||
float xAvg = 0;
|
||||
float denom = 0;
|
||||
float skew = 0;
|
||||
|
||||
*skewEst = 0; // Set in case of error below.
|
||||
for (i = 0; i < size; i++) {
|
||||
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
|
||||
n++;
|
||||
rawAvg += rawSkew[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (n == 0) {
|
||||
return -1;
|
||||
}
|
||||
assert(n > 0);
|
||||
rawAvg /= n;
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
|
||||
err = rawSkew[i] - rawAvg;
|
||||
rawAbsDev += err >= 0 ? err : -err;
|
||||
}
|
||||
}
|
||||
assert(n > 0);
|
||||
rawAbsDev /= n;
|
||||
upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1); // +1 for ceiling.
|
||||
lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1); // -1 for floor.
|
||||
|
||||
n = 0;
|
||||
for (i = 0; i < size; i++) {
|
||||
if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
|
||||
(rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
|
||||
n++;
|
||||
cumSum += rawSkew[i];
|
||||
x += n;
|
||||
x2 += n * n;
|
||||
y += cumSum;
|
||||
xy += n * cumSum;
|
||||
}
|
||||
}
|
||||
|
||||
if (n == 0) {
|
||||
return -1;
|
||||
}
|
||||
assert(n > 0);
|
||||
xAvg = x / n;
|
||||
denom = x2 - xAvg * x;
|
||||
|
||||
if (denom != 0) {
|
||||
skew = (xy - xAvg * y) / denom;
|
||||
}
|
||||
|
||||
*skewEst = skew;
|
||||
return 0;
|
||||
}
|
39
webrtc/modules/audio_processing/aec/aec_resampler.h
Normal file
39
webrtc/modules/audio_processing/aec/aec_resampler.h
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
|
||||
|
||||
#include "webrtc/modules/audio_processing/aec/aec_core.h"
|
||||
|
||||
enum {
|
||||
kResamplingDelay = 1
|
||||
};
|
||||
enum {
|
||||
kResamplerBufferSize = FRAME_LEN * 4
|
||||
};
|
||||
|
||||
// Unless otherwise specified, functions return 0 on success and -1 on error.
|
||||
void* WebRtcAec_CreateResampler(); // Returns NULL on error.
|
||||
int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz);
|
||||
void WebRtcAec_FreeResampler(void* resampInst);
|
||||
|
||||
// Estimates skew from raw measurement.
|
||||
int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst);
|
||||
|
||||
// Resamples input using linear interpolation.
|
||||
void WebRtcAec_ResampleLinear(void* resampInst,
|
||||
const float* inspeech,
|
||||
size_t size,
|
||||
float skew,
|
||||
float* outspeech,
|
||||
size_t* size_out);
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,67 @@
|
||||
/*
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
|
||||
|
||||
#include "webrtc/common_audio/ring_buffer.h"
|
||||
#include "webrtc/modules/audio_processing/aec/aec_core.h"
|
||||
|
||||
typedef struct {
|
||||
int delayCtr;
|
||||
int sampFreq;
|
||||
int splitSampFreq;
|
||||
int scSampFreq;
|
||||
float sampFactor; // scSampRate / sampFreq
|
||||
short skewMode;
|
||||
int bufSizeStart;
|
||||
int knownDelay;
|
||||
int rate_factor;
|
||||
|
||||
short initFlag; // indicates if AEC has been initialized
|
||||
|
||||
// Variables used for averaging far end buffer size
|
||||
short counter;
|
||||
int sum;
|
||||
short firstVal;
|
||||
short checkBufSizeCtr;
|
||||
|
||||
// Variables used for delay shifts
|
||||
short msInSndCardBuf;
|
||||
short filtDelay; // Filtered delay estimate.
|
||||
int timeForDelayChange;
|
||||
int startup_phase;
|
||||
int checkBuffSize;
|
||||
short lastDelayDiff;
|
||||
|
||||
#ifdef WEBRTC_AEC_DEBUG_DUMP
|
||||
FILE* bufFile;
|
||||
FILE* delayFile;
|
||||
FILE* skewFile;
|
||||
#endif
|
||||
|
||||
// Structures
|
||||
void* resampler;
|
||||
|
||||
int skewFrCtr;
|
||||
int resample; // if the skew is small enough we don't resample
|
||||
int highSkewCtr;
|
||||
float skew;
|
||||
|
||||
RingBuffer* far_pre_buf; // Time domain far-end pre-buffer.
|
||||
|
||||
int lastError;
|
||||
|
||||
int farend_started;
|
||||
|
||||
AecCore* aec;
|
||||
} Aec;
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
|
245
webrtc/modules/audio_processing/aec/include/echo_cancellation.h
Normal file
245
webrtc/modules/audio_processing/aec/include/echo_cancellation.h
Normal file
@ -0,0 +1,245 @@
|
||||
/*
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include "webrtc/typedefs.h"
|
||||
|
||||
// Errors
|
||||
#define AEC_UNSPECIFIED_ERROR 12000
|
||||
#define AEC_UNSUPPORTED_FUNCTION_ERROR 12001
|
||||
#define AEC_UNINITIALIZED_ERROR 12002
|
||||
#define AEC_NULL_POINTER_ERROR 12003
|
||||
#define AEC_BAD_PARAMETER_ERROR 12004
|
||||
|
||||
// Warnings
|
||||
#define AEC_BAD_PARAMETER_WARNING 12050
|
||||
|
||||
enum {
|
||||
kAecNlpConservative = 0,
|
||||
kAecNlpModerate,
|
||||
kAecNlpAggressive
|
||||
};
|
||||
|
||||
enum {
|
||||
kAecFalse = 0,
|
||||
kAecTrue
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
int16_t nlpMode; // default kAecNlpModerate
|
||||
int16_t skewMode; // default kAecFalse
|
||||
int16_t metricsMode; // default kAecFalse
|
||||
int delay_logging; // default kAecFalse
|
||||
// float realSkew;
|
||||
} AecConfig;
|
||||
|
||||
typedef struct {
|
||||
int instant;
|
||||
int average;
|
||||
int max;
|
||||
int min;
|
||||
} AecLevel;
|
||||
|
||||
typedef struct {
|
||||
AecLevel rerl;
|
||||
AecLevel erl;
|
||||
AecLevel erle;
|
||||
AecLevel aNlp;
|
||||
} AecMetrics;
|
||||
|
||||
struct AecCore;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Allocates the memory needed by the AEC. The memory needs to be initialized
|
||||
* separately using the WebRtcAec_Init() function. Returns a pointer to the
|
||||
* object or NULL on error.
|
||||
*/
|
||||
void* WebRtcAec_Create();
|
||||
|
||||
/*
|
||||
* This function releases the memory allocated by WebRtcAec_Create().
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* aecInst Pointer to the AEC instance
|
||||
*/
|
||||
void WebRtcAec_Free(void* aecInst);
|
||||
|
||||
/*
|
||||
* Initializes an AEC instance.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* aecInst Pointer to the AEC instance
|
||||
* int32_t sampFreq Sampling frequency of data
|
||||
* int32_t scSampFreq Soundcard sampling frequency
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* int32_t return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
int32_t WebRtcAec_Init(void* aecInst, int32_t sampFreq, int32_t scSampFreq);
|
||||
|
||||
/*
|
||||
* Inserts an 80 or 160 sample block of data into the farend buffer.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* aecInst Pointer to the AEC instance
|
||||
* const float* farend In buffer containing one frame of
|
||||
* farend signal for L band
|
||||
* int16_t nrOfSamples Number of samples in farend buffer
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* int32_t return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
int32_t WebRtcAec_BufferFarend(void* aecInst,
|
||||
const float* farend,
|
||||
size_t nrOfSamples);
|
||||
|
||||
/*
|
||||
* Runs the echo canceller on an 80 or 160 sample blocks of data.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* aecInst Pointer to the AEC instance
|
||||
* float* const* nearend In buffer containing one frame of
|
||||
* nearend+echo signal for each band
|
||||
* int num_bands Number of bands in nearend buffer
|
||||
* int16_t nrOfSamples Number of samples in nearend buffer
|
||||
* int16_t msInSndCardBuf Delay estimate for sound card and
|
||||
* system buffers
|
||||
* int16_t skew Difference between number of samples played
|
||||
* and recorded at the soundcard (for clock skew
|
||||
* compensation)
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* float* const* out Out buffer, one frame of processed nearend
|
||||
* for each band
|
||||
* int32_t return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
int32_t WebRtcAec_Process(void* aecInst,
|
||||
const float* const* nearend,
|
||||
size_t num_bands,
|
||||
float* const* out,
|
||||
size_t nrOfSamples,
|
||||
int16_t msInSndCardBuf,
|
||||
int32_t skew);
|
||||
|
||||
/*
|
||||
* This function enables the user to set certain parameters on-the-fly.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* handle Pointer to the AEC instance
|
||||
* AecConfig config Config instance that contains all
|
||||
* properties to be set
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* int return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
int WebRtcAec_set_config(void* handle, AecConfig config);
|
||||
|
||||
/*
|
||||
* Gets the current echo status of the nearend signal.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* handle Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* int* status 0: Almost certainly nearend single-talk
|
||||
* 1: Might not be neared single-talk
|
||||
* int return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
int WebRtcAec_get_echo_status(void* handle, int* status);
|
||||
|
||||
/*
|
||||
* Gets the current echo metrics for the session.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* handle Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* AecMetrics* metrics Struct which will be filled out with the
|
||||
* current echo metrics.
|
||||
* int return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics);
|
||||
|
||||
/*
|
||||
* Gets the current delay metrics for the session.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* handle Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* int* median Delay median value.
|
||||
* int* std Delay standard deviation.
|
||||
* float* fraction_poor_delays Fraction of the delay estimates that may
|
||||
* cause the AEC to perform poorly.
|
||||
*
|
||||
* int return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
int WebRtcAec_GetDelayMetrics(void* handle,
|
||||
int* median,
|
||||
int* std,
|
||||
float* fraction_poor_delays);
|
||||
|
||||
/*
|
||||
* Gets the last error code.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* aecInst Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* int32_t return 11000-11100: error code
|
||||
*/
|
||||
int32_t WebRtcAec_get_error_code(void* aecInst);
|
||||
|
||||
// Returns a pointer to the low level AEC handle.
|
||||
//
|
||||
// Input:
|
||||
// - handle : Pointer to the AEC instance.
|
||||
//
|
||||
// Return value:
|
||||
// - AecCore pointer : NULL for error.
|
||||
//
|
||||
struct AecCore* WebRtcAec_aec_core(void* handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
|
@ -1,278 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_
|
||||
|
||||
#include "typedefs.h"
|
||||
|
||||
// Errors
|
||||
#define AEC_UNSPECIFIED_ERROR 12000
|
||||
#define AEC_UNSUPPORTED_FUNCTION_ERROR 12001
|
||||
#define AEC_UNINITIALIZED_ERROR 12002
|
||||
#define AEC_NULL_POINTER_ERROR 12003
|
||||
#define AEC_BAD_PARAMETER_ERROR 12004
|
||||
|
||||
// Warnings
|
||||
#define AEC_BAD_PARAMETER_WARNING 12050
|
||||
|
||||
enum {
|
||||
kAecNlpConservative = 0,
|
||||
kAecNlpModerate,
|
||||
kAecNlpAggressive
|
||||
};
|
||||
|
||||
enum {
|
||||
kAecFalse = 0,
|
||||
kAecTrue
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
WebRtc_Word16 nlpMode; // default kAecNlpModerate
|
||||
WebRtc_Word16 skewMode; // default kAecFalse
|
||||
WebRtc_Word16 metricsMode; // default kAecFalse
|
||||
int delay_logging; // default kAecFalse
|
||||
//float realSkew;
|
||||
} AecConfig;
|
||||
|
||||
typedef struct {
|
||||
WebRtc_Word16 instant;
|
||||
WebRtc_Word16 average;
|
||||
WebRtc_Word16 max;
|
||||
WebRtc_Word16 min;
|
||||
} AecLevel;
|
||||
|
||||
typedef struct {
|
||||
AecLevel rerl;
|
||||
AecLevel erl;
|
||||
AecLevel erle;
|
||||
AecLevel aNlp;
|
||||
} AecMetrics;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Allocates the memory needed by the AEC. The memory needs to be initialized
|
||||
* separately using the WebRtcAec_Init() function.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void **aecInst Pointer to the AEC instance to be created
|
||||
* and initialized
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_Create(void **aecInst);
|
||||
|
||||
/*
|
||||
* This function releases the memory allocated by WebRtcAec_Create().
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void *aecInst Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_Free(void *aecInst);
|
||||
|
||||
/*
|
||||
* Initializes an AEC instance.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void *aecInst Pointer to the AEC instance
|
||||
* WebRtc_Word32 sampFreq Sampling frequency of data
|
||||
* WebRtc_Word32 scSampFreq Soundcard sampling frequency
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_Init(void *aecInst,
|
||||
WebRtc_Word32 sampFreq,
|
||||
WebRtc_Word32 scSampFreq);
|
||||
|
||||
/*
|
||||
* Inserts an 80 or 160 sample block of data into the farend buffer.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void *aecInst Pointer to the AEC instance
|
||||
* WebRtc_Word16 *farend In buffer containing one frame of
|
||||
* farend signal for L band
|
||||
* WebRtc_Word16 nrOfSamples Number of samples in farend buffer
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_BufferFarend(void *aecInst,
|
||||
const WebRtc_Word16 *farend,
|
||||
WebRtc_Word16 nrOfSamples);
|
||||
|
||||
/*
|
||||
* Runs the echo canceller on an 80 or 160 sample blocks of data.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void *aecInst Pointer to the AEC instance
|
||||
* WebRtc_Word16 *nearend In buffer containing one frame of
|
||||
* nearend+echo signal for L band
|
||||
* WebRtc_Word16 *nearendH In buffer containing one frame of
|
||||
* nearend+echo signal for H band
|
||||
* WebRtc_Word16 nrOfSamples Number of samples in nearend buffer
|
||||
* WebRtc_Word16 msInSndCardBuf Delay estimate for sound card and
|
||||
* system buffers
|
||||
* WebRtc_Word16 skew Difference between number of samples played
|
||||
* and recorded at the soundcard (for clock skew
|
||||
* compensation)
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* WebRtc_Word16 *out Out buffer, one frame of processed nearend
|
||||
* for L band
|
||||
* WebRtc_Word16 *outH Out buffer, one frame of processed nearend
|
||||
* for H band
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_Process(void *aecInst,
|
||||
const WebRtc_Word16 *nearend,
|
||||
const WebRtc_Word16 *nearendH,
|
||||
WebRtc_Word16 *out,
|
||||
WebRtc_Word16 *outH,
|
||||
WebRtc_Word16 nrOfSamples,
|
||||
WebRtc_Word16 msInSndCardBuf,
|
||||
WebRtc_Word32 skew);
|
||||
|
||||
/*
|
||||
* This function enables the user to set certain parameters on-the-fly.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void *aecInst Pointer to the AEC instance
|
||||
* AecConfig config Config instance that contains all
|
||||
* properties to be set
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_set_config(void *aecInst, AecConfig config);
|
||||
|
||||
/*
|
||||
* Gets the on-the-fly paramters.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void *aecInst Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* AecConfig *config Pointer to the config instance that
|
||||
* all properties will be written to
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_get_config(void *aecInst, AecConfig *config);
|
||||
|
||||
/*
|
||||
* Gets the current echo status of the nearend signal.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void *aecInst Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* WebRtc_Word16 *status 0: Almost certainly nearend single-talk
|
||||
* 1: Might not be neared single-talk
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_get_echo_status(void *aecInst, WebRtc_Word16 *status);
|
||||
|
||||
/*
|
||||
* Gets the current echo metrics for the session.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void *aecInst Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* AecMetrics *metrics Struct which will be filled out with the
|
||||
* current echo metrics.
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_GetMetrics(void *aecInst, AecMetrics *metrics);
|
||||
|
||||
/*
|
||||
* Gets the current delay metrics for the session.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void* handle Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* int* median Delay median value.
|
||||
* int* std Delay standard deviation.
|
||||
*
|
||||
* int return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
int WebRtcAec_GetDelayMetrics(void* handle, int* median, int* std);
|
||||
|
||||
/*
|
||||
* Gets the last error code.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* void *aecInst Pointer to the AEC instance
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* WebRtc_Word32 return 11000-11100: error code
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_get_error_code(void *aecInst);
|
||||
|
||||
/*
|
||||
* Gets a version string.
|
||||
*
|
||||
* Inputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* char *versionStr Pointer to a string array
|
||||
* WebRtc_Word16 len The maximum length of the string
|
||||
*
|
||||
* Outputs Description
|
||||
* -------------------------------------------------------------------
|
||||
* WebRtc_Word8 *versionStr Pointer to a string array
|
||||
* WebRtc_Word32 return 0: OK
|
||||
* -1: error
|
||||
*/
|
||||
WebRtc_Word32 WebRtcAec_get_version(WebRtc_Word8 *versionStr, WebRtc_Word16 len);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif /* WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_ */
|
@ -1,233 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for clock
|
||||
* skew by resampling the farend signal.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "resampler.h"
|
||||
#include "aec_core.h"
|
||||
|
||||
enum { kFrameBufferSize = FRAME_LEN * 4 };
|
||||
enum { kEstimateLengthFrames = 400 };
|
||||
|
||||
typedef struct {
|
||||
short buffer[kFrameBufferSize];
|
||||
float position;
|
||||
|
||||
int deviceSampleRateHz;
|
||||
int skewData[kEstimateLengthFrames];
|
||||
int skewDataIndex;
|
||||
float skewEstimate;
|
||||
} resampler_t;
|
||||
|
||||
static int EstimateSkew(const int* rawSkew,
|
||||
int size,
|
||||
int absLimit,
|
||||
float *skewEst);
|
||||
|
||||
int WebRtcAec_CreateResampler(void **resampInst)
|
||||
{
|
||||
resampler_t *obj = malloc(sizeof(resampler_t));
|
||||
*resampInst = obj;
|
||||
if (obj == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz)
|
||||
{
|
||||
resampler_t *obj = (resampler_t*) resampInst;
|
||||
memset(obj->buffer, 0, sizeof(obj->buffer));
|
||||
obj->position = 0.0;
|
||||
|
||||
obj->deviceSampleRateHz = deviceSampleRateHz;
|
||||
memset(obj->skewData, 0, sizeof(obj->skewData));
|
||||
obj->skewDataIndex = 0;
|
||||
obj->skewEstimate = 0.0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int WebRtcAec_FreeResampler(void *resampInst)
|
||||
{
|
||||
resampler_t *obj = (resampler_t*) resampInst;
|
||||
free(obj);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int WebRtcAec_ResampleLinear(void *resampInst,
|
||||
const short *inspeech,
|
||||
int size,
|
||||
float skew,
|
||||
short *outspeech)
|
||||
{
|
||||
resampler_t *obj = (resampler_t*) resampInst;
|
||||
|
||||
short *y;
|
||||
float be, tnew, interp;
|
||||
int tn, outsize, mm;
|
||||
|
||||
if (size < 0 || size > 2 * FRAME_LEN) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add new frame data in lookahead
|
||||
memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
|
||||
inspeech,
|
||||
size * sizeof(short));
|
||||
|
||||
// Sample rate ratio
|
||||
be = 1 + skew;
|
||||
|
||||
// Loop over input frame
|
||||
mm = 0;
|
||||
y = &obj->buffer[FRAME_LEN]; // Point at current frame
|
||||
|
||||
tnew = be * mm + obj->position;
|
||||
tn = (int) tnew;
|
||||
|
||||
while (tn < size) {
|
||||
|
||||
// Interpolation
|
||||
interp = y[tn] + (tnew - tn) * (y[tn+1] - y[tn]);
|
||||
|
||||
if (interp > 32767) {
|
||||
interp = 32767;
|
||||
}
|
||||
else if (interp < -32768) {
|
||||
interp = -32768;
|
||||
}
|
||||
|
||||
outspeech[mm] = (short) interp;
|
||||
mm++;
|
||||
|
||||
tnew = be * mm + obj->position;
|
||||
tn = (int) tnew;
|
||||
}
|
||||
|
||||
outsize = mm;
|
||||
obj->position += outsize * be - size;
|
||||
|
||||
// Shift buffer
|
||||
memmove(obj->buffer,
|
||||
&obj->buffer[size],
|
||||
(kFrameBufferSize - size) * sizeof(short));
|
||||
|
||||
return outsize;
|
||||
}
|
||||
|
||||
int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst)
|
||||
{
|
||||
resampler_t *obj = (resampler_t*)resampInst;
|
||||
int err = 0;
|
||||
|
||||
if (obj->skewDataIndex < kEstimateLengthFrames) {
|
||||
obj->skewData[obj->skewDataIndex] = rawSkew;
|
||||
obj->skewDataIndex++;
|
||||
}
|
||||
else if (obj->skewDataIndex == kEstimateLengthFrames) {
|
||||
err = EstimateSkew(obj->skewData,
|
||||
kEstimateLengthFrames,
|
||||
obj->deviceSampleRateHz,
|
||||
skewEst);
|
||||
obj->skewEstimate = *skewEst;
|
||||
obj->skewDataIndex++;
|
||||
}
|
||||
else {
|
||||
*skewEst = obj->skewEstimate;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int EstimateSkew(const int* rawSkew,
|
||||
int size,
|
||||
int deviceSampleRateHz,
|
||||
float *skewEst)
|
||||
{
|
||||
const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
|
||||
const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
|
||||
int i = 0;
|
||||
int n = 0;
|
||||
float rawAvg = 0;
|
||||
float err = 0;
|
||||
float rawAbsDev = 0;
|
||||
int upperLimit = 0;
|
||||
int lowerLimit = 0;
|
||||
float cumSum = 0;
|
||||
float x = 0;
|
||||
float x2 = 0;
|
||||
float y = 0;
|
||||
float xy = 0;
|
||||
float xAvg = 0;
|
||||
float denom = 0;
|
||||
float skew = 0;
|
||||
|
||||
*skewEst = 0; // Set in case of error below.
|
||||
for (i = 0; i < size; i++) {
|
||||
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
|
||||
n++;
|
||||
rawAvg += rawSkew[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (n == 0) {
|
||||
return -1;
|
||||
}
|
||||
assert(n > 0);
|
||||
rawAvg /= n;
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
|
||||
err = rawSkew[i] - rawAvg;
|
||||
rawAbsDev += err >= 0 ? err : -err;
|
||||
}
|
||||
}
|
||||
assert(n > 0);
|
||||
rawAbsDev /= n;
|
||||
upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1); // +1 for ceiling.
|
||||
lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1); // -1 for floor.
|
||||
|
||||
n = 0;
|
||||
for (i = 0; i < size; i++) {
|
||||
if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
|
||||
(rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
|
||||
n++;
|
||||
cumSum += rawSkew[i];
|
||||
x += n;
|
||||
x2 += n*n;
|
||||
y += cumSum;
|
||||
xy += n * cumSum;
|
||||
}
|
||||
}
|
||||
|
||||
if (n == 0) {
|
||||
return -1;
|
||||
}
|
||||
assert(n > 0);
|
||||
xAvg = x / n;
|
||||
denom = x2 - xAvg*x;
|
||||
|
||||
if (denom != 0) {
|
||||
skew = (xy - xAvg*y) / denom;
|
||||
}
|
||||
|
||||
*skewEst = skew;
|
||||
return 0;
|
||||
}
|
@ -1,32 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_
|
||||
|
||||
enum { kResamplingDelay = 1 };
|
||||
|
||||
// Unless otherwise specified, functions return 0 on success and -1 on error
|
||||
int WebRtcAec_CreateResampler(void **resampInst);
|
||||
int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz);
|
||||
int WebRtcAec_FreeResampler(void *resampInst);
|
||||
|
||||
// Estimates skew from raw measurement.
|
||||
int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst);
|
||||
|
||||
// Resamples input using linear interpolation.
|
||||
// Returns size of resampled array.
|
||||
int WebRtcAec_ResampleLinear(void *resampInst,
|
||||
const short *inspeech,
|
||||
int size,
|
||||
float skew,
|
||||
short *outspeech);
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_
|
Reference in New Issue
Block a user