Update audio_processing module

Corresponds to upstream commit 524e9b043e7e86fd72353b987c9d5f6a1ebf83e1

Update notes:

 * Pull in third party license file

 * Replace .gypi files with BUILD.gn to keep track of what changes
   upstream

 * Bunch of new filse pulled in as dependencies

 * Won't build yet due to changes needed on top of these
This commit is contained in:
Arun Raghavan
2015-10-13 17:25:22 +05:30
parent 5ae7a5d6cd
commit 753eada3aa
324 changed files with 52533 additions and 16117 deletions

View File

@ -1,16 +0,0 @@
noinst_LTLIBRARIES = libaec.la
libaec_la_SOURCES = interface/echo_cancellation.h \
echo_cancellation.c \
aec_core.h \
aec_core.c \
aec_core_sse2.c \
aec_rdft.h \
aec_rdft.c \
aec_rdft_sse2.c \
resampler.h \
resampler.c
libaec_la_CFLAGS = $(AM_CFLAGS) $(COMMON_CFLAGS) \
-I$(top_srcdir)/src/common_audio/signal_processing_library/main/interface \
-I$(top_srcdir)/src/system_wrappers/interface \
-I$(top_srcdir)/src/modules/audio_processing/utility

View File

@ -1,40 +0,0 @@
# Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
{
'targets': [
{
'target_name': 'aec',
'type': '<(library)',
'dependencies': [
'<(webrtc_root)/common_audio/common_audio.gyp:spl',
'apm_util'
],
'include_dirs': [
'interface',
],
'direct_dependent_settings': {
'include_dirs': [
'interface',
],
},
'sources': [
'interface/echo_cancellation.h',
'echo_cancellation.c',
'aec_core.h',
'aec_core.c',
'aec_core_sse2.c',
'aec_rdft.h',
'aec_rdft.c',
'aec_rdft_sse2.c',
'resampler.h',
'resampler.c',
],
},
],
}

View File

@ -0,0 +1,32 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
#include "webrtc/typedefs.h"
#ifdef _MSC_VER /* visual c++ */
#define ALIGN16_BEG __declspec(align(16))
#define ALIGN16_END
#else /* gcc or icc */
#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#endif
extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_sqrtHanning[65];
extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_weightCurve[65];
extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_overDriveCurve[65];
extern const float WebRtcAec_kExtendedSmoothingCoefficients[2][2];
extern const float WebRtcAec_kNormalSmoothingCoefficients[2][2];
extern const float WebRtcAec_kMinFarendPSD;
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -12,29 +12,18 @@
* Specifies the interface for the AEC core.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
#include <stdio.h>
#include <stddef.h>
#include "signal_processing_library.h"
#include "typedefs.h"
//#define AEC_DEBUG // for recording files
#include "webrtc/typedefs.h"
#define FRAME_LEN 80
#define PART_LEN 64 // Length of partition
#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
#define PART_LEN2 (PART_LEN * 2) // Length of partition * 2
#define NR_PART 12 // Number of partitions
#define FILT_LEN (PART_LEN * NR_PART) // Filter length
#define FILT_LEN2 (FILT_LEN * 2) // Double filter length
#define FAR_BUF_LEN (FILT_LEN2 * 2)
#define PREF_BAND_SIZE 24
#define BLOCKL_MAX FRAME_LEN
// Maximum delay in fixed point delay estimator, used for logging
enum {kMaxDelay = 100};
#define PART_LEN 64 // Length of partition
#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
#define PART_LEN2 (PART_LEN * 2) // Length of partition * 2
#define NUM_HIGH_BANDS_MAX 2 // Max number of high bands
typedef float complex_t[2];
// For performance reasons, some arrays of complex numbers are replaced by twice
@ -46,136 +35,95 @@ typedef float complex_t[2];
// compile time.
// Metrics
enum {offsetLevel = -100};
enum {
kOffsetLevel = -100
};
typedef struct {
float sfrsum;
int sfrcounter;
float framelevel;
float frsum;
int frcounter;
float minlevel;
float averagelevel;
} power_level_t;
typedef struct Stats {
float instant;
float average;
float min;
float max;
float sum;
float hisum;
float himean;
int counter;
int hicounter;
} Stats;
typedef struct {
float instant;
float average;
float min;
float max;
float sum;
float hisum;
float himean;
int counter;
int hicounter;
} stats_t;
typedef struct AecCore AecCore;
typedef struct {
int farBufWritePos, farBufReadPos;
int knownDelay;
int inSamples, outSamples;
int delayEstCtr;
void *farFrBuf, *nearFrBuf, *outFrBuf;
void *nearFrBufH;
void *outFrBufH;
float xBuf[PART_LEN2]; // farend
float dBuf[PART_LEN2]; // nearend
float eBuf[PART_LEN2]; // error
float dBufH[PART_LEN2]; // nearend
float xPow[PART_LEN1];
float dPow[PART_LEN1];
float dMinPow[PART_LEN1];
float dInitMinPow[PART_LEN1];
float *noisePow;
float xfBuf[2][NR_PART * PART_LEN1]; // farend fft buffer
float wfBuf[2][NR_PART * PART_LEN1]; // filter fft
complex_t sde[PART_LEN1]; // cross-psd of nearend and error
complex_t sxd[PART_LEN1]; // cross-psd of farend and nearend
complex_t xfwBuf[NR_PART * PART_LEN1]; // farend windowed fft buffer
float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1]; // far, near and error psd
float hNs[PART_LEN1];
float hNlFbMin, hNlFbLocalMin;
float hNlXdAvgMin;
int hNlNewMin, hNlMinCtr;
float overDrive, overDriveSm;
float targetSupp, minOverDrive;
float outBuf[PART_LEN];
int delayIdx;
short stNearState, echoState;
short divergeState;
int xfBufBlockPos;
short farBuf[FILT_LEN2 * 2];
short mult; // sampling frequency multiple
int sampFreq;
WebRtc_UWord32 seed;
float mu; // stepsize
float errThresh; // error threshold
int noiseEstCtr;
power_level_t farlevel;
power_level_t nearlevel;
power_level_t linoutlevel;
power_level_t nlpoutlevel;
int metricsMode;
int stateCounter;
stats_t erl;
stats_t erle;
stats_t aNlp;
stats_t rerl;
// Quantities to control H band scaling for SWB input
int freq_avg_ic; //initial bin for averaging nlp gain
int flag_Hband_cn; //for comfort noise
float cn_scale_Hband; //scale for comfort noise in H band
int delay_histogram[kMaxDelay];
int delay_logging_enabled;
void* delay_estimator;
#ifdef AEC_DEBUG
FILE *farFile;
FILE *nearFile;
FILE *outFile;
FILE *outLpFile;
#endif
} aec_t;
typedef void (*WebRtcAec_FilterFar_t)(aec_t *aec, float yf[2][PART_LEN1]);
extern WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
typedef void (*WebRtcAec_ScaleErrorSignal_t)(aec_t *aec, float ef[2][PART_LEN1]);
extern WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
typedef void (*WebRtcAec_FilterAdaptation_t)
(aec_t *aec, float *fft, float ef[2][PART_LEN1]);
extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
typedef void (*WebRtcAec_OverdriveAndSuppress_t)
(aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]);
extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
int WebRtcAec_CreateAec(aec_t **aec);
int WebRtcAec_FreeAec(aec_t *aec);
int WebRtcAec_InitAec(aec_t *aec, int sampFreq);
AecCore* WebRtcAec_CreateAec(); // Returns NULL on error.
void WebRtcAec_FreeAec(AecCore* aec);
int WebRtcAec_InitAec(AecCore* aec, int sampFreq);
void WebRtcAec_InitAec_SSE2(void);
#if defined(MIPS_FPU_LE)
void WebRtcAec_InitAec_mips(void);
#endif
#if defined(WEBRTC_DETECT_NEON) || defined(WEBRTC_HAS_NEON)
void WebRtcAec_InitAec_neon(void);
#endif
void WebRtcAec_InitMetrics(aec_t *aec);
void WebRtcAec_ProcessFrame(aec_t *aec, const short *farend,
const short *nearend, const short *nearendH,
short *out, short *outH,
int knownDelay);
void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend);
void WebRtcAec_ProcessFrames(AecCore* aec,
const float* const* nearend,
size_t num_bands,
size_t num_samples,
int knownDelay,
float* const* out);
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
// A helper function to call WebRtc_MoveReadPtr() for all far-end buffers.
// Returns the number of elements moved, and adjusts |system_delay| by the
// corresponding amount in ms.
int WebRtcAec_MoveFarReadPtr(AecCore* aec, int elements);
// Calculates the median, standard deviation and amount of poor values among the
// delay estimates aggregated up to the first call to the function. After that
// first call the metrics are aggregated and updated every second. With poor
// values we mean values that most likely will cause the AEC to perform poorly.
// TODO(bjornv): Consider changing tests and tools to handle constant
// constant aggregation window throughout the session instead.
int WebRtcAec_GetDelayMetricsCore(AecCore* self, int* median, int* std,
float* fraction_poor_delays);
// Returns the echo state (1: echo, 0: no echo).
int WebRtcAec_echo_state(AecCore* self);
// Gets statistics of the echo metrics ERL, ERLE, A_NLP.
void WebRtcAec_GetEchoStats(AecCore* self,
Stats* erl,
Stats* erle,
Stats* a_nlp);
#ifdef WEBRTC_AEC_DEBUG_DUMP
void* WebRtcAec_far_time_buf(AecCore* self);
#endif
// Sets local configuration modes.
void WebRtcAec_SetConfigCore(AecCore* self,
int nlp_mode,
int metrics_mode,
int delay_logging);
// Non-zero enables, zero disables.
void WebRtcAec_enable_delay_agnostic(AecCore* self, int enable);
// Returns non-zero if delay agnostic (i.e., signal based delay estimation) is
// enabled and zero if disabled.
int WebRtcAec_delay_agnostic_enabled(AecCore* self);
// Enables or disables extended filter mode. Non-zero enables, zero disables.
void WebRtcAec_enable_extended_filter(AecCore* self, int enable);
// Returns non-zero if extended filter mode is enabled and zero if disabled.
int WebRtcAec_extended_filter_enabled(AecCore* self);
// Returns the current |system_delay|, i.e., the buffered difference between
// far-end and near-end.
int WebRtcAec_system_delay(AecCore* self);
// Sets the |system_delay| to |value|. Note that if the value is changed
// improperly, there can be a performance regression. So it should be used with
// care.
void WebRtcAec_SetSystemDelay(AecCore* self, int delay);
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_

View File

@ -0,0 +1,202 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
#include "webrtc/common_audio/ring_buffer.h"
#include "webrtc/common_audio/wav_file.h"
#include "webrtc/modules/audio_processing/aec/aec_common.h"
#include "webrtc/modules/audio_processing/aec/aec_core.h"
#include "webrtc/typedefs.h"
// Number of partitions for the extended filter mode. The first one is an enum
// to be used in array declarations, as it represents the maximum filter length.
enum {
kExtendedNumPartitions = 32
};
static const int kNormalNumPartitions = 12;
// Delay estimator constants, used for logging and delay compensation if
// if reported delays are disabled.
enum {
kLookaheadBlocks = 15
};
enum {
// 500 ms for 16 kHz which is equivalent with the limit of reported delays.
kHistorySizeBlocks = 125
};
// Extended filter adaptation parameters.
// TODO(ajm): No narrowband tuning yet.
static const float kExtendedMu = 0.4f;
static const float kExtendedErrorThreshold = 1.0e-6f;
typedef struct PowerLevel {
float sfrsum;
int sfrcounter;
float framelevel;
float frsum;
int frcounter;
float minlevel;
float averagelevel;
} PowerLevel;
struct AecCore {
int farBufWritePos, farBufReadPos;
int knownDelay;
int inSamples, outSamples;
int delayEstCtr;
RingBuffer* nearFrBuf;
RingBuffer* outFrBuf;
RingBuffer* nearFrBufH[NUM_HIGH_BANDS_MAX];
RingBuffer* outFrBufH[NUM_HIGH_BANDS_MAX];
float dBuf[PART_LEN2]; // nearend
float eBuf[PART_LEN2]; // error
float dBufH[NUM_HIGH_BANDS_MAX][PART_LEN2]; // nearend
float xPow[PART_LEN1];
float dPow[PART_LEN1];
float dMinPow[PART_LEN1];
float dInitMinPow[PART_LEN1];
float* noisePow;
float xfBuf[2][kExtendedNumPartitions * PART_LEN1]; // farend fft buffer
float wfBuf[2][kExtendedNumPartitions * PART_LEN1]; // filter fft
complex_t sde[PART_LEN1]; // cross-psd of nearend and error
complex_t sxd[PART_LEN1]; // cross-psd of farend and nearend
// Farend windowed fft buffer.
complex_t xfwBuf[kExtendedNumPartitions * PART_LEN1];
float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1]; // far, near, error psd
float hNs[PART_LEN1];
float hNlFbMin, hNlFbLocalMin;
float hNlXdAvgMin;
int hNlNewMin, hNlMinCtr;
float overDrive, overDriveSm;
int nlp_mode;
float outBuf[PART_LEN];
int delayIdx;
short stNearState, echoState;
short divergeState;
int xfBufBlockPos;
RingBuffer* far_buf;
RingBuffer* far_buf_windowed;
int system_delay; // Current system delay buffered in AEC.
int mult; // sampling frequency multiple
int sampFreq;
size_t num_bands;
uint32_t seed;
float normal_mu; // stepsize
float normal_error_threshold; // error threshold
int noiseEstCtr;
PowerLevel farlevel;
PowerLevel nearlevel;
PowerLevel linoutlevel;
PowerLevel nlpoutlevel;
int metricsMode;
int stateCounter;
Stats erl;
Stats erle;
Stats aNlp;
Stats rerl;
// Quantities to control H band scaling for SWB input
int freq_avg_ic; // initial bin for averaging nlp gain
int flag_Hband_cn; // for comfort noise
float cn_scale_Hband; // scale for comfort noise in H band
int delay_metrics_delivered;
int delay_histogram[kHistorySizeBlocks];
int num_delay_values;
int delay_median;
int delay_std;
float fraction_poor_delays;
int delay_logging_enabled;
void* delay_estimator_farend;
void* delay_estimator;
// Variables associated with delay correction through signal based delay
// estimation feedback.
int signal_delay_correction;
int previous_delay;
int delay_correction_count;
int shift_offset;
float delay_quality_threshold;
int frame_count;
// 0 = delay agnostic mode (signal based delay correction) disabled.
// Otherwise enabled.
int delay_agnostic_enabled;
// 1 = extended filter mode enabled, 0 = disabled.
int extended_filter_enabled;
// Runtime selection of number of filter partitions.
int num_partitions;
#ifdef WEBRTC_AEC_DEBUG_DUMP
// Sequence number of this AEC instance, so that different instances can
// choose different dump file names.
int instance_index;
// Number of times we've restarted dumping; used to pick new dump file names
// each time.
int debug_dump_count;
RingBuffer* far_time_buf;
rtc_WavWriter* farFile;
rtc_WavWriter* nearFile;
rtc_WavWriter* outFile;
rtc_WavWriter* outLinearFile;
FILE* e_fft_file;
#endif
};
typedef void (*WebRtcAecFilterFar)(AecCore* aec, float yf[2][PART_LEN1]);
extern WebRtcAecFilterFar WebRtcAec_FilterFar;
typedef void (*WebRtcAecScaleErrorSignal)(AecCore* aec, float ef[2][PART_LEN1]);
extern WebRtcAecScaleErrorSignal WebRtcAec_ScaleErrorSignal;
typedef void (*WebRtcAecFilterAdaptation)(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]);
extern WebRtcAecFilterAdaptation WebRtcAec_FilterAdaptation;
typedef void (*WebRtcAecOverdriveAndSuppress)(AecCore* aec,
float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]);
extern WebRtcAecOverdriveAndSuppress WebRtcAec_OverdriveAndSuppress;
typedef void (*WebRtcAecComfortNoise)(AecCore* aec,
float efw[2][PART_LEN1],
complex_t* comfortNoiseHband,
const float* noisePow,
const float* lambda);
extern WebRtcAecComfortNoise WebRtcAec_ComfortNoise;
typedef void (*WebRtcAecSubBandCoherence)(AecCore* aec,
float efw[2][PART_LEN1],
float xfw[2][PART_LEN1],
float* fft,
float* cohde,
float* cohxd);
extern WebRtcAecSubBandCoherence WebRtcAec_SubbandCoherence;
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_

View File

@ -0,0 +1,774 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* The core AEC algorithm, which is presented with time-aligned signals.
*/
#include "webrtc/modules/audio_processing/aec/aec_core.h"
#include <math.h>
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
static const int flagHbandCn = 1; // flag for adding comfort noise in H band
extern const float WebRtcAec_weightCurve[65];
extern const float WebRtcAec_overDriveCurve[65];
void WebRtcAec_ComfortNoise_mips(AecCore* aec,
float efw[2][PART_LEN1],
complex_t* comfortNoiseHband,
const float* noisePow,
const float* lambda) {
int i, num;
float rand[PART_LEN];
float noise, noiseAvg, tmp, tmpAvg;
int16_t randW16[PART_LEN];
complex_t u[PART_LEN1];
const float pi2 = 6.28318530717959f;
const float pi2t = pi2 / 32768;
// Generate a uniform random array on [0 1]
WebRtcSpl_RandUArray(randW16, PART_LEN, &aec->seed);
int16_t* randWptr = randW16;
float randTemp, randTemp2, randTemp3, randTemp4;
int32_t tmp1s, tmp2s, tmp3s, tmp4s;
for (i = 0; i < PART_LEN; i+=4) {
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"lh %[tmp1s], 0(%[randWptr]) \n\t"
"lh %[tmp2s], 2(%[randWptr]) \n\t"
"lh %[tmp3s], 4(%[randWptr]) \n\t"
"lh %[tmp4s], 6(%[randWptr]) \n\t"
"mtc1 %[tmp1s], %[randTemp] \n\t"
"mtc1 %[tmp2s], %[randTemp2] \n\t"
"mtc1 %[tmp3s], %[randTemp3] \n\t"
"mtc1 %[tmp4s], %[randTemp4] \n\t"
"cvt.s.w %[randTemp], %[randTemp] \n\t"
"cvt.s.w %[randTemp2], %[randTemp2] \n\t"
"cvt.s.w %[randTemp3], %[randTemp3] \n\t"
"cvt.s.w %[randTemp4], %[randTemp4] \n\t"
"addiu %[randWptr], %[randWptr], 8 \n\t"
"mul.s %[randTemp], %[randTemp], %[pi2t] \n\t"
"mul.s %[randTemp2], %[randTemp2], %[pi2t] \n\t"
"mul.s %[randTemp3], %[randTemp3], %[pi2t] \n\t"
"mul.s %[randTemp4], %[randTemp4], %[pi2t] \n\t"
".set pop \n\t"
: [randWptr] "+r" (randWptr), [randTemp] "=&f" (randTemp),
[randTemp2] "=&f" (randTemp2), [randTemp3] "=&f" (randTemp3),
[randTemp4] "=&f" (randTemp4), [tmp1s] "=&r" (tmp1s),
[tmp2s] "=&r" (tmp2s), [tmp3s] "=&r" (tmp3s),
[tmp4s] "=&r" (tmp4s)
: [pi2t] "f" (pi2t)
: "memory"
);
u[i+1][0] = cosf(randTemp);
u[i+1][1] = sinf(randTemp);
u[i+2][0] = cosf(randTemp2);
u[i+2][1] = sinf(randTemp2);
u[i+3][0] = cosf(randTemp3);
u[i+3][1] = sinf(randTemp3);
u[i+4][0] = cosf(randTemp4);
u[i+4][1] = sinf(randTemp4);
}
// Reject LF noise
float* u_ptr = &u[1][0];
float noise2, noise3, noise4;
float tmp1f, tmp2f, tmp3f, tmp4f, tmp5f, tmp6f, tmp7f, tmp8f;
u[0][0] = 0;
u[0][1] = 0;
for (i = 1; i < PART_LEN1; i+=4) {
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"lwc1 %[noise], 4(%[noisePow]) \n\t"
"lwc1 %[noise2], 8(%[noisePow]) \n\t"
"lwc1 %[noise3], 12(%[noisePow]) \n\t"
"lwc1 %[noise4], 16(%[noisePow]) \n\t"
"sqrt.s %[noise], %[noise] \n\t"
"sqrt.s %[noise2], %[noise2] \n\t"
"sqrt.s %[noise3], %[noise3] \n\t"
"sqrt.s %[noise4], %[noise4] \n\t"
"lwc1 %[tmp1f], 0(%[u_ptr]) \n\t"
"lwc1 %[tmp2f], 4(%[u_ptr]) \n\t"
"lwc1 %[tmp3f], 8(%[u_ptr]) \n\t"
"lwc1 %[tmp4f], 12(%[u_ptr]) \n\t"
"lwc1 %[tmp5f], 16(%[u_ptr]) \n\t"
"lwc1 %[tmp6f], 20(%[u_ptr]) \n\t"
"lwc1 %[tmp7f], 24(%[u_ptr]) \n\t"
"lwc1 %[tmp8f], 28(%[u_ptr]) \n\t"
"addiu %[noisePow], %[noisePow], 16 \n\t"
"mul.s %[tmp1f], %[tmp1f], %[noise] \n\t"
"mul.s %[tmp2f], %[tmp2f], %[noise] \n\t"
"mul.s %[tmp3f], %[tmp3f], %[noise2] \n\t"
"mul.s %[tmp4f], %[tmp4f], %[noise2] \n\t"
"mul.s %[tmp5f], %[tmp5f], %[noise3] \n\t"
"mul.s %[tmp6f], %[tmp6f], %[noise3] \n\t"
"swc1 %[tmp1f], 0(%[u_ptr]) \n\t"
"swc1 %[tmp3f], 8(%[u_ptr]) \n\t"
"mul.s %[tmp8f], %[tmp8f], %[noise4] \n\t"
"mul.s %[tmp7f], %[tmp7f], %[noise4] \n\t"
"neg.s %[tmp2f] \n\t"
"neg.s %[tmp4f] \n\t"
"neg.s %[tmp6f] \n\t"
"neg.s %[tmp8f] \n\t"
"swc1 %[tmp5f], 16(%[u_ptr]) \n\t"
"swc1 %[tmp7f], 24(%[u_ptr]) \n\t"
"swc1 %[tmp2f], 4(%[u_ptr]) \n\t"
"swc1 %[tmp4f], 12(%[u_ptr]) \n\t"
"swc1 %[tmp6f], 20(%[u_ptr]) \n\t"
"swc1 %[tmp8f], 28(%[u_ptr]) \n\t"
"addiu %[u_ptr], %[u_ptr], 32 \n\t"
".set pop \n\t"
: [u_ptr] "+r" (u_ptr), [noisePow] "+r" (noisePow),
[noise] "=&f" (noise), [noise2] "=&f" (noise2),
[noise3] "=&f" (noise3), [noise4] "=&f" (noise4),
[tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f),
[tmp3f] "=&f" (tmp3f), [tmp4f] "=&f" (tmp4f),
[tmp5f] "=&f" (tmp5f), [tmp6f] "=&f" (tmp6f),
[tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f)
:
: "memory"
);
}
u[PART_LEN][1] = 0;
noisePow -= PART_LEN;
u_ptr = &u[0][0];
float* u_ptr_end = &u[PART_LEN][0];
float* efw_ptr_0 = &efw[0][0];
float* efw_ptr_1 = &efw[1][0];
float tmp9f, tmp10f;
const float tmp1c = 1.0;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"1: \n\t"
"lwc1 %[tmp1f], 0(%[lambda]) \n\t"
"lwc1 %[tmp6f], 4(%[lambda]) \n\t"
"addiu %[lambda], %[lambda], 8 \n\t"
"c.lt.s %[tmp1f], %[tmp1c] \n\t"
"bc1f 4f \n\t"
" nop \n\t"
"c.lt.s %[tmp6f], %[tmp1c] \n\t"
"bc1f 3f \n\t"
" nop \n\t"
"2: \n\t"
"mul.s %[tmp1f], %[tmp1f], %[tmp1f] \n\t"
"mul.s %[tmp6f], %[tmp6f], %[tmp6f] \n\t"
"sub.s %[tmp1f], %[tmp1c], %[tmp1f] \n\t"
"sub.s %[tmp6f], %[tmp1c], %[tmp6f] \n\t"
"sqrt.s %[tmp1f], %[tmp1f] \n\t"
"sqrt.s %[tmp6f], %[tmp6f] \n\t"
"lwc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
"lwc1 %[tmp3f], 0(%[u_ptr]) \n\t"
"lwc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
"lwc1 %[tmp8f], 8(%[u_ptr]) \n\t"
"lwc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
"lwc1 %[tmp5f], 4(%[u_ptr]) \n\t"
"lwc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
"lwc1 %[tmp10f], 12(%[u_ptr]) \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[tmp3f], %[tmp1f], %[tmp3f] \n\t"
"add.s %[tmp2f], %[tmp2f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp1f], %[tmp5f] \n\t"
"add.s %[tmp4f], %[tmp4f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp6f], %[tmp8f] \n\t"
"add.s %[tmp7f], %[tmp7f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp6f], %[tmp10f] \n\t"
"add.s %[tmp9f], %[tmp9f], %[tmp3f] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[tmp2f], %[tmp2f], %[tmp1f], %[tmp3f] \n\t"
"madd.s %[tmp4f], %[tmp4f], %[tmp1f], %[tmp5f] \n\t"
"madd.s %[tmp7f], %[tmp7f], %[tmp6f], %[tmp8f] \n\t"
"madd.s %[tmp9f], %[tmp9f], %[tmp6f], %[tmp10f] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
"swc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
"swc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
"b 5f \n\t"
" swc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
"3: \n\t"
"mul.s %[tmp1f], %[tmp1f], %[tmp1f] \n\t"
"sub.s %[tmp1f], %[tmp1c], %[tmp1f] \n\t"
"sqrt.s %[tmp1f], %[tmp1f] \n\t"
"lwc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
"lwc1 %[tmp3f], 0(%[u_ptr]) \n\t"
"lwc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
"lwc1 %[tmp5f], 4(%[u_ptr]) \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[tmp3f], %[tmp1f], %[tmp3f] \n\t"
"add.s %[tmp2f], %[tmp2f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp1f], %[tmp5f] \n\t"
"add.s %[tmp4f], %[tmp4f], %[tmp3f] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[tmp2f], %[tmp2f], %[tmp1f], %[tmp3f] \n\t"
"madd.s %[tmp4f], %[tmp4f], %[tmp1f], %[tmp5f] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
"b 5f \n\t"
" swc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
"4: \n\t"
"c.lt.s %[tmp6f], %[tmp1c] \n\t"
"bc1f 5f \n\t"
" nop \n\t"
"mul.s %[tmp6f], %[tmp6f], %[tmp6f] \n\t"
"sub.s %[tmp6f], %[tmp1c], %[tmp6f] \n\t"
"sqrt.s %[tmp6f], %[tmp6f] \n\t"
"lwc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
"lwc1 %[tmp8f], 8(%[u_ptr]) \n\t"
"lwc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
"lwc1 %[tmp10f], 12(%[u_ptr]) \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[tmp3f], %[tmp6f], %[tmp8f] \n\t"
"add.s %[tmp7f], %[tmp7f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp6f], %[tmp10f] \n\t"
"add.s %[tmp9f], %[tmp9f], %[tmp3f] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[tmp7f], %[tmp7f], %[tmp6f], %[tmp8f] \n\t"
"madd.s %[tmp9f], %[tmp9f], %[tmp6f], %[tmp10f] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
"swc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
"5: \n\t"
"addiu %[u_ptr], %[u_ptr], 16 \n\t"
"addiu %[efw_ptr_0], %[efw_ptr_0], 8 \n\t"
"bne %[u_ptr], %[u_ptr_end], 1b \n\t"
" addiu %[efw_ptr_1], %[efw_ptr_1], 8 \n\t"
".set pop \n\t"
: [lambda] "+r" (lambda), [u_ptr] "+r" (u_ptr),
[efw_ptr_0] "+r" (efw_ptr_0), [efw_ptr_1] "+r" (efw_ptr_1),
[tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f), [tmp3f] "=&f" (tmp3f),
[tmp4f] "=&f" (tmp4f), [tmp5f] "=&f" (tmp5f),
[tmp6f] "=&f" (tmp6f), [tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f),
[tmp9f] "=&f" (tmp9f), [tmp10f] "=&f" (tmp10f)
: [tmp1c] "f" (tmp1c), [u_ptr_end] "r" (u_ptr_end)
: "memory"
);
lambda -= PART_LEN;
tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[PART_LEN] * lambda[PART_LEN], 0));
//tmp = 1 - lambda[i];
efw[0][PART_LEN] += tmp * u[PART_LEN][0];
efw[1][PART_LEN] += tmp * u[PART_LEN][1];
// For H band comfort noise
// TODO: don't compute noise and "tmp" twice. Use the previous results.
noiseAvg = 0.0;
tmpAvg = 0.0;
num = 0;
if ((aec->sampFreq == 32000 || aec->sampFreq == 48000) && flagHbandCn == 1) {
for (i = 0; i < PART_LEN; i++) {
rand[i] = ((float)randW16[i]) / 32768;
}
// average noise scale
// average over second half of freq spectrum (i.e., 4->8khz)
// TODO: we shouldn't need num. We know how many elements we're summing.
for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
num++;
noiseAvg += sqrtf(noisePow[i]);
}
noiseAvg /= (float)num;
// average nlp scale
// average over second half of freq spectrum (i.e., 4->8khz)
// TODO: we shouldn't need num. We know how many elements we're summing.
num = 0;
for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
num++;
tmpAvg += sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
}
tmpAvg /= (float)num;
// Use average noise for H band
// TODO: we should probably have a new random vector here.
// Reject LF noise
u[0][0] = 0;
u[0][1] = 0;
for (i = 1; i < PART_LEN1; i++) {
tmp = pi2 * rand[i - 1];
// Use average noise for H band
u[i][0] = noiseAvg * (float)cos(tmp);
u[i][1] = -noiseAvg * (float)sin(tmp);
}
u[PART_LEN][1] = 0;
for (i = 0; i < PART_LEN1; i++) {
// Use average NLP weight for H band
comfortNoiseHband[i][0] = tmpAvg * u[i][0];
comfortNoiseHband[i][1] = tmpAvg * u[i][1];
}
}
}
void WebRtcAec_FilterFar_mips(AecCore* aec, float yf[2][PART_LEN1]) {
int i;
for (i = 0; i < aec->num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
int pos = i * PART_LEN1;
// Check for wrap
if (i + aec->xfBufBlockPos >= aec->num_partitions) {
xPos -= aec->num_partitions * (PART_LEN1);
}
float* yf0 = yf[0];
float* yf1 = yf[1];
float* aRe = aec->xfBuf[0] + xPos;
float* aIm = aec->xfBuf[1] + xPos;
float* bRe = aec->wfBuf[0] + pos;
float* bIm = aec->wfBuf[1] + pos;
float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13;
int len = PART_LEN1 >> 1;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"1: \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[bRe]) \n\t"
"lwc1 %[f2], 0(%[bIm]) \n\t"
"lwc1 %[f3], 0(%[aIm]) \n\t"
"lwc1 %[f4], 4(%[aRe]) \n\t"
"lwc1 %[f5], 4(%[bRe]) \n\t"
"lwc1 %[f6], 4(%[bIm]) \n\t"
"mul.s %[f8], %[f0], %[f1] \n\t"
"mul.s %[f0], %[f0], %[f2] \n\t"
"mul.s %[f9], %[f4], %[f5] \n\t"
"mul.s %[f4], %[f4], %[f6] \n\t"
"lwc1 %[f7], 4(%[aIm]) \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[f12], %[f2], %[f3] \n\t"
"mul.s %[f1], %[f3], %[f1] \n\t"
"mul.s %[f11], %[f6], %[f7] \n\t"
"addiu %[aRe], %[aRe], 8 \n\t"
"addiu %[aIm], %[aIm], 8 \n\t"
"addiu %[len], %[len], -1 \n\t"
"sub.s %[f8], %[f8], %[f12] \n\t"
"mul.s %[f12], %[f7], %[f5] \n\t"
"lwc1 %[f2], 0(%[yf0]) \n\t"
"add.s %[f1], %[f0], %[f1] \n\t"
"lwc1 %[f3], 0(%[yf1]) \n\t"
"sub.s %[f9], %[f9], %[f11] \n\t"
"lwc1 %[f6], 4(%[yf0]) \n\t"
"add.s %[f4], %[f4], %[f12] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"addiu %[aRe], %[aRe], 8 \n\t"
"addiu %[aIm], %[aIm], 8 \n\t"
"addiu %[len], %[len], -1 \n\t"
"nmsub.s %[f8], %[f8], %[f2], %[f3] \n\t"
"lwc1 %[f2], 0(%[yf0]) \n\t"
"madd.s %[f1], %[f0], %[f3], %[f1] \n\t"
"lwc1 %[f3], 0(%[yf1]) \n\t"
"nmsub.s %[f9], %[f9], %[f6], %[f7] \n\t"
"lwc1 %[f6], 4(%[yf0]) \n\t"
"madd.s %[f4], %[f4], %[f7], %[f5] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"lwc1 %[f5], 4(%[yf1]) \n\t"
"add.s %[f2], %[f2], %[f8] \n\t"
"addiu %[bRe], %[bRe], 8 \n\t"
"addiu %[bIm], %[bIm], 8 \n\t"
"add.s %[f3], %[f3], %[f1] \n\t"
"add.s %[f6], %[f6], %[f9] \n\t"
"add.s %[f5], %[f5], %[f4] \n\t"
"swc1 %[f2], 0(%[yf0]) \n\t"
"swc1 %[f3], 0(%[yf1]) \n\t"
"swc1 %[f6], 4(%[yf0]) \n\t"
"swc1 %[f5], 4(%[yf1]) \n\t"
"addiu %[yf0], %[yf0], 8 \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[yf1], %[yf1], 8 \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[bRe]) \n\t"
"lwc1 %[f2], 0(%[bIm]) \n\t"
"lwc1 %[f3], 0(%[aIm]) \n\t"
"mul.s %[f8], %[f0], %[f1] \n\t"
"mul.s %[f0], %[f0], %[f2] \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[f12], %[f2], %[f3] \n\t"
"mul.s %[f1], %[f3], %[f1] \n\t"
"sub.s %[f8], %[f8], %[f12] \n\t"
"lwc1 %[f2], 0(%[yf0]) \n\t"
"add.s %[f1], %[f0], %[f1] \n\t"
"lwc1 %[f3], 0(%[yf1]) \n\t"
#else // #if !defined(MIPS32_R2_LE)
"nmsub.s %[f8], %[f8], %[f2], %[f3] \n\t"
"lwc1 %[f2], 0(%[yf0]) \n\t"
"madd.s %[f1], %[f0], %[f3], %[f1] \n\t"
"lwc1 %[f3], 0(%[yf1]) \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"add.s %[f2], %[f2], %[f8] \n\t"
"add.s %[f3], %[f3], %[f1] \n\t"
"swc1 %[f2], 0(%[yf0]) \n\t"
"swc1 %[f3], 0(%[yf1]) \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
[f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
[f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
[f12] "=&f" (f12), [f13] "=&f" (f13), [aRe] "+r" (aRe),
[aIm] "+r" (aIm), [bRe] "+r" (bRe), [bIm] "+r" (bIm),
[yf0] "+r" (yf0), [yf1] "+r" (yf1), [len] "+r" (len)
:
: "memory"
);
}
}
void WebRtcAec_FilterAdaptation_mips(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]) {
int i;
for (i = 0; i < aec->num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
int pos;
// Check for wrap
if (i + aec->xfBufBlockPos >= aec->num_partitions) {
xPos -= aec->num_partitions * PART_LEN1;
}
pos = i * PART_LEN1;
float* aRe = aec->xfBuf[0] + xPos;
float* aIm = aec->xfBuf[1] + xPos;
float* bRe = ef[0];
float* bIm = ef[1];
float* fft_tmp;
float f0, f1, f2, f3, f4, f5, f6 ,f7, f8, f9, f10, f11, f12;
int len = PART_LEN >> 1;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"addiu %[fft_tmp], %[fft], 0 \n\t"
"1: \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[bRe]) \n\t"
"lwc1 %[f2], 0(%[bIm]) \n\t"
"lwc1 %[f4], 4(%[aRe]) \n\t"
"lwc1 %[f5], 4(%[bRe]) \n\t"
"lwc1 %[f6], 4(%[bIm]) \n\t"
"addiu %[aRe], %[aRe], 8 \n\t"
"addiu %[bRe], %[bRe], 8 \n\t"
"mul.s %[f8], %[f0], %[f1] \n\t"
"mul.s %[f0], %[f0], %[f2] \n\t"
"lwc1 %[f3], 0(%[aIm]) \n\t"
"mul.s %[f9], %[f4], %[f5] \n\t"
"lwc1 %[f7], 4(%[aIm]) \n\t"
"mul.s %[f4], %[f4], %[f6] \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[f10], %[f3], %[f2] \n\t"
"mul.s %[f1], %[f3], %[f1] \n\t"
"mul.s %[f11], %[f7], %[f6] \n\t"
"mul.s %[f5], %[f7], %[f5] \n\t"
"addiu %[aIm], %[aIm], 8 \n\t"
"addiu %[bIm], %[bIm], 8 \n\t"
"addiu %[len], %[len], -1 \n\t"
"add.s %[f8], %[f8], %[f10] \n\t"
"sub.s %[f1], %[f0], %[f1] \n\t"
"add.s %[f9], %[f9], %[f11] \n\t"
"sub.s %[f5], %[f4], %[f5] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"addiu %[aIm], %[aIm], 8 \n\t"
"addiu %[bIm], %[bIm], 8 \n\t"
"addiu %[len], %[len], -1 \n\t"
"madd.s %[f8], %[f8], %[f3], %[f2] \n\t"
"nmsub.s %[f1], %[f0], %[f3], %[f1] \n\t"
"madd.s %[f9], %[f9], %[f7], %[f6] \n\t"
"nmsub.s %[f5], %[f4], %[f7], %[f5] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[f8], 0(%[fft_tmp]) \n\t"
"swc1 %[f1], 4(%[fft_tmp]) \n\t"
"swc1 %[f9], 8(%[fft_tmp]) \n\t"
"swc1 %[f5], 12(%[fft_tmp]) \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[fft_tmp], %[fft_tmp], 16 \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[bRe]) \n\t"
"lwc1 %[f2], 0(%[bIm]) \n\t"
"lwc1 %[f3], 0(%[aIm]) \n\t"
"mul.s %[f8], %[f0], %[f1] \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[f10], %[f3], %[f2] \n\t"
"add.s %[f8], %[f8], %[f10] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[f8], %[f8], %[f3], %[f2] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[f8], 4(%[fft]) \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
[f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
[f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
[f12] "=&f" (f12), [aRe] "+r" (aRe), [aIm] "+r" (aIm),
[bRe] "+r" (bRe), [bIm] "+r" (bIm), [fft_tmp] "=&r" (fft_tmp),
[len] "+r" (len)
: [fft] "r" (fft)
: "memory"
);
aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
// fft scaling
{
float scale = 2.0f / PART_LEN2;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"addiu %[fft_tmp], %[fft], 0 \n\t"
"addiu %[len], $zero, 8 \n\t"
"1: \n\t"
"addiu %[len], %[len], -1 \n\t"
"lwc1 %[f0], 0(%[fft_tmp]) \n\t"
"lwc1 %[f1], 4(%[fft_tmp]) \n\t"
"lwc1 %[f2], 8(%[fft_tmp]) \n\t"
"lwc1 %[f3], 12(%[fft_tmp]) \n\t"
"mul.s %[f0], %[f0], %[scale] \n\t"
"mul.s %[f1], %[f1], %[scale] \n\t"
"mul.s %[f2], %[f2], %[scale] \n\t"
"mul.s %[f3], %[f3], %[scale] \n\t"
"lwc1 %[f4], 16(%[fft_tmp]) \n\t"
"lwc1 %[f5], 20(%[fft_tmp]) \n\t"
"lwc1 %[f6], 24(%[fft_tmp]) \n\t"
"lwc1 %[f7], 28(%[fft_tmp]) \n\t"
"mul.s %[f4], %[f4], %[scale] \n\t"
"mul.s %[f5], %[f5], %[scale] \n\t"
"mul.s %[f6], %[f6], %[scale] \n\t"
"mul.s %[f7], %[f7], %[scale] \n\t"
"swc1 %[f0], 0(%[fft_tmp]) \n\t"
"swc1 %[f1], 4(%[fft_tmp]) \n\t"
"swc1 %[f2], 8(%[fft_tmp]) \n\t"
"swc1 %[f3], 12(%[fft_tmp]) \n\t"
"swc1 %[f4], 16(%[fft_tmp]) \n\t"
"swc1 %[f5], 20(%[fft_tmp]) \n\t"
"swc1 %[f6], 24(%[fft_tmp]) \n\t"
"swc1 %[f7], 28(%[fft_tmp]) \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[fft_tmp], %[fft_tmp], 32 \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
[f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
[fft_tmp] "=&r" (fft_tmp)
: [scale] "f" (scale), [fft] "r" (fft)
: "memory"
);
}
aec_rdft_forward_128(fft);
aRe = aec->wfBuf[0] + pos;
aIm = aec->wfBuf[1] + pos;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"addiu %[fft_tmp], %[fft], 0 \n\t"
"addiu %[len], $zero, 31 \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[fft_tmp]) \n\t"
"lwc1 %[f2], 256(%[aRe]) \n\t"
"lwc1 %[f3], 4(%[fft_tmp]) \n\t"
"lwc1 %[f4], 4(%[aRe]) \n\t"
"lwc1 %[f5], 8(%[fft_tmp]) \n\t"
"lwc1 %[f6], 4(%[aIm]) \n\t"
"lwc1 %[f7], 12(%[fft_tmp]) \n\t"
"add.s %[f0], %[f0], %[f1] \n\t"
"add.s %[f2], %[f2], %[f3] \n\t"
"add.s %[f4], %[f4], %[f5] \n\t"
"add.s %[f6], %[f6], %[f7] \n\t"
"addiu %[fft_tmp], %[fft_tmp], 16 \n\t"
"swc1 %[f0], 0(%[aRe]) \n\t"
"swc1 %[f2], 256(%[aRe]) \n\t"
"swc1 %[f4], 4(%[aRe]) \n\t"
"addiu %[aRe], %[aRe], 8 \n\t"
"swc1 %[f6], 4(%[aIm]) \n\t"
"addiu %[aIm], %[aIm], 8 \n\t"
"1: \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[fft_tmp]) \n\t"
"lwc1 %[f2], 0(%[aIm]) \n\t"
"lwc1 %[f3], 4(%[fft_tmp]) \n\t"
"lwc1 %[f4], 4(%[aRe]) \n\t"
"lwc1 %[f5], 8(%[fft_tmp]) \n\t"
"lwc1 %[f6], 4(%[aIm]) \n\t"
"lwc1 %[f7], 12(%[fft_tmp]) \n\t"
"add.s %[f0], %[f0], %[f1] \n\t"
"add.s %[f2], %[f2], %[f3] \n\t"
"add.s %[f4], %[f4], %[f5] \n\t"
"add.s %[f6], %[f6], %[f7] \n\t"
"addiu %[len], %[len], -1 \n\t"
"addiu %[fft_tmp], %[fft_tmp], 16 \n\t"
"swc1 %[f0], 0(%[aRe]) \n\t"
"swc1 %[f2], 0(%[aIm]) \n\t"
"swc1 %[f4], 4(%[aRe]) \n\t"
"addiu %[aRe], %[aRe], 8 \n\t"
"swc1 %[f6], 4(%[aIm]) \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[aIm], %[aIm], 8 \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
[f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
[fft_tmp] "=&r" (fft_tmp), [aRe] "+r" (aRe), [aIm] "+r" (aIm)
: [fft] "r" (fft)
: "memory"
);
}
}
void WebRtcAec_OverdriveAndSuppress_mips(AecCore* aec,
float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]) {
int i;
const float one = 1.0;
float* p_hNl;
float* p_efw0;
float* p_efw1;
float* p_WebRtcAec_wC;
float temp1, temp2, temp3, temp4;
p_hNl = &hNl[0];
p_efw0 = &efw[0][0];
p_efw1 = &efw[1][0];
p_WebRtcAec_wC = (float*)&WebRtcAec_weightCurve[0];
for (i = 0; i < PART_LEN1; i++) {
// Weight subbands
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"lwc1 %[temp1], 0(%[p_hNl]) \n\t"
"lwc1 %[temp2], 0(%[p_wC]) \n\t"
"c.lt.s %[hNlFb], %[temp1] \n\t"
"bc1f 1f \n\t"
" mul.s %[temp3], %[temp2], %[hNlFb] \n\t"
"sub.s %[temp4], %[one], %[temp2] \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[temp1], %[temp1], %[temp4] \n\t"
"add.s %[temp1], %[temp3], %[temp1] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[temp1], %[temp3], %[temp1], %[temp4] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[temp1], 0(%[p_hNl]) \n\t"
"1: \n\t"
"addiu %[p_wC], %[p_wC], 4 \n\t"
".set pop \n\t"
: [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
[temp4] "=&f" (temp4), [p_wC] "+r" (p_WebRtcAec_wC)
: [hNlFb] "f" (hNlFb), [one] "f" (one), [p_hNl] "r" (p_hNl)
: "memory"
);
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
__asm __volatile (
"lwc1 %[temp1], 0(%[p_hNl]) \n\t"
"lwc1 %[temp3], 0(%[p_efw1]) \n\t"
"lwc1 %[temp2], 0(%[p_efw0]) \n\t"
"addiu %[p_hNl], %[p_hNl], 4 \n\t"
"mul.s %[temp3], %[temp3], %[temp1] \n\t"
"mul.s %[temp2], %[temp2], %[temp1] \n\t"
"addiu %[p_efw0], %[p_efw0], 4 \n\t"
"addiu %[p_efw1], %[p_efw1], 4 \n\t"
"neg.s %[temp4], %[temp3] \n\t"
"swc1 %[temp2], -4(%[p_efw0]) \n\t"
"swc1 %[temp4], -4(%[p_efw1]) \n\t"
: [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
[temp4] "=&f" (temp4), [p_efw0] "+r" (p_efw0), [p_efw1] "+r" (p_efw1),
[p_hNl] "+r" (p_hNl)
:
: "memory"
);
}
}
void WebRtcAec_ScaleErrorSignal_mips(AecCore* aec, float ef[2][PART_LEN1]) {
const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
const float error_threshold = aec->extended_filter_enabled
? kExtendedErrorThreshold
: aec->normal_error_threshold;
int len = (PART_LEN1);
float* ef0 = ef[0];
float* ef1 = ef[1];
float* xPow = aec->xPow;
float fac1 = 1e-10f;
float err_th2 = error_threshold * error_threshold;
float f0, f1, f2;
#if !defined(MIPS32_R2_LE)
float f3;
#endif
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"1: \n\t"
"lwc1 %[f0], 0(%[xPow]) \n\t"
"lwc1 %[f1], 0(%[ef0]) \n\t"
"lwc1 %[f2], 0(%[ef1]) \n\t"
"add.s %[f0], %[f0], %[fac1] \n\t"
"div.s %[f1], %[f1], %[f0] \n\t"
"div.s %[f2], %[f2], %[f0] \n\t"
"mul.s %[f0], %[f1], %[f1] \n\t"
#if defined(MIPS32_R2_LE)
"madd.s %[f0], %[f0], %[f2], %[f2] \n\t"
#else
"mul.s %[f3], %[f2], %[f2] \n\t"
"add.s %[f0], %[f0], %[f3] \n\t"
#endif
"c.le.s %[f0], %[err_th2] \n\t"
"nop \n\t"
"bc1t 2f \n\t"
" nop \n\t"
"sqrt.s %[f0], %[f0] \n\t"
"add.s %[f0], %[f0], %[fac1] \n\t"
"div.s %[f0], %[err_th], %[f0] \n\t"
"mul.s %[f1], %[f1], %[f0] \n\t"
"mul.s %[f2], %[f2], %[f0] \n\t"
"2: \n\t"
"mul.s %[f1], %[f1], %[mu] \n\t"
"mul.s %[f2], %[f2], %[mu] \n\t"
"swc1 %[f1], 0(%[ef0]) \n\t"
"swc1 %[f2], 0(%[ef1]) \n\t"
"addiu %[len], %[len], -1 \n\t"
"addiu %[xPow], %[xPow], 4 \n\t"
"addiu %[ef0], %[ef0], 4 \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[ef1], %[ef1], 4 \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
#if !defined(MIPS32_R2_LE)
[f3] "=&f" (f3),
#endif
[xPow] "+r" (xPow), [ef0] "+r" (ef0), [ef1] "+r" (ef1),
[len] "+r" (len)
: [fac1] "f" (fac1), [err_th2] "f" (err_th2), [mu] "f" (mu),
[err_th] "f" (error_threshold)
: "memory"
);
}
void WebRtcAec_InitAec_mips(void) {
WebRtcAec_FilterFar = WebRtcAec_FilterFar_mips;
WebRtcAec_FilterAdaptation = WebRtcAec_FilterAdaptation_mips;
WebRtcAec_ScaleErrorSignal = WebRtcAec_ScaleErrorSignal_mips;
WebRtcAec_ComfortNoise = WebRtcAec_ComfortNoise_mips;
WebRtcAec_OverdriveAndSuppress = WebRtcAec_OverdriveAndSuppress_mips;
}

View File

@ -0,0 +1,736 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* The core AEC algorithm, neon version of speed-critical functions.
*
* Based on aec_core_sse2.c.
*/
#include <arm_neon.h>
#include <math.h>
#include <string.h> // memset
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/modules/audio_processing/aec/aec_common.h"
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
enum { kShiftExponentIntoTopMantissa = 8 };
enum { kFloatExponentShift = 23 };
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
return aRe * bRe - aIm * bIm;
}
__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
return aRe * bIm + aIm * bRe;
}
static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) {
int i;
const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) {
int j;
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
int pos = i * PART_LEN1;
// Check for wrap
if (i + aec->xfBufBlockPos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
// vectorized code (four at once)
for (j = 0; j + 3 < PART_LEN1; j += 4) {
const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
const float32x4_t wfBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
const float32x4_t wfBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
const float32x4_t yf_re = vld1q_f32(&yf[0][j]);
const float32x4_t yf_im = vld1q_f32(&yf[1][j]);
const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re);
const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im);
const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im);
const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re);
const float32x4_t g = vaddq_f32(yf_re, e);
const float32x4_t h = vaddq_f32(yf_im, f);
vst1q_f32(&yf[0][j], g);
vst1q_f32(&yf[1][j], h);
}
// scalar code for the remaining items.
for (; j < PART_LEN1; j++) {
yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
aec->xfBuf[1][xPos + j],
aec->wfBuf[0][pos + j],
aec->wfBuf[1][pos + j]);
yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
aec->xfBuf[1][xPos + j],
aec->wfBuf[0][pos + j],
aec->wfBuf[1][pos + j]);
}
}
}
// ARM64's arm_neon.h has already defined vdivq_f32 vsqrtq_f32.
#if !defined (WEBRTC_ARCH_ARM64)
static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
int i;
float32x4_t x = vrecpeq_f32(b);
// from arm documentation
// The Newton-Raphson iteration:
// x[n+1] = x[n] * (2 - d * x[n])
// converges to (1/d) if x0 is the result of VRECPE applied to d.
//
// Note: The precision did not improve after 2 iterations.
for (i = 0; i < 2; i++) {
x = vmulq_f32(vrecpsq_f32(b, x), x);
}
// a/b = a*(1/b)
return vmulq_f32(a, x);
}
static float32x4_t vsqrtq_f32(float32x4_t s) {
int i;
float32x4_t x = vrsqrteq_f32(s);
// Code to handle sqrt(0).
// If the input to sqrtf() is zero, a zero will be returned.
// If the input to vrsqrteq_f32() is zero, positive infinity is returned.
const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
// check for divide by zero
const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
// zero out the positive infinity results
x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero),
vreinterpretq_u32_f32(x)));
// from arm documentation
// The Newton-Raphson iteration:
// x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2)
// converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
//
// Note: The precision did not improve after 2 iterations.
for (i = 0; i < 2; i++) {
x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
}
// sqrt(s) = s * 1/sqrt(s)
return vmulq_f32(s, x);;
}
#endif // WEBRTC_ARCH_ARM64
static void ScaleErrorSignalNEON(AecCore* aec, float ef[2][PART_LEN1]) {
const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
const float error_threshold = aec->extended_filter_enabled ?
kExtendedErrorThreshold : aec->normal_error_threshold;
const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
const float32x4_t kMu = vmovq_n_f32(mu);
const float32x4_t kThresh = vmovq_n_f32(error_threshold);
int i;
// vectorized code (four at once)
for (i = 0; i + 3 < PART_LEN1; i += 4) {
const float32x4_t xPow = vld1q_f32(&aec->xPow[i]);
const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
const float32x4_t xPowPlus = vaddq_f32(xPow, k1e_10f);
float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
const float32x4_t absEf = vsqrtq_f32(ef_sum2);
const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger),
vreinterpretq_u32_f32(ef_re));
uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger),
vreinterpretq_u32_f32(ef_im));
ef_re_if = vandq_u32(bigger, ef_re_if);
ef_im_if = vandq_u32(bigger, ef_im_if);
ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
vst1q_f32(&ef[0][i], ef_re);
vst1q_f32(&ef[1][i], ef_im);
}
// scalar code for the remaining items.
for (; i < PART_LEN1; i++) {
float abs_ef;
ef[0][i] /= (aec->xPow[i] + 1e-10f);
ef[1][i] /= (aec->xPow[i] + 1e-10f);
abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
if (abs_ef > error_threshold) {
abs_ef = error_threshold / (abs_ef + 1e-10f);
ef[0][i] *= abs_ef;
ef[1][i] *= abs_ef;
}
// Stepsize factor
ef[0][i] *= mu;
ef[1][i] *= mu;
}
}
static void FilterAdaptationNEON(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]) {
int i;
const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
int pos = i * PART_LEN1;
int j;
// Check for wrap
if (i + aec->xfBufBlockPos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
// Process the whole array...
for (j = 0; j < PART_LEN; j += 4) {
// Load xfBuf and ef.
const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
const float32x4_t ef_re = vld1q_f32(&ef[0][j]);
const float32x4_t ef_im = vld1q_f32(&ef[1][j]);
// Calculate the product of conjugate(xfBuf) by ef.
// re(conjugate(a) * b) = aRe * bRe + aIm * bIm
// im(conjugate(a) * b)= aRe * bIm - aIm * bRe
const float32x4_t a = vmulq_f32(xfBuf_re, ef_re);
const float32x4_t e = vmlaq_f32(a, xfBuf_im, ef_im);
const float32x4_t c = vmulq_f32(xfBuf_re, ef_im);
const float32x4_t f = vmlsq_f32(c, xfBuf_im, ef_re);
// Interleave real and imaginary parts.
const float32x4x2_t g_n_h = vzipq_f32(e, f);
// Store
vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
}
// ... and fixup the first imaginary entry.
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
-aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN],
ef[1][PART_LEN]);
aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
// fft scaling
{
const float scale = 2.0f / PART_LEN2;
const float32x4_t scale_ps = vmovq_n_f32(scale);
for (j = 0; j < PART_LEN; j += 4) {
const float32x4_t fft_ps = vld1q_f32(&fft[j]);
const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
vst1q_f32(&fft[j], fft_scale);
}
}
aec_rdft_forward_128(fft);
{
const float wt1 = aec->wfBuf[1][pos];
aec->wfBuf[0][pos + PART_LEN] += fft[1];
for (j = 0; j < PART_LEN; j += 4) {
float32x4_t wtBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
float32x4_t wtBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);
vst1q_f32(&aec->wfBuf[0][pos + j], wtBuf_re);
vst1q_f32(&aec->wfBuf[1][pos + j], wtBuf_im);
}
aec->wfBuf[1][pos] = wt1;
}
}
}
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
// a^b = exp2(b * log2(a))
// exp2(x) and log2(x) are calculated using polynomial approximations.
float32x4_t log2_a, b_log2_a, a_exp_b;
// Calculate log2(x), x = a.
{
// To calculate log2(x), we decompose x like this:
// x = y * 2^n
// n is an integer
// y is in the [1.0, 2.0) range
//
// log2(x) = log2(y) + n
// n can be evaluated by playing with float representation.
// log2(y) in a small range can be approximated, this code uses an order
// five polynomial approximation. The coefficients have been
// estimated with the Remez algorithm and the resulting
// polynomial has a maximum relative error of 0.00086%.
// Compute n.
// This is done by masking the exponent, shifting it into the top bit of
// the mantissa, putting eight into the biased exponent (to shift/
// compensate the fact that the exponent has been shifted in the top/
// fractional part and finally getting rid of the implicit leading one
// from the mantissa by substracting it out.
const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
vec_float_exponent_mask);
const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
const float32x4_t n =
vsubq_f32(vreinterpretq_f32_u32(n_0),
vreinterpretq_f32_u32(vec_implicit_leading_one));
// Compute y.
const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
vec_mantissa_mask);
const float32x4_t y =
vreinterpretq_f32_u32(vorrq_u32(mantissa,
vec_zero_biased_exponent_is_one));
// Approximate log2(y) ~= (y - 1) * pol5(y).
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
const float32x4_t C2 = vdupq_n_f32(2.5988452f);
const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
const float32x4_t C0 = vdupq_n_f32(3.1157899f);
float32x4_t pol5_y = C5;
pol5_y = vmlaq_f32(C4, y, pol5_y);
pol5_y = vmlaq_f32(C3, y, pol5_y);
pol5_y = vmlaq_f32(C2, y, pol5_y);
pol5_y = vmlaq_f32(C1, y, pol5_y);
pol5_y = vmlaq_f32(C0, y, pol5_y);
const float32x4_t y_minus_one =
vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);
// Combine parts.
log2_a = vaddq_f32(n, log2_y);
}
// b * log2(a)
b_log2_a = vmulq_f32(b, log2_a);
// Calculate exp2(x), x = b * log2(a).
{
// To calculate 2^x, we decompose x like this:
// x = n + y
// n is an integer, the value of x - 0.5 rounded down, therefore
// y is in the [0.5, 1.5) range
//
// 2^x = 2^n * 2^y
// 2^n can be evaluated by playing with float representation.
// 2^y in a small range can be approximated, this code uses an order two
// polynomial approximation. The coefficients have been estimated
// with the Remez algorithm and the resulting polynomial has a
// maximum relative error of 0.17%.
// To avoid over/underflow, we reduce the range of input to ]-127, 129].
const float32x4_t max_input = vdupq_n_f32(129.f);
const float32x4_t min_input = vdupq_n_f32(-126.99999f);
const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
const float32x4_t x_max = vmaxq_f32(x_min, min_input);
// Compute n.
const float32x4_t half = vdupq_n_f32(0.5f);
const float32x4_t x_minus_half = vsubq_f32(x_max, half);
const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);
// Compute 2^n.
const int32x4_t float_exponent_bias = vdupq_n_s32(127);
const int32x4_t two_n_exponent =
vaddq_s32(x_minus_half_floor, float_exponent_bias);
const float32x4_t two_n =
vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
// Compute y.
const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));
// Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
const float32x4_t C0 = vdupq_n_f32(1.0017247f);
float32x4_t exp2_y = C2;
exp2_y = vmlaq_f32(C1, y, exp2_y);
exp2_y = vmlaq_f32(C0, y, exp2_y);
// Combine parts.
a_exp_b = vmulq_f32(exp2_y, two_n);
}
return a_exp_b;
}
static void OverdriveAndSuppressNEON(AecCore* aec,
float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]) {
int i;
const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
const float32x4_t vec_one = vdupq_n_f32(1.0f);
const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);
// vectorized code (four at once)
for (i = 0; i + 3 < PART_LEN1; i += 4) {
// Weight subbands
float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve,
vec_hNlFb);
const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve,
vec_hNl);
const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger),
vreinterpretq_u32_f32(vec_hNl));
const float32x4_t vec_one_weightCurve_add =
vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
const uint32x4_t vec_if1 =
vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));
vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));
{
const float32x4_t vec_overDriveCurve =
vld1q_f32(&WebRtcAec_overDriveCurve[i]);
const float32x4_t vec_overDriveSm_overDriveCurve =
vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
vst1q_f32(&hNl[i], vec_hNl);
}
// Suppress error signal
{
float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);
// Ooura fft returns incorrect sign on imaginary component. It matters
// here because we are making an additive change with comfort noise.
vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
vst1q_f32(&efw[0][i], vec_efw_re);
vst1q_f32(&efw[1][i], vec_efw_im);
}
}
// scalar code for the remaining items.
for (; i < PART_LEN1; i++) {
// Weight subbands
if (hNl[i] > hNlFb) {
hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
}
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
// Suppress error signal
efw[0][i] *= hNl[i];
efw[1][i] *= hNl[i];
// Ooura fft returns incorrect sign on imaginary component. It matters
// here because we are making an additive change with comfort noise.
efw[1][i] *= -1;
}
}
static int PartitionDelay(const AecCore* aec) {
// Measures the energy in each filter partition and returns the partition with
// highest energy.
// TODO(bjornv): Spread computational cost by computing one partition per
// block?
float wfEnMax = 0;
int i;
int delay = 0;
for (i = 0; i < aec->num_partitions; i++) {
int j;
int pos = i * PART_LEN1;
float wfEn = 0;
float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
// vectorized code (four at once)
for (j = 0; j + 3 < PART_LEN1; j += 4) {
const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
}
{
float32x2_t vec_total;
// A B C D
vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
// A+B C+D
vec_total = vpadd_f32(vec_total, vec_total);
// A+B+C+D A+B+C+D
wfEn = vget_lane_f32(vec_total, 0);
}
// scalar code for the remaining items.
for (; j < PART_LEN1; j++) {
wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
}
if (wfEn > wfEnMax) {
wfEnMax = wfEn;
delay = i;
}
}
return delay;
}
// Updates the following smoothed Power Spectral Densities (PSD):
// - sd : near-end
// - se : residual echo
// - sx : far-end
// - sde : cross-PSD of near-end and residual echo
// - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, also the filter diverge state is determined
// upon actions are taken.
static void SmoothedPSD(AecCore* aec,
float efw[2][PART_LEN1],
float dfw[2][PART_LEN1],
float xfw[2][PART_LEN1]) {
// Power estimate smoothing coefficients.
const float* ptrGCoh = aec->extended_filter_enabled
? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
: WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
int i;
float sdSum = 0, seSum = 0;
const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
float32x4_t vec_seSum = vdupq_n_f32(0.0f);
for (i = 0; i + 3 < PART_LEN1; i += 4) {
const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);
vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);
vst1q_f32(&aec->sd[i], vec_sd);
vst1q_f32(&aec->se[i], vec_se);
vst1q_f32(&aec->sx[i], vec_sx);
{
float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
vst2q_f32(&aec->sde[i][0], vec_sde);
}
{
float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
vst2q_f32(&aec->sxd[i][0], vec_sxd);
}
vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
vec_seSum = vaddq_f32(vec_seSum, vec_se);
}
{
float32x2_t vec_sdSum_total;
float32x2_t vec_seSum_total;
// A B C D
vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
vget_high_f32(vec_sdSum));
vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
vget_high_f32(vec_seSum));
// A+B C+D
vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
// A+B+C+D A+B+C+D
sdSum = vget_lane_f32(vec_sdSum_total, 0);
seSum = vget_lane_f32(vec_seSum_total, 0);
}
// scalar code for the remaining items.
for (; i < PART_LEN1; i++) {
aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
aec->se[i] = ptrGCoh[0] * aec->se[i] +
ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
// We threshold here to protect against the ill-effects of a zero farend.
// The threshold is not arbitrarily chosen, but balances protection and
// adverse interaction with the algorithm's tuning.
// TODO(bjornv): investigate further why this is so sensitive.
aec->sx[i] =
ptrGCoh[0] * aec->sx[i] +
ptrGCoh[1] * WEBRTC_SPL_MAX(
xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
WebRtcAec_kMinFarendPSD);
aec->sde[i][0] =
ptrGCoh[0] * aec->sde[i][0] +
ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
aec->sde[i][1] =
ptrGCoh[0] * aec->sde[i][1] +
ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
aec->sxd[i][0] =
ptrGCoh[0] * aec->sxd[i][0] +
ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
aec->sxd[i][1] =
ptrGCoh[0] * aec->sxd[i][1] +
ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
sdSum += aec->sd[i];
seSum += aec->se[i];
}
// Divergent filter safeguard.
aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
if (aec->divergeState)
memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
// Reset if error is significantly larger than nearend (13 dB).
if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
}
// Window time domain data to be used by the fft.
__inline static void WindowData(float* x_windowed, const float* x) {
int i;
for (i = 0; i < PART_LEN; i += 4) {
const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
// A B C D
float32x4_t vec_sqrtHanning_rev =
vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
// B A D C
vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
// D C B A
vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
vget_low_f32(vec_sqrtHanning_rev));
vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
vst1q_f32(&x_windowed[PART_LEN + i],
vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
}
}
// Puts fft output data into a complex valued array.
__inline static void StoreAsComplex(const float* data,
float data_complex[2][PART_LEN1]) {
int i;
for (i = 0; i < PART_LEN; i += 4) {
const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]);
vst1q_f32(&data_complex[0][i], vec_data.val[0]);
vst1q_f32(&data_complex[1][i], vec_data.val[1]);
}
// fix beginning/end values
data_complex[1][0] = 0;
data_complex[1][PART_LEN] = 0;
data_complex[0][0] = data[0];
data_complex[0][PART_LEN] = data[1];
}
static void SubbandCoherenceNEON(AecCore* aec,
float efw[2][PART_LEN1],
float xfw[2][PART_LEN1],
float* fft,
float* cohde,
float* cohxd) {
float dfw[2][PART_LEN1];
int i;
if (aec->delayEstCtr == 0)
aec->delayIdx = PartitionDelay(aec);
// Use delayed far.
memcpy(xfw,
aec->xfwBuf + aec->delayIdx * PART_LEN1,
sizeof(xfw[0][0]) * 2 * PART_LEN1);
// Windowed near fft
WindowData(fft, aec->dBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, dfw);
// Windowed error fft
WindowData(fft, aec->eBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, efw);
SmoothedPSD(aec, efw, dfw, xfw);
{
const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f);
// Subband coherence
for (i = 0; i + 3 < PART_LEN1; i += 4) {
const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);
vst1q_f32(&cohde[i], vec_cohde);
vst1q_f32(&cohxd[i], vec_cohxd);
}
}
// scalar code for the remaining items.
for (; i < PART_LEN1; i++) {
cohde[i] =
(aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
(aec->sd[i] * aec->se[i] + 1e-10f);
cohxd[i] =
(aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
(aec->sx[i] * aec->sd[i] + 1e-10f);
}
}
void WebRtcAec_InitAec_neon(void) {
WebRtcAec_FilterFar = FilterFarNEON;
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON;
WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
WebRtcAec_SubbandCoherence = SubbandCoherenceNEON;
}

View File

@ -12,35 +12,33 @@
* The core AEC algorithm, SSE2 version of speed-critical functions.
*/
#include "typedefs.h"
#if defined(WEBRTC_USE_SSE2)
#include <emmintrin.h>
#include <math.h>
#include <string.h> // memset
#include "aec_core.h"
#include "aec_rdft.h"
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/modules/audio_processing/aec/aec_common.h"
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm)
{
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
return aRe * bRe - aIm * bIm;
}
__inline static float MulIm(float aRe, float aIm, float bRe, float bIm)
{
__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
return aRe * bIm + aIm * bRe;
}
static void FilterFarSSE2(aec_t *aec, float yf[2][PART_LEN1])
{
static void FilterFarSSE2(AecCore* aec, float yf[2][PART_LEN1]) {
int i;
for (i = 0; i < NR_PART; i++) {
const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) {
int j;
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
int pos = i * PART_LEN1;
// Check for wrap
if (i + aec->xfBufBlockPos >= NR_PART) {
xPos -= NR_PART*(PART_LEN1);
if (i + aec->xfBufBlockPos >= num_partitions) {
xPos -= num_partitions * (PART_LEN1);
}
// vectorized code (four at once)
@ -64,19 +62,25 @@ static void FilterFarSSE2(aec_t *aec, float yf[2][PART_LEN1])
}
// scalar code for the remaining items.
for (; j < PART_LEN1; j++) {
yf[0][j] += MulRe(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j],
aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]);
yf[1][j] += MulIm(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j],
aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]);
yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
aec->xfBuf[1][xPos + j],
aec->wfBuf[0][pos + j],
aec->wfBuf[1][pos + j]);
yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
aec->xfBuf[1][xPos + j],
aec->wfBuf[0][pos + j],
aec->wfBuf[1][pos + j]);
}
}
}
static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
{
static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1]) {
const __m128 k1e_10f = _mm_set1_ps(1e-10f);
const __m128 kThresh = _mm_set1_ps(aec->errThresh);
const __m128 kMu = _mm_set1_ps(aec->mu);
const __m128 kMu = aec->extended_filter_enabled ? _mm_set1_ps(kExtendedMu)
: _mm_set1_ps(aec->normal_mu);
const __m128 kThresh = aec->extended_filter_enabled
? _mm_set1_ps(kExtendedErrorThreshold)
: _mm_set1_ps(aec->normal_error_threshold);
int i;
// vectorized code (four at once)
@ -110,36 +114,46 @@ static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
_mm_storeu_ps(&ef[1][i], ef_im);
}
// scalar code for the remaining items.
for (; i < (PART_LEN1); i++) {
float absEf;
ef[0][i] /= (aec->xPow[i] + 1e-10f);
ef[1][i] /= (aec->xPow[i] + 1e-10f);
absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
{
const float mu =
aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
const float error_threshold = aec->extended_filter_enabled
? kExtendedErrorThreshold
: aec->normal_error_threshold;
for (; i < (PART_LEN1); i++) {
float abs_ef;
ef[0][i] /= (aec->xPow[i] + 1e-10f);
ef[1][i] /= (aec->xPow[i] + 1e-10f);
abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
if (absEf > aec->errThresh) {
absEf = aec->errThresh / (absEf + 1e-10f);
ef[0][i] *= absEf;
ef[1][i] *= absEf;
if (abs_ef > error_threshold) {
abs_ef = error_threshold / (abs_ef + 1e-10f);
ef[0][i] *= abs_ef;
ef[1][i] *= abs_ef;
}
// Stepsize factor
ef[0][i] *= mu;
ef[1][i] *= mu;
}
// Stepsize factor
ef[0][i] *= aec->mu;
ef[1][i] *= aec->mu;
}
}
static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
static void FilterAdaptationSSE2(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]) {
int i, j;
for (i = 0; i < NR_PART; i++) {
int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
int pos = i * PART_LEN1;
// Check for wrap
if (i + aec->xfBufBlockPos >= NR_PART) {
xPos -= NR_PART * PART_LEN1;
if (i + aec->xfBufBlockPos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
// Process the whole array...
for (j = 0; j < PART_LEN; j+= 4) {
for (j = 0; j < PART_LEN; j += 4) {
// Load xfBuf and ef.
const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
@ -158,22 +172,23 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1])
const __m128 g = _mm_unpacklo_ps(e, f);
const __m128 h = _mm_unpackhi_ps(e, f);
// Store
_mm_storeu_ps(&fft[2*j + 0], g);
_mm_storeu_ps(&fft[2*j + 4], h);
_mm_storeu_ps(&fft[2 * j + 0], g);
_mm_storeu_ps(&fft[2 * j + 4], h);
}
// ... and fixup the first imaginary entry.
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
-aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN], ef[1][PART_LEN]);
ef[0][PART_LEN],
ef[1][PART_LEN]);
aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
// fft scaling
{
float scale = 2.0f / PART_LEN2;
const __m128 scale_ps = _mm_load_ps1(&scale);
for (j = 0; j < PART_LEN; j+=4) {
for (j = 0; j < PART_LEN; j += 4) {
const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
_mm_storeu_ps(&fft[j], fft_scale);
@ -184,13 +199,15 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1])
{
float wt1 = aec->wfBuf[1][pos];
aec->wfBuf[0][pos + PART_LEN] += fft[1];
for (j = 0; j < PART_LEN; j+= 4) {
for (j = 0; j < PART_LEN; j += 4) {
__m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
__m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
const __m128 fft_re = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2 ,0));
const __m128 fft_im = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3 ,1));
const __m128 fft_re =
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 fft_im =
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
_mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
@ -201,8 +218,7 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1])
}
}
static __m128 mm_pow_ps(__m128 a, __m128 b)
{
static __m128 mm_pow_ps(__m128 a, __m128 b) {
// a^b = exp2(b * log2(a))
// exp2(x) and log2(x) are calculated using polynomial approximations.
__m128 log2_a, b_log2_a, a_exp_b;
@ -227,55 +243,55 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
// compensate the fact that the exponent has been shifted in the top/
// fractional part and finally getting rid of the implicit leading one
// from the mantissa by substracting it out.
static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END =
{0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END =
{0x43800000, 0x43800000, 0x43800000, 0x43800000};
static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END =
{0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {
0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {
0x43800000, 0x43800000, 0x43800000, 0x43800000};
static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {
0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
static const int shift_exponent_into_top_mantissa = 8;
const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(two_n),
shift_exponent_into_top_mantissa));
const __m128 n_0 = _mm_or_ps(n_1, *((__m128 *)eight_biased_exponent));
const __m128 n = _mm_sub_ps(n_0, *((__m128 *)implicit_leading_one));
const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask));
const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
_mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent));
const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one));
// Compute y.
static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END =
{0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END =
{0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask));
const __m128 y = _mm_or_ps(
mantissa, *((__m128 *)zero_biased_exponent_is_one));
static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {
0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {
0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask));
const __m128 y =
_mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one));
// Approximate log2(y) ~= (y - 1) * pol5(y).
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
static const ALIGN16_BEG float ALIGN16_END C5[4] =
{-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
static const ALIGN16_BEG float ALIGN16_END C4[4] =
{3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
static const ALIGN16_BEG float ALIGN16_END C3[4] =
{-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
static const ALIGN16_BEG float ALIGN16_END C2[4] =
{2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
static const ALIGN16_BEG float ALIGN16_END C1[4] =
{-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
static const ALIGN16_BEG float ALIGN16_END C0[4] =
{3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128 *)C5));
const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4));
static const ALIGN16_BEG float ALIGN16_END C5[4] = {
-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
static const ALIGN16_BEG float ALIGN16_END
C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
static const ALIGN16_BEG float ALIGN16_END
C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
static const ALIGN16_BEG float ALIGN16_END
C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
static const ALIGN16_BEG float ALIGN16_END
C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
static const ALIGN16_BEG float ALIGN16_END
C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5));
const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4));
const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3));
const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3));
const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2));
const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2));
const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1));
const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1));
const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128 *)C0));
const __m128 y_minus_one = _mm_sub_ps(
y, *((__m128 *)zero_biased_exponent_is_one));
const __m128 log2_y = _mm_mul_ps(y_minus_one , pol5_y);
const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0));
const __m128 y_minus_one =
_mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one));
const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);
// Combine parts.
log2_a = _mm_add_ps(n, log2_y);
@ -299,38 +315,38 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
// maximum relative error of 0.17%.
// To avoid over/underflow, we reduce the range of input to ]-127, 129].
static const ALIGN16_BEG float max_input[4] ALIGN16_END =
{129.f, 129.f, 129.f, 129.f};
static const ALIGN16_BEG float min_input[4] ALIGN16_END =
{-126.99999f, -126.99999f, -126.99999f, -126.99999f};
const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input));
const __m128 x_max = _mm_max_ps(x_min, *((__m128 *)min_input));
static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f,
129.f, 129.f};
static const ALIGN16_BEG float min_input[4] ALIGN16_END = {
-126.99999f, -126.99999f, -126.99999f, -126.99999f};
const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input));
const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input));
// Compute n.
static const ALIGN16_BEG float half[4] ALIGN16_END =
{0.5f, 0.5f, 0.5f, 0.5f};
const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half));
static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f,
0.5f, 0.5f};
const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half));
const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
// Compute 2^n.
static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END =
{127, 127, 127, 127};
static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {
127, 127, 127, 127};
static const int float_exponent_shift = 23;
const __m128i two_n_exponent = _mm_add_epi32(
x_minus_half_floor, *((__m128i *)float_exponent_bias));
const __m128 two_n = _mm_castsi128_ps(_mm_slli_epi32(
two_n_exponent, float_exponent_shift));
const __m128i two_n_exponent =
_mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias));
const __m128 two_n =
_mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift));
// Compute y.
const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
// Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
static const ALIGN16_BEG float C2[4] ALIGN16_END =
{3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
static const ALIGN16_BEG float C1[4] ALIGN16_END =
{6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
static const ALIGN16_BEG float C0[4] ALIGN16_END =
{1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f};
const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128 *)C2));
const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1));
static const ALIGN16_BEG float C2[4] ALIGN16_END = {
3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
static const ALIGN16_BEG float C1[4] ALIGN16_END = {
6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f,
1.0017247f, 1.0017247f};
const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2));
const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1));
const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128 *)C0));
const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0));
// Combine parts.
a_exp_b = _mm_mul_ps(exp2_y, two_n);
@ -338,10 +354,8 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
return a_exp_b;
}
extern const float WebRtcAec_weightCurve[65];
extern const float WebRtcAec_overDriveCurve[65];
static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
static void OverdriveAndSuppressSSE2(AecCore* aec,
float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]) {
int i;
@ -350,26 +364,25 @@ static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
// vectorized code (four at once)
for (i = 0; i + 3 < PART_LEN1; i+=4) {
for (i = 0; i + 3 < PART_LEN1; i += 4) {
// Weight subbands
__m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(
vec_weightCurve, vec_hNlFb);
const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb);
const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
const __m128 vec_one_weightCurve_hNl = _mm_mul_ps(
vec_one_weightCurve, vec_hNl);
const __m128 vec_one_weightCurve_hNl =
_mm_mul_ps(vec_one_weightCurve, vec_hNl);
const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
const __m128 vec_if1 = _mm_and_ps(
bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
vec_hNl = _mm_or_ps(vec_if0, vec_if1);
{
const __m128 vec_overDriveCurve = _mm_loadu_ps(
&WebRtcAec_overDriveCurve[i]);
const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps(
vec_overDriveSm, vec_overDriveCurve);
const __m128 vec_overDriveCurve =
_mm_loadu_ps(&WebRtcAec_overDriveCurve[i]);
const __m128 vec_overDriveSm_overDriveCurve =
_mm_mul_ps(vec_overDriveSm, vec_overDriveCurve);
vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
_mm_storeu_ps(&hNl[i], vec_hNl);
}
@ -393,7 +406,7 @@ static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
// Weight subbands
if (hNl[i] > hNlFb) {
hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
}
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
@ -407,11 +420,312 @@ static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
}
}
__inline static void _mm_add_ps_4x1(__m128 sum, float *dst) {
// A+B C+D
sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)));
// A+B+C+D A+B+C+D
sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
_mm_store_ss(dst, sum);
}
static int PartitionDelay(const AecCore* aec) {
// Measures the energy in each filter partition and returns the partition with
// highest energy.
// TODO(bjornv): Spread computational cost by computing one partition per
// block?
float wfEnMax = 0;
int i;
int delay = 0;
for (i = 0; i < aec->num_partitions; i++) {
int j;
int pos = i * PART_LEN1;
float wfEn = 0;
__m128 vec_wfEn = _mm_set1_ps(0.0f);
// vectorized code (four at once)
for (j = 0; j + 3 < PART_LEN1; j += 4) {
const __m128 vec_wfBuf0 = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
const __m128 vec_wfBuf1 = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0));
vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1));
}
_mm_add_ps_4x1(vec_wfEn, &wfEn);
// scalar code for the remaining items.
for (; j < PART_LEN1; j++) {
wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
}
if (wfEn > wfEnMax) {
wfEnMax = wfEn;
delay = i;
}
}
return delay;
}
// Updates the following smoothed Power Spectral Densities (PSD):
// - sd : near-end
// - se : residual echo
// - sx : far-end
// - sde : cross-PSD of near-end and residual echo
// - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, also the filter diverge state is determined
// upon actions are taken.
static void SmoothedPSD(AecCore* aec,
float efw[2][PART_LEN1],
float dfw[2][PART_LEN1],
float xfw[2][PART_LEN1]) {
// Power estimate smoothing coefficients.
const float* ptrGCoh = aec->extended_filter_enabled
? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
: WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
int i;
float sdSum = 0, seSum = 0;
const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD);
const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]);
const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]);
__m128 vec_sdSum = _mm_set1_ps(0.0f);
__m128 vec_seSum = _mm_set1_ps(0.0f);
for (i = 0; i + 3 < PART_LEN1; i += 4) {
const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]);
const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]);
const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]);
const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]);
const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]);
const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]);
__m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0);
__m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0);
__m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0);
__m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0);
__m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0);
__m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0);
vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1));
vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1));
vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1));
vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15);
vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1));
vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1));
vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1));
_mm_storeu_ps(&aec->sd[i], vec_sd);
_mm_storeu_ps(&aec->se[i], vec_se);
_mm_storeu_ps(&aec->sx[i], vec_sx);
{
const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]);
const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
__m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
_MM_SHUFFLE(2, 0, 2, 0));
__m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
_MM_SHUFFLE(3, 1, 3, 1));
__m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0);
__m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1);
vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011,
_mm_mul_ps(vec_dfw1, vec_efw1));
vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110,
_mm_mul_ps(vec_dfw1, vec_efw0));
vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1));
vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1));
_mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b));
_mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
}
{
const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
__m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
_MM_SHUFFLE(2, 0, 2, 0));
__m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
_MM_SHUFFLE(3, 1, 3, 1));
__m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0);
__m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1);
vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011,
_mm_mul_ps(vec_dfw1, vec_xfw1));
vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110,
_mm_mul_ps(vec_dfw1, vec_xfw0));
vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1));
vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1));
_mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b));
_mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
}
vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd);
vec_seSum = _mm_add_ps(vec_seSum, vec_se);
}
_mm_add_ps_4x1(vec_sdSum, &sdSum);
_mm_add_ps_4x1(vec_seSum, &seSum);
for (; i < PART_LEN1; i++) {
aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
aec->se[i] = ptrGCoh[0] * aec->se[i] +
ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
// We threshold here to protect against the ill-effects of a zero farend.
// The threshold is not arbitrarily chosen, but balances protection and
// adverse interaction with the algorithm's tuning.
// TODO(bjornv): investigate further why this is so sensitive.
aec->sx[i] =
ptrGCoh[0] * aec->sx[i] +
ptrGCoh[1] * WEBRTC_SPL_MAX(
xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
WebRtcAec_kMinFarendPSD);
aec->sde[i][0] =
ptrGCoh[0] * aec->sde[i][0] +
ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
aec->sde[i][1] =
ptrGCoh[0] * aec->sde[i][1] +
ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
aec->sxd[i][0] =
ptrGCoh[0] * aec->sxd[i][0] +
ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
aec->sxd[i][1] =
ptrGCoh[0] * aec->sxd[i][1] +
ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
sdSum += aec->sd[i];
seSum += aec->se[i];
}
// Divergent filter safeguard.
aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
if (aec->divergeState)
memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
// Reset if error is significantly larger than nearend (13 dB).
if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
}
// Window time domain data to be used by the fft.
__inline static void WindowData(float* x_windowed, const float* x) {
int i;
for (i = 0; i < PART_LEN; i += 4) {
const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]);
const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]);
const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]);
// A B C D
__m128 vec_sqrtHanning_rev =
_mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
// D C B A
vec_sqrtHanning_rev =
_mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev,
_MM_SHUFFLE(0, 1, 2, 3));
_mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning));
_mm_storeu_ps(&x_windowed[PART_LEN + i],
_mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev));
}
}
// Puts fft output data into a complex valued array.
__inline static void StoreAsComplex(const float* data,
float data_complex[2][PART_LEN1]) {
int i;
for (i = 0; i < PART_LEN; i += 4) {
const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]);
const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]);
const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4,
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4,
_MM_SHUFFLE(3, 1, 3, 1));
_mm_storeu_ps(&data_complex[0][i], vec_a);
_mm_storeu_ps(&data_complex[1][i], vec_b);
}
// fix beginning/end values
data_complex[1][0] = 0;
data_complex[1][PART_LEN] = 0;
data_complex[0][0] = data[0];
data_complex[0][PART_LEN] = data[1];
}
static void SubbandCoherenceSSE2(AecCore* aec,
float efw[2][PART_LEN1],
float xfw[2][PART_LEN1],
float* fft,
float* cohde,
float* cohxd) {
float dfw[2][PART_LEN1];
int i;
if (aec->delayEstCtr == 0)
aec->delayIdx = PartitionDelay(aec);
// Use delayed far.
memcpy(xfw,
aec->xfwBuf + aec->delayIdx * PART_LEN1,
sizeof(xfw[0][0]) * 2 * PART_LEN1);
// Windowed near fft
WindowData(fft, aec->dBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, dfw);
// Windowed error fft
WindowData(fft, aec->eBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, efw);
SmoothedPSD(aec, efw, dfw, xfw);
{
const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f);
// Subband coherence
for (i = 0; i + 3 < PART_LEN1; i += 4) {
const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]);
const __m128 vec_se = _mm_loadu_ps(&aec->se[i]);
const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]);
const __m128 vec_sdse = _mm_add_ps(vec_1eminus10,
_mm_mul_ps(vec_sd, vec_se));
const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10,
_mm_mul_ps(vec_sd, vec_sx));
const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]);
const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
_MM_SHUFFLE(3, 1, 3, 1));
const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
_MM_SHUFFLE(2, 0, 2, 0));
const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
_MM_SHUFFLE(3, 1, 3, 1));
__m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0);
__m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0);
vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1));
vec_cohde = _mm_div_ps(vec_cohde, vec_sdse);
vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1));
vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx);
_mm_storeu_ps(&cohde[i], vec_cohde);
_mm_storeu_ps(&cohxd[i], vec_cohxd);
}
// scalar code for the remaining items.
for (; i < PART_LEN1; i++) {
cohde[i] =
(aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
(aec->sd[i] * aec->se[i] + 1e-10f);
cohxd[i] =
(aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
(aec->sx[i] * aec->sd[i] + 1e-10f);
}
}
}
void WebRtcAec_InitAec_SSE2(void) {
WebRtcAec_FilterFar = FilterFarSSE2;
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2;
}
#endif // WEBRTC_USE_SSE2

View File

@ -19,200 +19,193 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "aec_rdft.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
#include <math.h>
#include "system_wrappers/interface/cpu_features_wrapper.h"
#include "typedefs.h"
#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
#include "webrtc/typedefs.h"
// constants shared by all paths (C, SSE2).
float rdft_w[64];
// constants used by the C path.
float rdft_wk3ri_first[32];
float rdft_wk3ri_second[32];
// constants used by SSE2 but initialized in C path.
ALIGN16_BEG float ALIGN16_END rdft_wk1r[32];
ALIGN16_BEG float ALIGN16_END rdft_wk2r[32];
ALIGN16_BEG float ALIGN16_END rdft_wk3r[32];
ALIGN16_BEG float ALIGN16_END rdft_wk1i[32];
ALIGN16_BEG float ALIGN16_END rdft_wk2i[32];
ALIGN16_BEG float ALIGN16_END rdft_wk3i[32];
ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4];
// These tables used to be computed at run-time. For example, refer to:
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
// to see the initialization code.
const float rdft_w[64] = {
1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
};
const float rdft_wk3ri_first[16] = {
1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
};
const float rdft_wk3ri_second[16] = {
-0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
-0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
-0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
-0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
-0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
-0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
-0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
-0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
-0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
-0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
-0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
-0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
-0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
-0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
-0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
-0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
-0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
-0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
-0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
-0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
-0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
-0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
-0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
-0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
-0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
-0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
-0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
-0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
};
ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
};
static int ip[16];
static void bitrv2_128_C(float* a) {
/*
Following things have been attempted but are no faster:
(a) Storing the swap indexes in a LUT (index calculations are done
for 'free' while waiting on memory/L1).
(b) Consolidate the load/store of two consecutive floats by a 64 bit
integer (execution is memory/L1 bound).
(c) Do a mix of floats and 64 bit integer to maximize register
utilization (execution is memory/L1 bound).
(d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
(e) Hard-coding of the offsets to completely eliminates index
calculations.
*/
static void bitrv2_32or128(int n, int *ip, float *a) {
// n is 32 or 128
int j, j1, k, k1, m, m2;
unsigned int j, j1, k, k1;
float xr, xi, yr, yi;
ip[0] = 0;
{
int l = n;
m = 1;
while ((m << 3) < l) {
l >>= 1;
for (j = 0; j < m; j++) {
ip[m + j] = ip[j] + l;
}
m <<= 1;
}
}
m2 = 2 * m;
for (k = 0; k < m; k++) {
static const int ip[4] = {0, 64, 32, 96};
for (k = 0; k < 4; k++) {
for (j = 0; j < k; j++) {
j1 = 2 * j + ip[k];
k1 = 2 * k + ip[j];
xr = a[j1];
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 += 2 * m2;
xr = a[j1];
j1 += 8;
k1 += 16;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 -= m2;
xr = a[j1];
j1 += 8;
k1 -= 8;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 += 2 * m2;
xr = a[j1];
j1 += 8;
k1 += 16;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
}
j1 = 2 * k + m2 + ip[k];
k1 = j1 + m2;
xr = a[j1];
j1 = 2 * k + 8 + ip[k];
k1 = j1 + 8;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
}
}
static void makewt_32(void) {
const int nw = 32;
int j, nwh;
float delta, x, y;
ip[0] = nw;
ip[1] = 1;
nwh = nw >> 1;
delta = atanf(1.0f) / nwh;
rdft_w[0] = 1;
rdft_w[1] = 0;
rdft_w[nwh] = cosf(delta * nwh);
rdft_w[nwh + 1] = rdft_w[nwh];
for (j = 2; j < nwh; j += 2) {
x = cosf(delta * j);
y = sinf(delta * j);
rdft_w[j] = x;
rdft_w[j + 1] = y;
rdft_w[nw - j] = y;
rdft_w[nw - j + 1] = x;
}
bitrv2_32or128(nw, ip + 2, rdft_w);
// pre-calculate constants used by cft1st_128 and cftmdl_128...
cftmdl_wk1r[0] = rdft_w[2];
cftmdl_wk1r[1] = rdft_w[2];
cftmdl_wk1r[2] = rdft_w[2];
cftmdl_wk1r[3] = -rdft_w[2];
{
int k1;
for (k1 = 0, j = 0; j < 128; j += 16, k1 += 2) {
const int k2 = 2 * k1;
const float wk2r = rdft_w[k1 + 0];
const float wk2i = rdft_w[k1 + 1];
float wk1r, wk1i;
// ... scalar version.
wk1r = rdft_w[k2 + 0];
wk1i = rdft_w[k2 + 1];
rdft_wk3ri_first[k1 + 0] = wk1r - 2 * wk2i * wk1i;
rdft_wk3ri_first[k1 + 1] = 2 * wk2i * wk1r - wk1i;
wk1r = rdft_w[k2 + 2];
wk1i = rdft_w[k2 + 3];
rdft_wk3ri_second[k1 + 0] = wk1r - 2 * wk2r * wk1i;
rdft_wk3ri_second[k1 + 1] = 2 * wk2r * wk1r - wk1i;
// ... vector version.
rdft_wk1r[k2 + 0] = rdft_w[k2 + 0];
rdft_wk1r[k2 + 1] = rdft_w[k2 + 0];
rdft_wk1r[k2 + 2] = rdft_w[k2 + 2];
rdft_wk1r[k2 + 3] = rdft_w[k2 + 2];
rdft_wk2r[k2 + 0] = rdft_w[k1 + 0];
rdft_wk2r[k2 + 1] = rdft_w[k1 + 0];
rdft_wk2r[k2 + 2] = -rdft_w[k1 + 1];
rdft_wk2r[k2 + 3] = -rdft_w[k1 + 1];
rdft_wk3r[k2 + 0] = rdft_wk3ri_first[k1 + 0];
rdft_wk3r[k2 + 1] = rdft_wk3ri_first[k1 + 0];
rdft_wk3r[k2 + 2] = rdft_wk3ri_second[k1 + 0];
rdft_wk3r[k2 + 3] = rdft_wk3ri_second[k1 + 0];
rdft_wk1i[k2 + 0] = -rdft_w[k2 + 1];
rdft_wk1i[k2 + 1] = rdft_w[k2 + 1];
rdft_wk1i[k2 + 2] = -rdft_w[k2 + 3];
rdft_wk1i[k2 + 3] = rdft_w[k2 + 3];
rdft_wk2i[k2 + 0] = -rdft_w[k1 + 1];
rdft_wk2i[k2 + 1] = rdft_w[k1 + 1];
rdft_wk2i[k2 + 2] = -rdft_w[k1 + 0];
rdft_wk2i[k2 + 3] = rdft_w[k1 + 0];
rdft_wk3i[k2 + 0] = -rdft_wk3ri_first[k1 + 1];
rdft_wk3i[k2 + 1] = rdft_wk3ri_first[k1 + 1];
rdft_wk3i[k2 + 2] = -rdft_wk3ri_second[k1 + 1];
rdft_wk3i[k2 + 3] = rdft_wk3ri_second[k1 + 1];
}
}
}
static void makect_32(void) {
float *c = rdft_w + 32;
const int nc = 32;
int j, nch;
float delta;
ip[1] = nc;
nch = nc >> 1;
delta = atanf(1.0f) / nch;
c[0] = cosf(delta * nch);
c[nch] = 0.5f * c[0];
for (j = 1; j < nch; j++) {
c[j] = 0.5f * cosf(delta * j);
c[nc - j] = 0.5f * sinf(delta * j);
}
}
static void cft1st_128_C(float *a) {
static void cft1st_128_C(float* a) {
const int n = 128;
int j, k1, k2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
// The processing of the first set of elements was simplified in C to avoid
// some operations (multiplication by zero or one, addition of two elements
// multiplied by the same weight, ...).
x0r = a[0] + a[2];
x0i = a[1] + a[3];
x1r = a[0] - a[2];
@ -311,7 +304,7 @@ static void cft1st_128_C(float *a) {
}
}
static void cftmdl_128_C(float *a) {
static void cftmdl_128_C(float* a) {
const int l = 8;
const int n = 128;
const int m = 32;
@ -320,7 +313,7 @@ static void cftmdl_128_C(float *a) {
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
for (j0 = 0; j0 < l; j0 += 2) {
j1 = j0 + 8;
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
@ -342,7 +335,7 @@ static void cftmdl_128_C(float *a) {
}
wk1r = rdft_w[2];
for (j0 = m; j0 < l + m; j0 += 2) {
j1 = j0 + 8;
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
@ -378,7 +371,7 @@ static void cftmdl_128_C(float *a) {
wk3r = rdft_wk3ri_first[k1 + 0];
wk3i = rdft_wk3ri_first[k1 + 1];
for (j0 = k; j0 < l + k; j0 += 2) {
j1 = j0 + 8;
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
@ -409,7 +402,7 @@ static void cftmdl_128_C(float *a) {
wk3r = rdft_wk3ri_second[k1 + 0];
wk3i = rdft_wk3ri_second[k1 + 1];
for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
j1 = j0 + 8;
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
@ -438,7 +431,7 @@ static void cftmdl_128_C(float *a) {
}
}
static void cftfsub_128(float *a) {
static void cftfsub_128_C(float* a) {
int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
@ -468,7 +461,7 @@ static void cftfsub_128(float *a) {
}
}
static void cftbsub_128(float *a) {
static void cftbsub_128_C(float* a) {
int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
@ -499,14 +492,14 @@ static void cftbsub_128(float *a) {
}
}
static void rftfsub_128_C(float *a) {
const float *c = rdft_w + 32;
static void rftfsub_128_C(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
@ -520,15 +513,15 @@ static void rftfsub_128_C(float *a) {
}
}
static void rftbsub_128_C(float *a) {
const float *c = rdft_w + 32;
static void rftbsub_128_C(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
a[1] = -a[1];
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
@ -543,11 +536,9 @@ static void rftbsub_128_C(float *a) {
a[65] = -a[65];
}
void aec_rdft_forward_128(float *a) {
const int n = 128;
void aec_rdft_forward_128(float* a) {
float xi;
bitrv2_32or128(n, ip + 2, a);
bitrv2_128(a);
cftfsub_128(a);
rftfsub_128(a);
xi = a[0] - a[1];
@ -555,33 +546,44 @@ void aec_rdft_forward_128(float *a) {
a[1] = xi;
}
void aec_rdft_inverse_128(float *a) {
const int n = 128;
void aec_rdft_inverse_128(float* a) {
a[1] = 0.5f * (a[0] - a[1]);
a[0] -= a[1];
rftbsub_128(a);
bitrv2_32or128(n, ip + 2, a);
bitrv2_128(a);
cftbsub_128(a);
}
// code path selection
rft_sub_128_t cft1st_128;
rft_sub_128_t cftmdl_128;
rft_sub_128_t rftfsub_128;
rft_sub_128_t rftbsub_128;
RftSub128 cft1st_128;
RftSub128 cftmdl_128;
RftSub128 rftfsub_128;
RftSub128 rftbsub_128;
RftSub128 cftfsub_128;
RftSub128 cftbsub_128;
RftSub128 bitrv2_128;
void aec_rdft_init(void) {
cft1st_128 = cft1st_128_C;
cftmdl_128 = cftmdl_128_C;
rftfsub_128 = rftfsub_128_C;
rftbsub_128 = rftbsub_128_C;
cftfsub_128 = cftfsub_128_C;
cftbsub_128 = cftbsub_128_C;
bitrv2_128 = bitrv2_128_C;
#if defined(WEBRTC_ARCH_X86_FAMILY)
if (WebRtc_GetCPUInfo(kSSE2)) {
#if defined(WEBRTC_USE_SSE2)
aec_rdft_init_sse2();
#endif
}
// init library constants.
makewt_32();
makect_32();
#endif
#if defined(MIPS_FPU_LE)
aec_rdft_init_mips();
#endif
#if defined(WEBRTC_HAS_NEON)
aec_rdft_init_neon();
#elif defined(WEBRTC_DETECT_NEON)
if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
aec_rdft_init_neon();
}
#endif
}

View File

@ -11,6 +11,8 @@
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
#include "webrtc/modules/audio_processing/aec/aec_common.h"
// These intrinsics were unavailable before VS 2008.
// TODO(andrew): move to a common file.
#if defined(_MSC_VER) && _MSC_VER < 1500
@ -19,39 +21,41 @@ static __inline __m128 _mm_castsi128_ps(__m128i a) { return *(__m128*)&a; }
static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
#endif
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
// constants shared by all paths (C, SSE2).
extern float rdft_w[64];
// constants used by the C path.
extern float rdft_wk3ri_first[32];
extern float rdft_wk3ri_second[32];
// constants used by SSE2 but initialized in C path.
extern float rdft_wk1r[32];
extern float rdft_wk2r[32];
extern float rdft_wk3r[32];
extern float rdft_wk1i[32];
extern float rdft_wk2i[32];
extern float rdft_wk3i[32];
extern float cftmdl_wk1r[4];
// Constants shared by all paths (C, SSE2, NEON).
extern const float rdft_w[64];
// Constants used by the C path.
extern const float rdft_wk3ri_first[16];
extern const float rdft_wk3ri_second[16];
// Constants used by SSE2 and NEON but initialized in the C path.
extern ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32];
extern ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32];
extern ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32];
extern ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32];
extern ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32];
extern ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32];
extern ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4];
// code path selection function pointers
typedef void (*rft_sub_128_t)(float *a);
extern rft_sub_128_t rftfsub_128;
extern rft_sub_128_t rftbsub_128;
extern rft_sub_128_t cft1st_128;
extern rft_sub_128_t cftmdl_128;
typedef void (*RftSub128)(float* a);
extern RftSub128 rftfsub_128;
extern RftSub128 rftbsub_128;
extern RftSub128 cft1st_128;
extern RftSub128 cftmdl_128;
extern RftSub128 cftfsub_128;
extern RftSub128 cftbsub_128;
extern RftSub128 bitrv2_128;
// entry points
void aec_rdft_init(void);
void aec_rdft_init_sse2(void);
void aec_rdft_forward_128(float *a);
void aec_rdft_inverse_128(float *a);
void aec_rdft_forward_128(float* a);
void aec_rdft_inverse_128(float* a);
#if defined(MIPS_FPU_LE)
void aec_rdft_init_mips(void);
#endif
#if defined(WEBRTC_DETECT_NEON) || defined(WEBRTC_HAS_NEON)
void aec_rdft_init_neon(void);
#endif
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,355 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* The rdft AEC algorithm, neon version of speed-critical functions.
*
* Based on the sse2 version.
*/
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
#include <arm_neon.h>
static const ALIGN16_BEG float ALIGN16_END
k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
static void cft1st_128_neon(float* a) {
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
int j, k2;
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
float32x4_t a00v = vld1q_f32(&a[j + 0]);
float32x4_t a04v = vld1q_f32(&a[j + 4]);
float32x4_t a08v = vld1q_f32(&a[j + 8]);
float32x4_t a12v = vld1q_f32(&a[j + 12]);
float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
float32x4_t x0v = vaddq_f32(a01v, a23v);
const float32x4_t x1v = vsubq_f32(a01v, a23v);
const float32x4_t x2v = vaddq_f32(a45v, a67v);
const float32x4_t x3v = vsubq_f32(a45v, a67v);
const float32x4_t x3w = vrev64q_f32(x3v);
float32x4_t x0w;
a01v = vaddq_f32(x0v, x2v);
x0v = vsubq_f32(x0v, x2v);
x0w = vrev64q_f32(x0v);
a45v = vmulq_f32(wk2rv, x0v);
a45v = vmlaq_f32(a45v, wk2iv, x0w);
x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
x0w = vrev64q_f32(x0v);
a23v = vmulq_f32(wk1rv, x0v);
a23v = vmlaq_f32(a23v, wk1iv, x0w);
x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
x0w = vrev64q_f32(x0v);
a67v = vmulq_f32(wk3rv, x0v);
a67v = vmlaq_f32(a67v, wk3iv, x0w);
a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
vst1q_f32(&a[j + 0], a00v);
vst1q_f32(&a[j + 4], a04v);
vst1q_f32(&a[j + 8], a08v);
vst1q_f32(&a[j + 12], a12v);
}
}
static void cftmdl_128_neon(float* a) {
int j;
const int l = 8;
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
for (j = 0; j < l; j += 2) {
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
const float32x4_t x1_x3_add =
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
const float32x4_t x1_x3_sub =
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
const float32x4_t xx1_rev = vrev64q_f32(xx1);
const float32x4_t yy4_rev = vrev64q_f32(yy4);
vst1_f32(&a[j + 0], vget_low_f32(xx0));
vst1_f32(&a[j + 32], vget_high_f32(xx0));
vst1_f32(&a[j + 16], vget_low_f32(xx1));
vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
a[j + 48] = -a[j + 48];
vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
vst1_f32(&a[j + 40], vget_low_f32(yy4));
vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
}
{
const int k = 64;
const int k1 = 2;
const int k2 = 2 * k1;
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
for (j = k; j < l + k; j += 2) {
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
const float32x4_t x1_x3_add =
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
const float32x4_t x1_x3_sub =
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
vst1_f32(&a[j + 0], vget_low_f32(xx));
vst1_f32(&a[j + 32], vget_high_f32(xx));
vst1_f32(&a[j + 16], vget_low_f32(xx4));
vst1_f32(&a[j + 48], vget_high_f32(xx4));
vst1_f32(&a[j + 8], vget_low_f32(xx12));
vst1_f32(&a[j + 40], vget_high_f32(xx12));
vst1_f32(&a[j + 24], vget_low_f32(xx22));
vst1_f32(&a[j + 56], vget_high_f32(xx22));
}
}
}
__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
// A B C D -> C D A B
const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
// C D A B -> D C B A
return vrev64q_f32(rev);
}
static void rftfsub_128_neon(float* a) {
const float* c = rdft_w + 32;
int j1, j2;
const float32x4_t mm_half = vdupq_n_f32(0.5f);
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
// 2, 4, 6, 8, 3, 5, 7, 9
float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
// 120, 122, 124, 126, 121, 123, 125, 127,
const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
// 126, 124, 122, 120
const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
// 127, 125, 123, 121
const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
// Calculate 'x'.
const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr;
const float32x4_t a_ = vmulq_f32(wkr_, xr_);
const float32x4_t b_ = vmulq_f32(wki_, xi_);
const float32x4_t c_ = vmulq_f32(wkr_, xi_);
const float32x4_t d_ = vmulq_f32(wki_, xr_);
const float32x4_t yr_ = vsubq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const float32x4_t yi_ = vaddq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
// a[k2 + 1] -= yi;
// 126, 124, 122, 120,
const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
// 127, 125, 123, 121,
const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
// Shuffle in right order and store.
const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
// 124, 125, 126, 127, 120, 121, 122, 123
const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
// 2, 4, 6, 8,
a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
// 3, 5, 7, 9,
a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
// 2, 3, 4, 5, 6, 7, 8, 9,
vst2q_f32(&a[0 + j2], a_j2_p);
vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
const int k2 = 128 - j2;
const int k1 = 32 - j1;
const float wkr = 0.5f - c[k1];
const float wki = c[j1];
const float xr = a[j2 + 0] - a[k2 + 0];
const float xi = a[j2 + 1] + a[k2 + 1];
const float yr = wkr * xr - wki * xi;
const float yi = wkr * xi + wki * xr;
a[j2 + 0] -= yr;
a[j2 + 1] -= yi;
a[k2 + 0] += yr;
a[k2 + 1] -= yi;
}
}
static void rftbsub_128_neon(float* a) {
const float* c = rdft_w + 32;
int j1, j2;
const float32x4_t mm_half = vdupq_n_f32(0.5f);
a[1] = -a[1];
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
// 2, 4, 6, 8, 3, 5, 7, 9
float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
// 120, 122, 124, 126, 121, 123, 125, 127,
const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
// 126, 124, 122, 120
const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
// 127, 125, 123, 121
const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
// Calculate 'x'.
const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr;
const float32x4_t a_ = vmulq_f32(wkr_, xr_);
const float32x4_t b_ = vmulq_f32(wki_, xi_);
const float32x4_t c_ = vmulq_f32(wkr_, xi_);
const float32x4_t d_ = vmulq_f32(wki_, xr_);
const float32x4_t yr_ = vaddq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const float32x4_t yi_ = vsubq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
// a[k2 + 1] -= yi;
// 126, 124, 122, 120,
const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
// 127, 125, 123, 121,
const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
// Shuffle in right order and store.
// 2, 3, 4, 5, 6, 7, 8, 9,
const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
// 124, 125, 126, 127, 120, 121, 122, 123
const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
// 2, 4, 6, 8,
a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
// 3, 5, 7, 9,
a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
// 2, 3, 4, 5, 6, 7, 8, 9,
vst2q_f32(&a[0 + j2], a_j2_p);
vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
const int k2 = 128 - j2;
const int k1 = 32 - j1;
const float wkr = 0.5f - c[k1];
const float wki = c[j1];
const float xr = a[j2 + 0] - a[k2 + 0];
const float xi = a[j2 + 1] + a[k2 + 1];
const float yr = wkr * xr + wki * xi;
const float yi = wkr * xi - wki * xr;
a[j2 + 0] = a[j2 + 0] - yr;
a[j2 + 1] = yi - a[j2 + 1];
a[k2 + 0] = yr + a[k2 + 0];
a[k2 + 1] = yi - a[k2 + 1];
}
a[65] = -a[65];
}
void aec_rdft_init_neon(void) {
cft1st_128 = cft1st_128_neon;
cftmdl_128 = cftmdl_128_neon;
rftfsub_128 = rftfsub_128_neon;
rftbsub_128 = rftbsub_128_neon;
}

View File

@ -8,172 +8,168 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "typedefs.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
#if defined(WEBRTC_USE_SSE2)
#include <emmintrin.h>
#include "aec_rdft.h"
static const ALIGN16_BEG float ALIGN16_END
k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] =
{-1.f, 1.f, -1.f, 1.f};
static void cft1st_128_SSE2(float *a) {
static void cft1st_128_SSE2(float* a) {
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
int j, k2;
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
__m128 a00v = _mm_loadu_ps(&a[j + 0]);
__m128 a04v = _mm_loadu_ps(&a[j + 4]);
__m128 a08v = _mm_loadu_ps(&a[j + 8]);
__m128 a12v = _mm_loadu_ps(&a[j + 12]);
__m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1 ,0));
__m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3 ,2));
__m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1 ,0));
__m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3 ,2));
__m128 a00v = _mm_loadu_ps(&a[j + 0]);
__m128 a04v = _mm_loadu_ps(&a[j + 4]);
__m128 a08v = _mm_loadu_ps(&a[j + 8]);
__m128 a12v = _mm_loadu_ps(&a[j + 12]);
__m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
__m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
__m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
__m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
__m128 x0v = _mm_add_ps(a01v, a23v);
const __m128 x1v = _mm_sub_ps(a01v, a23v);
const __m128 x2v = _mm_add_ps(a45v, a67v);
const __m128 x3v = _mm_sub_ps(a45v, a67v);
__m128 x0w;
a01v = _mm_add_ps(x0v, x2v);
x0v = _mm_sub_ps(x0v, x2v);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
__m128 x0v = _mm_add_ps(a01v, a23v);
const __m128 x1v = _mm_sub_ps(a01v, a23v);
const __m128 x2v = _mm_add_ps(a45v, a67v);
const __m128 x3v = _mm_sub_ps(a45v, a67v);
__m128 x0w;
a01v = _mm_add_ps(x0v, x2v);
x0v = _mm_sub_ps(x0v, x2v);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
{
const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
a45v = _mm_add_ps(a45_0v, a45_1v);
a45v = _mm_add_ps(a45_0v, a45_1v);
}
{
__m128 a23_0v, a23_1v;
const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1));
const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
x0v = _mm_add_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
a23_0v = _mm_mul_ps(wk1rv, x0v);
a23_1v = _mm_mul_ps(wk1iv, x0w);
a23v = _mm_add_ps(a23_0v, a23_1v);
__m128 a23_0v, a23_1v;
const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
x0v = _mm_add_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
a23_0v = _mm_mul_ps(wk1rv, x0v);
a23_1v = _mm_mul_ps(wk1iv, x0w);
a23v = _mm_add_ps(a23_0v, a23_1v);
x0v = _mm_sub_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
x0v = _mm_sub_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
}
{
const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
a67v = _mm_add_ps(a67_0v, a67_1v);
a67v = _mm_add_ps(a67_0v, a67_1v);
}
a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1 ,0));
a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1 ,0));
a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3 ,2));
a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3 ,2));
_mm_storeu_ps(&a[j + 0], a00v);
_mm_storeu_ps(&a[j + 4], a04v);
_mm_storeu_ps(&a[j + 8], a08v);
a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
_mm_storeu_ps(&a[j + 0], a00v);
_mm_storeu_ps(&a[j + 4], a04v);
_mm_storeu_ps(&a[j + 8], a08v);
_mm_storeu_ps(&a[j + 12], a12v);
}
}
static void cftmdl_128_SSE2(float *a) {
static void cftmdl_128_SSE2(float* a) {
const int l = 8;
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
int j0;
__m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
for (j0 = 0; j0 < l; j0 += 2) {
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
_mm_castsi128_ps(a_32),
_MM_SHUFFLE(1, 0, 1 ,0));
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
_mm_castsi128_ps(a_40),
_MM_SHUFFLE(1, 0, 1 ,0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
_mm_castsi128_ps(a_32),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
_mm_castsi128_ps(a_40),
_MM_SHUFFLE(1, 0, 1, 0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
_mm_castsi128_ps(a_48),
_MM_SHUFFLE(1, 0, 1 ,0));
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
_mm_castsi128_ps(a_56),
_MM_SHUFFLE(1, 0, 1 ,0));
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
_mm_castsi128_ps(a_48),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
_mm_castsi128_ps(a_56),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
_MM_SHUFFLE(2, 3, 0, 1)));
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
_MM_SHUFFLE(2, 2, 2 ,2));
const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
_MM_SHUFFLE(3, 3, 3 ,3));
const __m128 yy0 =
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 yy1 =
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
const __m128 yy3 = _mm_add_ps(yy0, yy2);
const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
_mm_storel_epi64((__m128i*)&a[j0 + 32],
_mm_shuffle_epi32(_mm_castps_si128(xx0),
_MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
_mm_storel_epi64(
(__m128i*)&a[j0 + 32],
_mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
_mm_storel_epi64((__m128i*)&a[j0 + 48],
_mm_shuffle_epi32(_mm_castps_si128(xx1),
_MM_SHUFFLE(2, 3, 2, 3)));
_mm_storel_epi64(
(__m128i*)&a[j0 + 48],
_mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
a[j0 + 48] = -a[j0 + 48];
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
_mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
_mm_storel_epi64((__m128i*)&a[j0 + 56],
_mm_shuffle_epi32(_mm_castps_si128(yy4),
_MM_SHUFFLE(2, 3, 2, 3)));
_mm_storel_epi64(
(__m128i*)&a[j0 + 56],
_mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
}
{
int k = 64;
int k1 = 2;
int k2 = 2 * k1;
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]);
wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]);
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
for (j0 = k; j0 < l + k; j0 += 2) {
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
_mm_castsi128_ps(a_32),
_MM_SHUFFLE(1, 0, 1 ,0));
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
_mm_castsi128_ps(a_40),
_MM_SHUFFLE(1, 0, 1 ,0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
_MM_SHUFFLE(1, 0, 1, 0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
@ -182,100 +178,102 @@ static void cftmdl_128_SSE2(float *a) {
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
_mm_castsi128_ps(a_48),
_MM_SHUFFLE(1, 0, 1 ,0));
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
_mm_castsi128_ps(a_56),
_MM_SHUFFLE(1, 0, 1 ,0));
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv);
const __m128 xx3 = _mm_mul_ps(wk2iv,
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
_MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
const __m128 xx3 =
_mm_mul_ps(wk2iv,
_mm_castsi128_ps(_mm_shuffle_epi32(
_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx4 = _mm_add_ps(xx2, xx3);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
_MM_SHUFFLE(2, 3, 0, 1)));
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
const __m128 xx11 = _mm_mul_ps(wk1iv,
const __m128 xx11 = _mm_mul_ps(
wk1iv,
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
_MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx12 = _mm_add_ps(xx10, xx11);
const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
const __m128 xx21 = _mm_mul_ps(wk3iv,
const __m128 xx21 = _mm_mul_ps(
wk3iv,
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
_MM_SHUFFLE(2, 3, 0, 1))));
_MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx22 = _mm_add_ps(xx20, xx21);
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
_mm_storel_epi64((__m128i*)&a[j0 + 32],
_mm_shuffle_epi32(_mm_castps_si128(xx),
_MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
_mm_storel_epi64(
(__m128i*)&a[j0 + 32],
_mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
_mm_storel_epi64((__m128i*)&a[j0 + 48],
_mm_shuffle_epi32(_mm_castps_si128(xx4),
_MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64(
(__m128i*)&a[j0 + 48],
_mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
_mm_storel_epi64((__m128i*)&a[j0 + 40],
_mm_shuffle_epi32(_mm_castps_si128(xx12),
_MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
_mm_storel_epi64(
(__m128i*)&a[j0 + 40],
_mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
_mm_storel_epi64((__m128i*)&a[j0 + 56],
_mm_shuffle_epi32(_mm_castps_si128(xx22),
_MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64(
(__m128i*)&a[j0 + 56],
_mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
}
}
}
static void rftfsub_128_SSE2(float *a) {
const float *c = rdft_w + 32;
static void rftfsub_128_SSE2(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
static const ALIGN16_BEG float ALIGN16_END k_half[4] =
{0.5f, 0.5f, 0.5f, 0.5f};
static const ALIGN16_BEG float ALIGN16_END
k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
const __m128 mm_half = _mm_load_ps(k_half);
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const __m128 c_j1 = _mm_loadu_ps(&c[ j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 wkr_ =
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
_MM_SHUFFLE(2, 0, 2 ,0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
_MM_SHUFFLE(3, 1, 3 ,1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
_MM_SHUFFLE(0, 2, 0 ,2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
_MM_SHUFFLE(1, 3, 1 ,3)); // 127, 125, 123, 121,
const __m128 a_j2_p0 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
// Calculate 'x'.
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
// 2-126, 4-124, 6-122, 8-120,
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr;
@ -283,12 +281,12 @@ static void rftfsub_128_SSE2(float *a) {
const __m128 b_ = _mm_mul_ps(wki_, xi_);
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
const __m128 d_ = _mm_mul_ps(wki_, xr_);
const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
// a[k2 + 1] -= yi;
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9,
@ -296,26 +294,26 @@ static void rftfsub_128_SSE2(float *a) {
const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121,
// Shuffle in right order and store.
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
// 2, 3, 4, 5,
// 2, 3, 4, 5,
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
// 6, 7, 8, 9,
// 6, 7, 8, 9,
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
// 122, 123, 120, 121,
// 122, 123, 120, 121,
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
// 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
_MM_SHUFFLE(1, 0, 3 ,2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
_MM_SHUFFLE(1, 0, 3 ,2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
// 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
@ -329,13 +327,13 @@ static void rftfsub_128_SSE2(float *a) {
}
}
static void rftbsub_128_SSE2(float *a) {
const float *c = rdft_w + 32;
static void rftbsub_128_SSE2(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
static const ALIGN16_BEG float ALIGN16_END k_half[4] =
{0.5f, 0.5f, 0.5f, 0.5f};
static const ALIGN16_BEG float ALIGN16_END
k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
const __m128 mm_half = _mm_load_ps(k_half);
a[1] = -a[1];
@ -343,30 +341,30 @@ static void rftbsub_128_SSE2(float *a) {
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const __m128 c_j1 = _mm_loadu_ps(&c[ j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 wkr_ =
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
_MM_SHUFFLE(2, 0, 2 ,0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
_MM_SHUFFLE(3, 1, 3 ,1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
_MM_SHUFFLE(0, 2, 0 ,2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
_MM_SHUFFLE(1, 3, 1 ,3)); // 127, 125, 123, 121,
const __m128 a_j2_p0 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
// Calculate 'x'.
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
// 2-126, 4-124, 6-122, 8-120,
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr + wki * xi;
// yi = wkr * xi - wki * xr;
@ -374,12 +372,12 @@ static void rftbsub_128_SSE2(float *a) {
const __m128 b_ = _mm_mul_ps(wki_, xi_);
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
const __m128 d_ = _mm_mul_ps(wki_, xr_);
const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] = a[j2 + 0] - yr;
// a[j2 + 1] = yi - a[j2 + 1];
// a[k2 + 0] = yr + a[k2 + 0];
const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] = a[j2 + 0] - yr;
// a[j2 + 1] = yi - a[j2 + 1];
// a[k2 + 0] = yr + a[k2 + 0];
// a[k2 + 1] = yi - a[k2 + 1];
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9,
@ -387,26 +385,26 @@ static void rftbsub_128_SSE2(float *a) {
const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121,
// Shuffle in right order and store.
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
// 2, 3, 4, 5,
// 2, 3, 4, 5,
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
// 6, 7, 8, 9,
// 6, 7, 8, 9,
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
// 122, 123, 120, 121,
// 122, 123, 120, 121,
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
// 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
_MM_SHUFFLE(1, 0, 3 ,2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
_MM_SHUFFLE(1, 0, 3 ,2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
// 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
@ -427,5 +425,3 @@ void aec_rdft_init_sse2(void) {
rftfsub_128 = rftfsub_128_SSE2;
rftbsub_128 = rftbsub_128_SSE2;
}
#endif // WEBRTC_USE_SS2

View File

@ -0,0 +1,209 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for
* clock skew by resampling the farend signal.
*/
#include "webrtc/modules/audio_processing/aec/aec_resampler.h"
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "webrtc/modules/audio_processing/aec/aec_core.h"
enum {
kEstimateLengthFrames = 400
};
typedef struct {
float buffer[kResamplerBufferSize];
float position;
int deviceSampleRateHz;
int skewData[kEstimateLengthFrames];
int skewDataIndex;
float skewEstimate;
} AecResampler;
static int EstimateSkew(const int* rawSkew,
int size,
int absLimit,
float* skewEst);
void* WebRtcAec_CreateResampler() {
return malloc(sizeof(AecResampler));
}
int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz) {
AecResampler* obj = (AecResampler*)resampInst;
memset(obj->buffer, 0, sizeof(obj->buffer));
obj->position = 0.0;
obj->deviceSampleRateHz = deviceSampleRateHz;
memset(obj->skewData, 0, sizeof(obj->skewData));
obj->skewDataIndex = 0;
obj->skewEstimate = 0.0;
return 0;
}
void WebRtcAec_FreeResampler(void* resampInst) {
AecResampler* obj = (AecResampler*)resampInst;
free(obj);
}
void WebRtcAec_ResampleLinear(void* resampInst,
const float* inspeech,
size_t size,
float skew,
float* outspeech,
size_t* size_out) {
AecResampler* obj = (AecResampler*)resampInst;
float* y;
float be, tnew;
size_t tn, mm;
assert(size <= 2 * FRAME_LEN);
assert(resampInst != NULL);
assert(inspeech != NULL);
assert(outspeech != NULL);
assert(size_out != NULL);
// Add new frame data in lookahead
memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
inspeech,
size * sizeof(inspeech[0]));
// Sample rate ratio
be = 1 + skew;
// Loop over input frame
mm = 0;
y = &obj->buffer[FRAME_LEN]; // Point at current frame
tnew = be * mm + obj->position;
tn = (size_t)tnew;
while (tn < size) {
// Interpolation
outspeech[mm] = y[tn] + (tnew - tn) * (y[tn + 1] - y[tn]);
mm++;
tnew = be * mm + obj->position;
tn = (int)tnew;
}
*size_out = mm;
obj->position += (*size_out) * be - size;
// Shift buffer
memmove(obj->buffer,
&obj->buffer[size],
(kResamplerBufferSize - size) * sizeof(obj->buffer[0]));
}
int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst) {
AecResampler* obj = (AecResampler*)resampInst;
int err = 0;
if (obj->skewDataIndex < kEstimateLengthFrames) {
obj->skewData[obj->skewDataIndex] = rawSkew;
obj->skewDataIndex++;
} else if (obj->skewDataIndex == kEstimateLengthFrames) {
err = EstimateSkew(
obj->skewData, kEstimateLengthFrames, obj->deviceSampleRateHz, skewEst);
obj->skewEstimate = *skewEst;
obj->skewDataIndex++;
} else {
*skewEst = obj->skewEstimate;
}
return err;
}
int EstimateSkew(const int* rawSkew,
int size,
int deviceSampleRateHz,
float* skewEst) {
const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
int i = 0;
int n = 0;
float rawAvg = 0;
float err = 0;
float rawAbsDev = 0;
int upperLimit = 0;
int lowerLimit = 0;
float cumSum = 0;
float x = 0;
float x2 = 0;
float y = 0;
float xy = 0;
float xAvg = 0;
float denom = 0;
float skew = 0;
*skewEst = 0; // Set in case of error below.
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
n++;
rawAvg += rawSkew[i];
}
}
if (n == 0) {
return -1;
}
assert(n > 0);
rawAvg /= n;
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
err = rawSkew[i] - rawAvg;
rawAbsDev += err >= 0 ? err : -err;
}
}
assert(n > 0);
rawAbsDev /= n;
upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1); // +1 for ceiling.
lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1); // -1 for floor.
n = 0;
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
(rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
n++;
cumSum += rawSkew[i];
x += n;
x2 += n * n;
y += cumSum;
xy += n * cumSum;
}
}
if (n == 0) {
return -1;
}
assert(n > 0);
xAvg = x / n;
denom = x2 - xAvg * x;
if (denom != 0) {
skew = (xy - xAvg * y) / denom;
}
*skewEst = skew;
return 0;
}

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
#include "webrtc/modules/audio_processing/aec/aec_core.h"
enum {
kResamplingDelay = 1
};
enum {
kResamplerBufferSize = FRAME_LEN * 4
};
// Unless otherwise specified, functions return 0 on success and -1 on error.
void* WebRtcAec_CreateResampler(); // Returns NULL on error.
int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz);
void WebRtcAec_FreeResampler(void* resampInst);
// Estimates skew from raw measurement.
int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst);
// Resamples input using linear interpolation.
void WebRtcAec_ResampleLinear(void* resampInst,
const float* inspeech,
size_t size,
float skew,
float* outspeech,
size_t* size_out);
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,67 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
#include "webrtc/common_audio/ring_buffer.h"
#include "webrtc/modules/audio_processing/aec/aec_core.h"
typedef struct {
int delayCtr;
int sampFreq;
int splitSampFreq;
int scSampFreq;
float sampFactor; // scSampRate / sampFreq
short skewMode;
int bufSizeStart;
int knownDelay;
int rate_factor;
short initFlag; // indicates if AEC has been initialized
// Variables used for averaging far end buffer size
short counter;
int sum;
short firstVal;
short checkBufSizeCtr;
// Variables used for delay shifts
short msInSndCardBuf;
short filtDelay; // Filtered delay estimate.
int timeForDelayChange;
int startup_phase;
int checkBuffSize;
short lastDelayDiff;
#ifdef WEBRTC_AEC_DEBUG_DUMP
FILE* bufFile;
FILE* delayFile;
FILE* skewFile;
#endif
// Structures
void* resampler;
int skewFrCtr;
int resample; // if the skew is small enough we don't resample
int highSkewCtr;
float skew;
RingBuffer* far_pre_buf; // Time domain far-end pre-buffer.
int lastError;
int farend_started;
AecCore* aec;
} Aec;
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_

View File

@ -0,0 +1,245 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
#include <stddef.h>
#include "webrtc/typedefs.h"
// Errors
#define AEC_UNSPECIFIED_ERROR 12000
#define AEC_UNSUPPORTED_FUNCTION_ERROR 12001
#define AEC_UNINITIALIZED_ERROR 12002
#define AEC_NULL_POINTER_ERROR 12003
#define AEC_BAD_PARAMETER_ERROR 12004
// Warnings
#define AEC_BAD_PARAMETER_WARNING 12050
enum {
kAecNlpConservative = 0,
kAecNlpModerate,
kAecNlpAggressive
};
enum {
kAecFalse = 0,
kAecTrue
};
typedef struct {
int16_t nlpMode; // default kAecNlpModerate
int16_t skewMode; // default kAecFalse
int16_t metricsMode; // default kAecFalse
int delay_logging; // default kAecFalse
// float realSkew;
} AecConfig;
typedef struct {
int instant;
int average;
int max;
int min;
} AecLevel;
typedef struct {
AecLevel rerl;
AecLevel erl;
AecLevel erle;
AecLevel aNlp;
} AecMetrics;
struct AecCore;
#ifdef __cplusplus
extern "C" {
#endif
/*
* Allocates the memory needed by the AEC. The memory needs to be initialized
* separately using the WebRtcAec_Init() function. Returns a pointer to the
* object or NULL on error.
*/
void* WebRtcAec_Create();
/*
* This function releases the memory allocated by WebRtcAec_Create().
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
*/
void WebRtcAec_Free(void* aecInst);
/*
* Initializes an AEC instance.
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
* int32_t sampFreq Sampling frequency of data
* int32_t scSampFreq Soundcard sampling frequency
*
* Outputs Description
* -------------------------------------------------------------------
* int32_t return 0: OK
* -1: error
*/
int32_t WebRtcAec_Init(void* aecInst, int32_t sampFreq, int32_t scSampFreq);
/*
* Inserts an 80 or 160 sample block of data into the farend buffer.
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
* const float* farend In buffer containing one frame of
* farend signal for L band
* int16_t nrOfSamples Number of samples in farend buffer
*
* Outputs Description
* -------------------------------------------------------------------
* int32_t return 0: OK
* -1: error
*/
int32_t WebRtcAec_BufferFarend(void* aecInst,
const float* farend,
size_t nrOfSamples);
/*
* Runs the echo canceller on an 80 or 160 sample blocks of data.
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
* float* const* nearend In buffer containing one frame of
* nearend+echo signal for each band
* int num_bands Number of bands in nearend buffer
* int16_t nrOfSamples Number of samples in nearend buffer
* int16_t msInSndCardBuf Delay estimate for sound card and
* system buffers
* int16_t skew Difference between number of samples played
* and recorded at the soundcard (for clock skew
* compensation)
*
* Outputs Description
* -------------------------------------------------------------------
* float* const* out Out buffer, one frame of processed nearend
* for each band
* int32_t return 0: OK
* -1: error
*/
int32_t WebRtcAec_Process(void* aecInst,
const float* const* nearend,
size_t num_bands,
float* const* out,
size_t nrOfSamples,
int16_t msInSndCardBuf,
int32_t skew);
/*
* This function enables the user to set certain parameters on-the-fly.
*
* Inputs Description
* -------------------------------------------------------------------
* void* handle Pointer to the AEC instance
* AecConfig config Config instance that contains all
* properties to be set
*
* Outputs Description
* -------------------------------------------------------------------
* int return 0: OK
* -1: error
*/
int WebRtcAec_set_config(void* handle, AecConfig config);
/*
* Gets the current echo status of the nearend signal.
*
* Inputs Description
* -------------------------------------------------------------------
* void* handle Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* int* status 0: Almost certainly nearend single-talk
* 1: Might not be neared single-talk
* int return 0: OK
* -1: error
*/
int WebRtcAec_get_echo_status(void* handle, int* status);
/*
* Gets the current echo metrics for the session.
*
* Inputs Description
* -------------------------------------------------------------------
* void* handle Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* AecMetrics* metrics Struct which will be filled out with the
* current echo metrics.
* int return 0: OK
* -1: error
*/
int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics);
/*
* Gets the current delay metrics for the session.
*
* Inputs Description
* -------------------------------------------------------------------
* void* handle Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* int* median Delay median value.
* int* std Delay standard deviation.
* float* fraction_poor_delays Fraction of the delay estimates that may
* cause the AEC to perform poorly.
*
* int return 0: OK
* -1: error
*/
int WebRtcAec_GetDelayMetrics(void* handle,
int* median,
int* std,
float* fraction_poor_delays);
/*
* Gets the last error code.
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* int32_t return 11000-11100: error code
*/
int32_t WebRtcAec_get_error_code(void* aecInst);
// Returns a pointer to the low level AEC handle.
//
// Input:
// - handle : Pointer to the AEC instance.
//
// Return value:
// - AecCore pointer : NULL for error.
//
struct AecCore* WebRtcAec_aec_core(void* handle);
#ifdef __cplusplus
}
#endif
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_

View File

@ -1,278 +0,0 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_
#include "typedefs.h"
// Errors
#define AEC_UNSPECIFIED_ERROR 12000
#define AEC_UNSUPPORTED_FUNCTION_ERROR 12001
#define AEC_UNINITIALIZED_ERROR 12002
#define AEC_NULL_POINTER_ERROR 12003
#define AEC_BAD_PARAMETER_ERROR 12004
// Warnings
#define AEC_BAD_PARAMETER_WARNING 12050
enum {
kAecNlpConservative = 0,
kAecNlpModerate,
kAecNlpAggressive
};
enum {
kAecFalse = 0,
kAecTrue
};
typedef struct {
WebRtc_Word16 nlpMode; // default kAecNlpModerate
WebRtc_Word16 skewMode; // default kAecFalse
WebRtc_Word16 metricsMode; // default kAecFalse
int delay_logging; // default kAecFalse
//float realSkew;
} AecConfig;
typedef struct {
WebRtc_Word16 instant;
WebRtc_Word16 average;
WebRtc_Word16 max;
WebRtc_Word16 min;
} AecLevel;
typedef struct {
AecLevel rerl;
AecLevel erl;
AecLevel erle;
AecLevel aNlp;
} AecMetrics;
#ifdef __cplusplus
extern "C" {
#endif
/*
* Allocates the memory needed by the AEC. The memory needs to be initialized
* separately using the WebRtcAec_Init() function.
*
* Inputs Description
* -------------------------------------------------------------------
* void **aecInst Pointer to the AEC instance to be created
* and initialized
*
* Outputs Description
* -------------------------------------------------------------------
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_Create(void **aecInst);
/*
* This function releases the memory allocated by WebRtcAec_Create().
*
* Inputs Description
* -------------------------------------------------------------------
* void *aecInst Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_Free(void *aecInst);
/*
* Initializes an AEC instance.
*
* Inputs Description
* -------------------------------------------------------------------
* void *aecInst Pointer to the AEC instance
* WebRtc_Word32 sampFreq Sampling frequency of data
* WebRtc_Word32 scSampFreq Soundcard sampling frequency
*
* Outputs Description
* -------------------------------------------------------------------
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_Init(void *aecInst,
WebRtc_Word32 sampFreq,
WebRtc_Word32 scSampFreq);
/*
* Inserts an 80 or 160 sample block of data into the farend buffer.
*
* Inputs Description
* -------------------------------------------------------------------
* void *aecInst Pointer to the AEC instance
* WebRtc_Word16 *farend In buffer containing one frame of
* farend signal for L band
* WebRtc_Word16 nrOfSamples Number of samples in farend buffer
*
* Outputs Description
* -------------------------------------------------------------------
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_BufferFarend(void *aecInst,
const WebRtc_Word16 *farend,
WebRtc_Word16 nrOfSamples);
/*
* Runs the echo canceller on an 80 or 160 sample blocks of data.
*
* Inputs Description
* -------------------------------------------------------------------
* void *aecInst Pointer to the AEC instance
* WebRtc_Word16 *nearend In buffer containing one frame of
* nearend+echo signal for L band
* WebRtc_Word16 *nearendH In buffer containing one frame of
* nearend+echo signal for H band
* WebRtc_Word16 nrOfSamples Number of samples in nearend buffer
* WebRtc_Word16 msInSndCardBuf Delay estimate for sound card and
* system buffers
* WebRtc_Word16 skew Difference between number of samples played
* and recorded at the soundcard (for clock skew
* compensation)
*
* Outputs Description
* -------------------------------------------------------------------
* WebRtc_Word16 *out Out buffer, one frame of processed nearend
* for L band
* WebRtc_Word16 *outH Out buffer, one frame of processed nearend
* for H band
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_Process(void *aecInst,
const WebRtc_Word16 *nearend,
const WebRtc_Word16 *nearendH,
WebRtc_Word16 *out,
WebRtc_Word16 *outH,
WebRtc_Word16 nrOfSamples,
WebRtc_Word16 msInSndCardBuf,
WebRtc_Word32 skew);
/*
* This function enables the user to set certain parameters on-the-fly.
*
* Inputs Description
* -------------------------------------------------------------------
* void *aecInst Pointer to the AEC instance
* AecConfig config Config instance that contains all
* properties to be set
*
* Outputs Description
* -------------------------------------------------------------------
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_set_config(void *aecInst, AecConfig config);
/*
* Gets the on-the-fly paramters.
*
* Inputs Description
* -------------------------------------------------------------------
* void *aecInst Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* AecConfig *config Pointer to the config instance that
* all properties will be written to
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_get_config(void *aecInst, AecConfig *config);
/*
* Gets the current echo status of the nearend signal.
*
* Inputs Description
* -------------------------------------------------------------------
* void *aecInst Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* WebRtc_Word16 *status 0: Almost certainly nearend single-talk
* 1: Might not be neared single-talk
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_get_echo_status(void *aecInst, WebRtc_Word16 *status);
/*
* Gets the current echo metrics for the session.
*
* Inputs Description
* -------------------------------------------------------------------
* void *aecInst Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* AecMetrics *metrics Struct which will be filled out with the
* current echo metrics.
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_GetMetrics(void *aecInst, AecMetrics *metrics);
/*
* Gets the current delay metrics for the session.
*
* Inputs Description
* -------------------------------------------------------------------
* void* handle Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* int* median Delay median value.
* int* std Delay standard deviation.
*
* int return 0: OK
* -1: error
*/
int WebRtcAec_GetDelayMetrics(void* handle, int* median, int* std);
/*
* Gets the last error code.
*
* Inputs Description
* -------------------------------------------------------------------
* void *aecInst Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* WebRtc_Word32 return 11000-11100: error code
*/
WebRtc_Word32 WebRtcAec_get_error_code(void *aecInst);
/*
* Gets a version string.
*
* Inputs Description
* -------------------------------------------------------------------
* char *versionStr Pointer to a string array
* WebRtc_Word16 len The maximum length of the string
*
* Outputs Description
* -------------------------------------------------------------------
* WebRtc_Word8 *versionStr Pointer to a string array
* WebRtc_Word32 return 0: OK
* -1: error
*/
WebRtc_Word32 WebRtcAec_get_version(WebRtc_Word8 *versionStr, WebRtc_Word16 len);
#ifdef __cplusplus
}
#endif
#endif /* WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_ */

View File

@ -1,233 +0,0 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for clock
* skew by resampling the farend signal.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "resampler.h"
#include "aec_core.h"
enum { kFrameBufferSize = FRAME_LEN * 4 };
enum { kEstimateLengthFrames = 400 };
typedef struct {
short buffer[kFrameBufferSize];
float position;
int deviceSampleRateHz;
int skewData[kEstimateLengthFrames];
int skewDataIndex;
float skewEstimate;
} resampler_t;
static int EstimateSkew(const int* rawSkew,
int size,
int absLimit,
float *skewEst);
int WebRtcAec_CreateResampler(void **resampInst)
{
resampler_t *obj = malloc(sizeof(resampler_t));
*resampInst = obj;
if (obj == NULL) {
return -1;
}
return 0;
}
int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz)
{
resampler_t *obj = (resampler_t*) resampInst;
memset(obj->buffer, 0, sizeof(obj->buffer));
obj->position = 0.0;
obj->deviceSampleRateHz = deviceSampleRateHz;
memset(obj->skewData, 0, sizeof(obj->skewData));
obj->skewDataIndex = 0;
obj->skewEstimate = 0.0;
return 0;
}
int WebRtcAec_FreeResampler(void *resampInst)
{
resampler_t *obj = (resampler_t*) resampInst;
free(obj);
return 0;
}
int WebRtcAec_ResampleLinear(void *resampInst,
const short *inspeech,
int size,
float skew,
short *outspeech)
{
resampler_t *obj = (resampler_t*) resampInst;
short *y;
float be, tnew, interp;
int tn, outsize, mm;
if (size < 0 || size > 2 * FRAME_LEN) {
return -1;
}
// Add new frame data in lookahead
memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
inspeech,
size * sizeof(short));
// Sample rate ratio
be = 1 + skew;
// Loop over input frame
mm = 0;
y = &obj->buffer[FRAME_LEN]; // Point at current frame
tnew = be * mm + obj->position;
tn = (int) tnew;
while (tn < size) {
// Interpolation
interp = y[tn] + (tnew - tn) * (y[tn+1] - y[tn]);
if (interp > 32767) {
interp = 32767;
}
else if (interp < -32768) {
interp = -32768;
}
outspeech[mm] = (short) interp;
mm++;
tnew = be * mm + obj->position;
tn = (int) tnew;
}
outsize = mm;
obj->position += outsize * be - size;
// Shift buffer
memmove(obj->buffer,
&obj->buffer[size],
(kFrameBufferSize - size) * sizeof(short));
return outsize;
}
int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst)
{
resampler_t *obj = (resampler_t*)resampInst;
int err = 0;
if (obj->skewDataIndex < kEstimateLengthFrames) {
obj->skewData[obj->skewDataIndex] = rawSkew;
obj->skewDataIndex++;
}
else if (obj->skewDataIndex == kEstimateLengthFrames) {
err = EstimateSkew(obj->skewData,
kEstimateLengthFrames,
obj->deviceSampleRateHz,
skewEst);
obj->skewEstimate = *skewEst;
obj->skewDataIndex++;
}
else {
*skewEst = obj->skewEstimate;
}
return err;
}
int EstimateSkew(const int* rawSkew,
int size,
int deviceSampleRateHz,
float *skewEst)
{
const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
int i = 0;
int n = 0;
float rawAvg = 0;
float err = 0;
float rawAbsDev = 0;
int upperLimit = 0;
int lowerLimit = 0;
float cumSum = 0;
float x = 0;
float x2 = 0;
float y = 0;
float xy = 0;
float xAvg = 0;
float denom = 0;
float skew = 0;
*skewEst = 0; // Set in case of error below.
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
n++;
rawAvg += rawSkew[i];
}
}
if (n == 0) {
return -1;
}
assert(n > 0);
rawAvg /= n;
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
err = rawSkew[i] - rawAvg;
rawAbsDev += err >= 0 ? err : -err;
}
}
assert(n > 0);
rawAbsDev /= n;
upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1); // +1 for ceiling.
lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1); // -1 for floor.
n = 0;
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
(rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
n++;
cumSum += rawSkew[i];
x += n;
x2 += n*n;
y += cumSum;
xy += n * cumSum;
}
}
if (n == 0) {
return -1;
}
assert(n > 0);
xAvg = x / n;
denom = x2 - xAvg*x;
if (denom != 0) {
skew = (xy - xAvg*y) / denom;
}
*skewEst = skew;
return 0;
}

View File

@ -1,32 +0,0 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_
enum { kResamplingDelay = 1 };
// Unless otherwise specified, functions return 0 on success and -1 on error
int WebRtcAec_CreateResampler(void **resampInst);
int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz);
int WebRtcAec_FreeResampler(void *resampInst);
// Estimates skew from raw measurement.
int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst);
// Resamples input using linear interpolation.
// Returns size of resampled array.
int WebRtcAec_ResampleLinear(void *resampInst,
const short *inspeech,
int size,
float skew,
short *outspeech);
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_