Update audio_processing module

Corresponds to upstream commit 524e9b043e7e86fd72353b987c9d5f6a1ebf83e1 Update notes: * Pull in third party license file * Replace .gypi files with BUILD.gn to keep track of what changes upstream * Bunch of new filse pulled in as dependencies * Won't build yet due to changes needed on top of these
2015-10-13 17:25:22 +05:30
parent 5ae7a5d6cd
commit 753eada3aa
324 changed files with 52533 additions and 16117 deletions
--- a/webrtc/modules/audio_processing/aec/Makefile.am
+++ b/webrtc/modules/audio_processing/aec/Makefile.am
@@ -1,16 +0,0 @@
-noinst_LTLIBRARIES = libaec.la
-
-libaec_la_SOURCES = interface/echo_cancellation.h \
-		    echo_cancellation.c \
-		    aec_core.h \
-		    aec_core.c \
-		    aec_core_sse2.c \
-		    aec_rdft.h \
-		    aec_rdft.c \
-		    aec_rdft_sse2.c \
-		    resampler.h \
-		    resampler.c
-libaec_la_CFLAGS = $(AM_CFLAGS) $(COMMON_CFLAGS) \
-		   -I$(top_srcdir)/src/common_audio/signal_processing_library/main/interface \
-		   -I$(top_srcdir)/src/system_wrappers/interface \
-		   -I$(top_srcdir)/src/modules/audio_processing/utility
--- a/webrtc/modules/audio_processing/aec/aec.gypi
+++ b/webrtc/modules/audio_processing/aec/aec.gypi
@@ -1,40 +0,0 @@
-# Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS.  All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-{
-  'targets': [
-    {
-      'target_name': 'aec',
-      'type': '<(library)',
-      'dependencies': [
-        '<(webrtc_root)/common_audio/common_audio.gyp:spl',
-        'apm_util'
-      ],
-      'include_dirs': [
-        'interface',
-      ],
-      'direct_dependent_settings': {
-        'include_dirs': [
-          'interface',
-        ],
-      },
-      'sources': [
-        'interface/echo_cancellation.h',
-        'echo_cancellation.c',
-        'aec_core.h',
-        'aec_core.c',
-        'aec_core_sse2.c',
-        'aec_rdft.h',
-        'aec_rdft.c',
-        'aec_rdft_sse2.c',
-        'resampler.h',
-        'resampler.c',
-      ],
-    },
-  ],
-}
--- a/webrtc/modules/audio_processing/aec/aec_common.h
+++ b/webrtc/modules/audio_processing/aec/aec_common.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
+
+#include "webrtc/typedefs.h"
+
+#ifdef _MSC_VER /* visual c++ */
+#define ALIGN16_BEG __declspec(align(16))
+#define ALIGN16_END
+#else /* gcc or icc */
+#define ALIGN16_BEG
+#define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_sqrtHanning[65];
+extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_weightCurve[65];
+extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_overDriveCurve[65];
+extern const float WebRtcAec_kExtendedSmoothingCoefficients[2][2];
+extern const float WebRtcAec_kNormalSmoothingCoefficients[2][2];
+extern const float WebRtcAec_kMinFarendPSD;
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
+
--- a/webrtc/modules/audio_processing/aec/aec_core.c
+++ b/webrtc/modules/audio_processing/aec/aec_core.c
--- a/webrtc/modules/audio_processing/aec/aec_core.h
+++ b/webrtc/modules/audio_processing/aec/aec_core.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -12,29 +12,18 @@
 * Specifies the interface for the AEC core.
 */

-#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
-#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_

-#include <stdio.h>
+#include <stddef.h>

-#include "signal_processing_library.h"
-#include "typedefs.h"
-
-//#define AEC_DEBUG // for recording files
+#include "webrtc/typedefs.h"

 #define FRAME_LEN 80
-#define PART_LEN 64 // Length of partition
-#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
-#define PART_LEN2 (PART_LEN * 2) // Length of partition * 2
-#define NR_PART 12 // Number of partitions
-#define FILT_LEN (PART_LEN * NR_PART) // Filter length
-#define FILT_LEN2 (FILT_LEN * 2) // Double filter length
-#define FAR_BUF_LEN (FILT_LEN2 * 2)
-#define PREF_BAND_SIZE 24
-
-#define BLOCKL_MAX FRAME_LEN
-// Maximum delay in fixed point delay estimator, used for logging
-enum {kMaxDelay = 100};
+#define PART_LEN 64               // Length of partition
+#define PART_LEN1 (PART_LEN + 1)  // Unique fft coefficients
+#define PART_LEN2 (PART_LEN * 2)  // Length of partition * 2
+#define NUM_HIGH_BANDS_MAX  2     // Max number of high bands

 typedef float complex_t[2];
 // For performance reasons, some arrays of complex numbers are replaced by twice
@@ -46,136 +35,95 @@ typedef float complex_t[2];
 // compile time.

 // Metrics
-enum {offsetLevel = -100};
+enum {
+  kOffsetLevel = -100
+};

-typedef struct {
-    float sfrsum;
-    int sfrcounter;
-    float framelevel;
-    float frsum;
-    int frcounter;
-    float minlevel;
-    float averagelevel;
-} power_level_t;
+typedef struct Stats {
+  float instant;
+  float average;
+  float min;
+  float max;
+  float sum;
+  float hisum;
+  float himean;
+  int counter;
+  int hicounter;
+} Stats;

-typedef struct {
-    float instant;
-    float average;
-    float min;
-    float max;
-    float sum;
-    float hisum;
-    float himean;
-    int counter;
-    int hicounter;
-} stats_t;
+typedef struct AecCore AecCore;

-typedef struct {
-    int farBufWritePos, farBufReadPos;
-
-    int knownDelay;
-    int inSamples, outSamples;
-    int delayEstCtr;
-
-    void *farFrBuf, *nearFrBuf, *outFrBuf;
-
-    void *nearFrBufH;
-    void *outFrBufH;
-
-    float xBuf[PART_LEN2]; // farend
-    float dBuf[PART_LEN2]; // nearend
-    float eBuf[PART_LEN2]; // error
-
-    float dBufH[PART_LEN2]; // nearend
-
-    float xPow[PART_LEN1];
-    float dPow[PART_LEN1];
-    float dMinPow[PART_LEN1];
-    float dInitMinPow[PART_LEN1];
-    float *noisePow;
-
-    float xfBuf[2][NR_PART * PART_LEN1]; // farend fft buffer
-    float wfBuf[2][NR_PART * PART_LEN1]; // filter fft
-    complex_t sde[PART_LEN1]; // cross-psd of nearend and error
-    complex_t sxd[PART_LEN1]; // cross-psd of farend and nearend
-    complex_t xfwBuf[NR_PART * PART_LEN1]; // farend windowed fft buffer
-
-    float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1]; // far, near and error psd
-    float hNs[PART_LEN1];
-    float hNlFbMin, hNlFbLocalMin;
-    float hNlXdAvgMin;
-    int hNlNewMin, hNlMinCtr;
-    float overDrive, overDriveSm;
-    float targetSupp, minOverDrive;
-    float outBuf[PART_LEN];
-    int delayIdx;
-
-    short stNearState, echoState;
-    short divergeState;
-
-    int xfBufBlockPos;
-
-    short farBuf[FILT_LEN2 * 2];
-
-    short mult; // sampling frequency multiple
-    int sampFreq;
-    WebRtc_UWord32 seed;
-
-    float mu; // stepsize
-    float errThresh; // error threshold
-
-    int noiseEstCtr;
-
-    power_level_t farlevel;
-    power_level_t nearlevel;
-    power_level_t linoutlevel;
-    power_level_t nlpoutlevel;
-
-    int metricsMode;
-    int stateCounter;
-    stats_t erl;
-    stats_t erle;
-    stats_t aNlp;
-    stats_t rerl;
-
-    // Quantities to control H band scaling for SWB input
-    int freq_avg_ic;         //initial bin for averaging nlp gain
-    int flag_Hband_cn;      //for comfort noise
-    float cn_scale_Hband;   //scale for comfort noise in H band
-
-    int delay_histogram[kMaxDelay];
-    int delay_logging_enabled;
-    void* delay_estimator;
-
-#ifdef AEC_DEBUG
-    FILE *farFile;
-    FILE *nearFile;
-    FILE *outFile;
-    FILE *outLpFile;
-#endif
-} aec_t;
-
-typedef void (*WebRtcAec_FilterFar_t)(aec_t *aec, float yf[2][PART_LEN1]);
-extern WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
-typedef void (*WebRtcAec_ScaleErrorSignal_t)(aec_t *aec, float ef[2][PART_LEN1]);
-extern WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
-typedef void (*WebRtcAec_FilterAdaptation_t)
-  (aec_t *aec, float *fft, float ef[2][PART_LEN1]);
-extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
-typedef void (*WebRtcAec_OverdriveAndSuppress_t)
-  (aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]);
-extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
-
-int WebRtcAec_CreateAec(aec_t **aec);
-int WebRtcAec_FreeAec(aec_t *aec);
-int WebRtcAec_InitAec(aec_t *aec, int sampFreq);
+AecCore* WebRtcAec_CreateAec();  // Returns NULL on error.
+void WebRtcAec_FreeAec(AecCore* aec);
+int WebRtcAec_InitAec(AecCore* aec, int sampFreq);
 void WebRtcAec_InitAec_SSE2(void);
+#if defined(MIPS_FPU_LE)
+void WebRtcAec_InitAec_mips(void);
+#endif
+#if defined(WEBRTC_DETECT_NEON) || defined(WEBRTC_HAS_NEON)
+void WebRtcAec_InitAec_neon(void);
+#endif

-void WebRtcAec_InitMetrics(aec_t *aec);
-void WebRtcAec_ProcessFrame(aec_t *aec, const short *farend,
-                       const short *nearend, const short *nearendH,
-                       short *out, short *outH,
-                       int knownDelay);
+void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend);
+void WebRtcAec_ProcessFrames(AecCore* aec,
+                             const float* const* nearend,
+                             size_t num_bands,
+                             size_t num_samples,
+                             int knownDelay,
+                             float* const* out);

-#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
+// A helper function to call WebRtc_MoveReadPtr() for all far-end buffers.
+// Returns the number of elements moved, and adjusts |system_delay| by the
+// corresponding amount in ms.
+int WebRtcAec_MoveFarReadPtr(AecCore* aec, int elements);

+// Calculates the median, standard deviation and amount of poor values among the
+// delay estimates aggregated up to the first call to the function. After that
+// first call the metrics are aggregated and updated every second. With poor
+// values we mean values that most likely will cause the AEC to perform poorly.
+// TODO(bjornv): Consider changing tests and tools to handle constant
+// constant aggregation window throughout the session instead.
+int WebRtcAec_GetDelayMetricsCore(AecCore* self, int* median, int* std,
+                                  float* fraction_poor_delays);
+
+// Returns the echo state (1: echo, 0: no echo).
+int WebRtcAec_echo_state(AecCore* self);
+
+// Gets statistics of the echo metrics ERL, ERLE, A_NLP.
+void WebRtcAec_GetEchoStats(AecCore* self,
+                            Stats* erl,
+                            Stats* erle,
+                            Stats* a_nlp);
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+void* WebRtcAec_far_time_buf(AecCore* self);
+#endif
+
+// Sets local configuration modes.
+void WebRtcAec_SetConfigCore(AecCore* self,
+                             int nlp_mode,
+                             int metrics_mode,
+                             int delay_logging);
+
+// Non-zero enables, zero disables.
+void WebRtcAec_enable_delay_agnostic(AecCore* self, int enable);
+
+// Returns non-zero if delay agnostic (i.e., signal based delay estimation) is
+// enabled and zero if disabled.
+int WebRtcAec_delay_agnostic_enabled(AecCore* self);
+
+// Enables or disables extended filter mode. Non-zero enables, zero disables.
+void WebRtcAec_enable_extended_filter(AecCore* self, int enable);
+
+// Returns non-zero if extended filter mode is enabled and zero if disabled.
+int WebRtcAec_extended_filter_enabled(AecCore* self);
+
+// Returns the current |system_delay|, i.e., the buffered difference between
+// far-end and near-end.
+int WebRtcAec_system_delay(AecCore* self);
+
+// Sets the |system_delay| to |value|.  Note that if the value is changed
+// improperly, there can be a performance regression.  So it should be used with
+// care.
+void WebRtcAec_SetSystemDelay(AecCore* self, int delay);
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
--- a/webrtc/modules/audio_processing/aec/aec_core_internal.h
+++ b/webrtc/modules/audio_processing/aec/aec_core_internal.h
@@ -0,0 +1,202 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
+
+#include "webrtc/common_audio/ring_buffer.h"
+#include "webrtc/common_audio/wav_file.h"
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+#include "webrtc/typedefs.h"
+
+// Number of partitions for the extended filter mode. The first one is an enum
+// to be used in array declarations, as it represents the maximum filter length.
+enum {
+  kExtendedNumPartitions = 32
+};
+static const int kNormalNumPartitions = 12;
+
+// Delay estimator constants, used for logging and delay compensation if
+// if reported delays are disabled.
+enum {
+  kLookaheadBlocks = 15
+};
+enum {
+  // 500 ms for 16 kHz which is equivalent with the limit of reported delays.
+  kHistorySizeBlocks = 125
+};
+
+// Extended filter adaptation parameters.
+// TODO(ajm): No narrowband tuning yet.
+static const float kExtendedMu = 0.4f;
+static const float kExtendedErrorThreshold = 1.0e-6f;
+
+typedef struct PowerLevel {
+  float sfrsum;
+  int sfrcounter;
+  float framelevel;
+  float frsum;
+  int frcounter;
+  float minlevel;
+  float averagelevel;
+} PowerLevel;
+
+struct AecCore {
+  int farBufWritePos, farBufReadPos;
+
+  int knownDelay;
+  int inSamples, outSamples;
+  int delayEstCtr;
+
+  RingBuffer* nearFrBuf;
+  RingBuffer* outFrBuf;
+
+  RingBuffer* nearFrBufH[NUM_HIGH_BANDS_MAX];
+  RingBuffer* outFrBufH[NUM_HIGH_BANDS_MAX];
+
+  float dBuf[PART_LEN2];  // nearend
+  float eBuf[PART_LEN2];  // error
+
+  float dBufH[NUM_HIGH_BANDS_MAX][PART_LEN2];  // nearend
+
+  float xPow[PART_LEN1];
+  float dPow[PART_LEN1];
+  float dMinPow[PART_LEN1];
+  float dInitMinPow[PART_LEN1];
+  float* noisePow;
+
+  float xfBuf[2][kExtendedNumPartitions * PART_LEN1];  // farend fft buffer
+  float wfBuf[2][kExtendedNumPartitions * PART_LEN1];  // filter fft
+  complex_t sde[PART_LEN1];  // cross-psd of nearend and error
+  complex_t sxd[PART_LEN1];  // cross-psd of farend and nearend
+  // Farend windowed fft buffer.
+  complex_t xfwBuf[kExtendedNumPartitions * PART_LEN1];
+
+  float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1];  // far, near, error psd
+  float hNs[PART_LEN1];
+  float hNlFbMin, hNlFbLocalMin;
+  float hNlXdAvgMin;
+  int hNlNewMin, hNlMinCtr;
+  float overDrive, overDriveSm;
+  int nlp_mode;
+  float outBuf[PART_LEN];
+  int delayIdx;
+
+  short stNearState, echoState;
+  short divergeState;
+
+  int xfBufBlockPos;
+
+  RingBuffer* far_buf;
+  RingBuffer* far_buf_windowed;
+  int system_delay;  // Current system delay buffered in AEC.
+
+  int mult;  // sampling frequency multiple
+  int sampFreq;
+  size_t num_bands;
+  uint32_t seed;
+
+  float normal_mu;               // stepsize
+  float normal_error_threshold;  // error threshold
+
+  int noiseEstCtr;
+
+  PowerLevel farlevel;
+  PowerLevel nearlevel;
+  PowerLevel linoutlevel;
+  PowerLevel nlpoutlevel;
+
+  int metricsMode;
+  int stateCounter;
+  Stats erl;
+  Stats erle;
+  Stats aNlp;
+  Stats rerl;
+
+  // Quantities to control H band scaling for SWB input
+  int freq_avg_ic;       // initial bin for averaging nlp gain
+  int flag_Hband_cn;     // for comfort noise
+  float cn_scale_Hband;  // scale for comfort noise in H band
+
+  int delay_metrics_delivered;
+  int delay_histogram[kHistorySizeBlocks];
+  int num_delay_values;
+  int delay_median;
+  int delay_std;
+  float fraction_poor_delays;
+  int delay_logging_enabled;
+  void* delay_estimator_farend;
+  void* delay_estimator;
+  // Variables associated with delay correction through signal based delay
+  // estimation feedback.
+  int signal_delay_correction;
+  int previous_delay;
+  int delay_correction_count;
+  int shift_offset;
+  float delay_quality_threshold;
+  int frame_count;
+
+  // 0 = delay agnostic mode (signal based delay correction) disabled.
+  // Otherwise enabled.
+  int delay_agnostic_enabled;
+  // 1 = extended filter mode enabled, 0 = disabled.
+  int extended_filter_enabled;
+  // Runtime selection of number of filter partitions.
+  int num_partitions;
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  // Sequence number of this AEC instance, so that different instances can
+  // choose different dump file names.
+  int instance_index;
+
+  // Number of times we've restarted dumping; used to pick new dump file names
+  // each time.
+  int debug_dump_count;
+
+  RingBuffer* far_time_buf;
+  rtc_WavWriter* farFile;
+  rtc_WavWriter* nearFile;
+  rtc_WavWriter* outFile;
+  rtc_WavWriter* outLinearFile;
+  FILE* e_fft_file;
+#endif
+};
+
+typedef void (*WebRtcAecFilterFar)(AecCore* aec, float yf[2][PART_LEN1]);
+extern WebRtcAecFilterFar WebRtcAec_FilterFar;
+typedef void (*WebRtcAecScaleErrorSignal)(AecCore* aec, float ef[2][PART_LEN1]);
+extern WebRtcAecScaleErrorSignal WebRtcAec_ScaleErrorSignal;
+typedef void (*WebRtcAecFilterAdaptation)(AecCore* aec,
+                                          float* fft,
+                                          float ef[2][PART_LEN1]);
+extern WebRtcAecFilterAdaptation WebRtcAec_FilterAdaptation;
+typedef void (*WebRtcAecOverdriveAndSuppress)(AecCore* aec,
+                                              float hNl[PART_LEN1],
+                                              const float hNlFb,
+                                              float efw[2][PART_LEN1]);
+extern WebRtcAecOverdriveAndSuppress WebRtcAec_OverdriveAndSuppress;
+
+typedef void (*WebRtcAecComfortNoise)(AecCore* aec,
+                                      float efw[2][PART_LEN1],
+                                      complex_t* comfortNoiseHband,
+                                      const float* noisePow,
+                                      const float* lambda);
+extern WebRtcAecComfortNoise WebRtcAec_ComfortNoise;
+
+typedef void (*WebRtcAecSubBandCoherence)(AecCore* aec,
+                                          float efw[2][PART_LEN1],
+                                          float xfw[2][PART_LEN1],
+                                          float* fft,
+                                          float* cohde,
+                                          float* cohxd);
+extern WebRtcAecSubBandCoherence WebRtcAec_SubbandCoherence;
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
--- a/webrtc/modules/audio_processing/aec/aec_core_mips.c
+++ b/webrtc/modules/audio_processing/aec/aec_core_mips.c
@@ -0,0 +1,774 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The core AEC algorithm, which is presented with time-aligned signals.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+#include <math.h>
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+static const int flagHbandCn = 1; // flag for adding comfort noise in H band
+extern const float WebRtcAec_weightCurve[65];
+extern const float WebRtcAec_overDriveCurve[65];
+
+void WebRtcAec_ComfortNoise_mips(AecCore* aec,
+                                 float efw[2][PART_LEN1],
+                                 complex_t* comfortNoiseHband,
+                                 const float* noisePow,
+                                 const float* lambda) {
+  int i, num;
+  float rand[PART_LEN];
+  float noise, noiseAvg, tmp, tmpAvg;
+  int16_t randW16[PART_LEN];
+  complex_t u[PART_LEN1];
+
+  const float pi2 = 6.28318530717959f;
+  const float pi2t = pi2 / 32768;
+
+  // Generate a uniform random array on [0 1]
+  WebRtcSpl_RandUArray(randW16, PART_LEN, &aec->seed);
+
+  int16_t* randWptr = randW16;
+  float randTemp, randTemp2, randTemp3, randTemp4;
+  int32_t tmp1s, tmp2s, tmp3s, tmp4s;
+
+  for (i = 0; i < PART_LEN; i+=4) {
+    __asm __volatile (
+      ".set     push                                           \n\t"
+      ".set     noreorder                                      \n\t"
+      "lh       %[tmp1s],       0(%[randWptr])                 \n\t"
+      "lh       %[tmp2s],       2(%[randWptr])                 \n\t"
+      "lh       %[tmp3s],       4(%[randWptr])                 \n\t"
+      "lh       %[tmp4s],       6(%[randWptr])                 \n\t"
+      "mtc1     %[tmp1s],       %[randTemp]                    \n\t"
+      "mtc1     %[tmp2s],       %[randTemp2]                   \n\t"
+      "mtc1     %[tmp3s],       %[randTemp3]                   \n\t"
+      "mtc1     %[tmp4s],       %[randTemp4]                   \n\t"
+      "cvt.s.w  %[randTemp],    %[randTemp]                    \n\t"
+      "cvt.s.w  %[randTemp2],   %[randTemp2]                   \n\t"
+      "cvt.s.w  %[randTemp3],   %[randTemp3]                   \n\t"
+      "cvt.s.w  %[randTemp4],   %[randTemp4]                   \n\t"
+      "addiu    %[randWptr],    %[randWptr],      8            \n\t"
+      "mul.s    %[randTemp],    %[randTemp],      %[pi2t]      \n\t"
+      "mul.s    %[randTemp2],   %[randTemp2],     %[pi2t]      \n\t"
+      "mul.s    %[randTemp3],   %[randTemp3],     %[pi2t]      \n\t"
+      "mul.s    %[randTemp4],   %[randTemp4],     %[pi2t]      \n\t"
+      ".set     pop                                            \n\t"
+      : [randWptr] "+r" (randWptr), [randTemp] "=&f" (randTemp),
+        [randTemp2] "=&f" (randTemp2), [randTemp3] "=&f" (randTemp3),
+        [randTemp4] "=&f" (randTemp4), [tmp1s] "=&r" (tmp1s),
+        [tmp2s] "=&r" (tmp2s), [tmp3s] "=&r" (tmp3s),
+        [tmp4s] "=&r" (tmp4s)
+      : [pi2t] "f" (pi2t)
+      : "memory"
+    );
+
+    u[i+1][0] = cosf(randTemp);
+    u[i+1][1] = sinf(randTemp);
+    u[i+2][0] = cosf(randTemp2);
+    u[i+2][1] = sinf(randTemp2);
+    u[i+3][0] = cosf(randTemp3);
+    u[i+3][1] = sinf(randTemp3);
+    u[i+4][0] = cosf(randTemp4);
+    u[i+4][1] = sinf(randTemp4);
+  }
+
+  // Reject LF noise
+  float* u_ptr = &u[1][0];
+  float noise2, noise3, noise4;
+  float tmp1f, tmp2f, tmp3f, tmp4f, tmp5f, tmp6f, tmp7f, tmp8f;
+
+  u[0][0] = 0;
+  u[0][1] = 0;
+  for (i = 1; i < PART_LEN1; i+=4) {
+    __asm __volatile (
+      ".set     push                                            \n\t"
+      ".set     noreorder                                       \n\t"
+      "lwc1     %[noise],       4(%[noisePow])                  \n\t"
+      "lwc1     %[noise2],      8(%[noisePow])                  \n\t"
+      "lwc1     %[noise3],      12(%[noisePow])                 \n\t"
+      "lwc1     %[noise4],      16(%[noisePow])                 \n\t"
+      "sqrt.s   %[noise],       %[noise]                        \n\t"
+      "sqrt.s   %[noise2],      %[noise2]                       \n\t"
+      "sqrt.s   %[noise3],      %[noise3]                       \n\t"
+      "sqrt.s   %[noise4],      %[noise4]                       \n\t"
+      "lwc1     %[tmp1f],       0(%[u_ptr])                     \n\t"
+      "lwc1     %[tmp2f],       4(%[u_ptr])                     \n\t"
+      "lwc1     %[tmp3f],       8(%[u_ptr])                     \n\t"
+      "lwc1     %[tmp4f],       12(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp5f],       16(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp6f],       20(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp7f],       24(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp8f],       28(%[u_ptr])                    \n\t"
+      "addiu    %[noisePow],    %[noisePow],      16            \n\t"
+      "mul.s    %[tmp1f],       %[tmp1f],         %[noise]      \n\t"
+      "mul.s    %[tmp2f],       %[tmp2f],         %[noise]      \n\t"
+      "mul.s    %[tmp3f],       %[tmp3f],         %[noise2]     \n\t"
+      "mul.s    %[tmp4f],       %[tmp4f],         %[noise2]     \n\t"
+      "mul.s    %[tmp5f],       %[tmp5f],         %[noise3]     \n\t"
+      "mul.s    %[tmp6f],       %[tmp6f],         %[noise3]     \n\t"
+      "swc1     %[tmp1f],       0(%[u_ptr])                     \n\t"
+      "swc1     %[tmp3f],       8(%[u_ptr])                     \n\t"
+      "mul.s    %[tmp8f],       %[tmp8f],         %[noise4]     \n\t"
+      "mul.s    %[tmp7f],       %[tmp7f],         %[noise4]     \n\t"
+      "neg.s    %[tmp2f]                                        \n\t"
+      "neg.s    %[tmp4f]                                        \n\t"
+      "neg.s    %[tmp6f]                                        \n\t"
+      "neg.s    %[tmp8f]                                        \n\t"
+      "swc1     %[tmp5f],       16(%[u_ptr])                    \n\t"
+      "swc1     %[tmp7f],       24(%[u_ptr])                    \n\t"
+      "swc1     %[tmp2f],       4(%[u_ptr])                     \n\t"
+      "swc1     %[tmp4f],       12(%[u_ptr])                    \n\t"
+      "swc1     %[tmp6f],       20(%[u_ptr])                    \n\t"
+      "swc1     %[tmp8f],       28(%[u_ptr])                    \n\t"
+      "addiu    %[u_ptr],       %[u_ptr],         32            \n\t"
+      ".set     pop                                             \n\t"
+      : [u_ptr] "+r" (u_ptr),  [noisePow] "+r" (noisePow),
+        [noise] "=&f" (noise), [noise2] "=&f" (noise2),
+        [noise3] "=&f" (noise3), [noise4] "=&f" (noise4),
+        [tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f),
+        [tmp3f] "=&f" (tmp3f), [tmp4f] "=&f" (tmp4f),
+        [tmp5f] "=&f" (tmp5f), [tmp6f] "=&f" (tmp6f),
+        [tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f)
+      :
+      : "memory"
+    );
+  }
+  u[PART_LEN][1] = 0;
+  noisePow -= PART_LEN;
+
+  u_ptr = &u[0][0];
+  float* u_ptr_end = &u[PART_LEN][0];
+  float* efw_ptr_0 = &efw[0][0];
+  float* efw_ptr_1 = &efw[1][0];
+  float tmp9f, tmp10f;
+  const float tmp1c = 1.0;
+
+  __asm __volatile (
+    ".set     push                                                        \n\t"
+    ".set     noreorder                                                   \n\t"
+   "1:                                                                    \n\t"
+    "lwc1     %[tmp1f],       0(%[lambda])                                \n\t"
+    "lwc1     %[tmp6f],       4(%[lambda])                                \n\t"
+    "addiu    %[lambda],      %[lambda],        8                         \n\t"
+    "c.lt.s   %[tmp1f],       %[tmp1c]                                    \n\t"
+    "bc1f     4f                                                          \n\t"
+    " nop                                                                 \n\t"
+    "c.lt.s   %[tmp6f],       %[tmp1c]                                    \n\t"
+    "bc1f     3f                                                          \n\t"
+    " nop                                                                 \n\t"
+   "2:                                                                    \n\t"
+    "mul.s    %[tmp1f],       %[tmp1f],         %[tmp1f]                  \n\t"
+    "mul.s    %[tmp6f],       %[tmp6f],         %[tmp6f]                  \n\t"
+    "sub.s    %[tmp1f],       %[tmp1c],         %[tmp1f]                  \n\t"
+    "sub.s    %[tmp6f],       %[tmp1c],         %[tmp6f]                  \n\t"
+    "sqrt.s   %[tmp1f],       %[tmp1f]                                    \n\t"
+    "sqrt.s   %[tmp6f],       %[tmp6f]                                    \n\t"
+    "lwc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp3f],       0(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp8f],       8(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp5f],       4(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp10f],      12(%[u_ptr])                                \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp3f]                  \n\t"
+    "add.s    %[tmp2f],       %[tmp2f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp5f]                  \n\t"
+    "add.s    %[tmp4f],       %[tmp4f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp8f]                  \n\t"
+    "add.s    %[tmp7f],       %[tmp7f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp10f]                 \n\t"
+    "add.s    %[tmp9f],       %[tmp9f],         %[tmp3f]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "madd.s   %[tmp2f],       %[tmp2f],         %[tmp1f],     %[tmp3f]    \n\t"
+    "madd.s   %[tmp4f],       %[tmp4f],         %[tmp1f],     %[tmp5f]    \n\t"
+    "madd.s   %[tmp7f],       %[tmp7f],         %[tmp6f],     %[tmp8f]    \n\t"
+    "madd.s   %[tmp9f],       %[tmp9f],         %[tmp6f],     %[tmp10f]   \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "swc1     %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+    "swc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "b        5f                                                          \n\t"
+    " swc1    %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+   "3:                                                                    \n\t"
+    "mul.s    %[tmp1f],       %[tmp1f],         %[tmp1f]                  \n\t"
+    "sub.s    %[tmp1f],       %[tmp1c],         %[tmp1f]                  \n\t"
+    "sqrt.s   %[tmp1f],       %[tmp1f]                                    \n\t"
+    "lwc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp3f],       0(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp5f],       4(%[u_ptr])                                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp3f]                  \n\t"
+    "add.s    %[tmp2f],       %[tmp2f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp5f]                  \n\t"
+    "add.s    %[tmp4f],       %[tmp4f],         %[tmp3f]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "madd.s   %[tmp2f],       %[tmp2f],         %[tmp1f],     %[tmp3f]    \n\t"
+    "madd.s   %[tmp4f],       %[tmp4f],         %[tmp1f],     %[tmp5f]    \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "b        5f                                                          \n\t"
+    " swc1    %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+   "4:                                                                    \n\t"
+    "c.lt.s   %[tmp6f],       %[tmp1c]                                    \n\t"
+    "bc1f     5f                                                          \n\t"
+    " nop                                                                 \n\t"
+    "mul.s    %[tmp6f],       %[tmp6f],         %[tmp6f]                  \n\t"
+    "sub.s    %[tmp6f],       %[tmp1c],         %[tmp6f]                  \n\t"
+    "sqrt.s   %[tmp6f],       %[tmp6f]                                    \n\t"
+    "lwc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp8f],       8(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp10f],      12(%[u_ptr])                                \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp8f]                  \n\t"
+    "add.s    %[tmp7f],       %[tmp7f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp10f]                 \n\t"
+    "add.s    %[tmp9f],       %[tmp9f],         %[tmp3f]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "madd.s   %[tmp7f],       %[tmp7f],         %[tmp6f],     %[tmp8f]    \n\t"
+    "madd.s   %[tmp9f],       %[tmp9f],         %[tmp6f],     %[tmp10f]   \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "swc1     %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+   "5:                                                                    \n\t"
+    "addiu    %[u_ptr],       %[u_ptr],         16                        \n\t"
+    "addiu    %[efw_ptr_0],   %[efw_ptr_0],     8                         \n\t"
+    "bne      %[u_ptr],       %[u_ptr_end],     1b                        \n\t"
+    " addiu   %[efw_ptr_1],   %[efw_ptr_1],     8                         \n\t"
+    ".set     pop                                                         \n\t"
+    : [lambda] "+r" (lambda), [u_ptr] "+r" (u_ptr),
+      [efw_ptr_0] "+r" (efw_ptr_0), [efw_ptr_1] "+r" (efw_ptr_1),
+      [tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f), [tmp3f] "=&f" (tmp3f),
+      [tmp4f] "=&f" (tmp4f), [tmp5f] "=&f" (tmp5f),
+      [tmp6f] "=&f" (tmp6f), [tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f),
+      [tmp9f] "=&f" (tmp9f), [tmp10f] "=&f" (tmp10f)
+    : [tmp1c] "f" (tmp1c), [u_ptr_end] "r" (u_ptr_end)
+    : "memory"
+  );
+
+  lambda -= PART_LEN;
+  tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[PART_LEN] * lambda[PART_LEN], 0));
+  //tmp = 1 - lambda[i];
+  efw[0][PART_LEN] += tmp * u[PART_LEN][0];
+  efw[1][PART_LEN] += tmp * u[PART_LEN][1];
+
+  // For H band comfort noise
+  // TODO: don't compute noise and "tmp" twice. Use the previous results.
+  noiseAvg = 0.0;
+  tmpAvg = 0.0;
+  num = 0;
+  if ((aec->sampFreq == 32000 || aec->sampFreq == 48000) && flagHbandCn == 1) {
+    for (i = 0; i < PART_LEN; i++) {
+      rand[i] = ((float)randW16[i]) / 32768;
+    }
+
+    // average noise scale
+    // average over second half of freq spectrum (i.e., 4->8khz)
+    // TODO: we shouldn't need num. We know how many elements we're summing.
+    for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
+      num++;
+      noiseAvg += sqrtf(noisePow[i]);
+    }
+    noiseAvg /= (float)num;
+
+    // average nlp scale
+    // average over second half of freq spectrum (i.e., 4->8khz)
+    // TODO: we shouldn't need num. We know how many elements we're summing.
+    num = 0;
+    for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
+      num++;
+      tmpAvg += sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
+    }
+    tmpAvg /= (float)num;
+
+    // Use average noise for H band
+    // TODO: we should probably have a new random vector here.
+    // Reject LF noise
+    u[0][0] = 0;
+    u[0][1] = 0;
+    for (i = 1; i < PART_LEN1; i++) {
+      tmp = pi2 * rand[i - 1];
+
+      // Use average noise for H band
+      u[i][0] = noiseAvg * (float)cos(tmp);
+      u[i][1] = -noiseAvg * (float)sin(tmp);
+    }
+    u[PART_LEN][1] = 0;
+
+    for (i = 0; i < PART_LEN1; i++) {
+      // Use average NLP weight for H band
+      comfortNoiseHband[i][0] = tmpAvg * u[i][0];
+      comfortNoiseHband[i][1] = tmpAvg * u[i][1];
+    }
+  }
+}
+
+void WebRtcAec_FilterFar_mips(AecCore* aec, float yf[2][PART_LEN1]) {
+  int i;
+  for (i = 0; i < aec->num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
+    int pos = i * PART_LEN1;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >=  aec->num_partitions) {
+      xPos -=  aec->num_partitions * (PART_LEN1);
+    }
+    float* yf0 = yf[0];
+    float* yf1 = yf[1];
+    float* aRe = aec->xfBuf[0] + xPos;
+    float* aIm = aec->xfBuf[1] + xPos;
+    float* bRe = aec->wfBuf[0] + pos;
+    float* bIm = aec->wfBuf[1] + pos;
+    float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13;
+    int len = PART_LEN1 >> 1;
+
+    __asm __volatile (
+      ".set       push                                                \n\t"
+      ".set       noreorder                                           \n\t"
+     "1:                                                              \n\t"
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "lwc1       %[f4],      4(%[aRe])                               \n\t"
+      "lwc1       %[f5],      4(%[bRe])                               \n\t"
+      "lwc1       %[f6],      4(%[bIm])                               \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+      "mul.s      %[f0],      %[f0],          %[f2]                   \n\t"
+      "mul.s      %[f9],      %[f4],          %[f5]                   \n\t"
+      "mul.s      %[f4],      %[f4],          %[f6]                   \n\t"
+      "lwc1       %[f7],      4(%[aIm])                               \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f12],     %[f2],          %[f3]                   \n\t"
+      "mul.s      %[f1],      %[f3],          %[f1]                   \n\t"
+      "mul.s      %[f11],     %[f6],          %[f7]                   \n\t"
+      "addiu      %[aRe],     %[aRe],         8                       \n\t"
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "sub.s      %[f8],      %[f8],          %[f12]                  \n\t"
+      "mul.s      %[f12],     %[f7],          %[f5]                   \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "add.s      %[f1],      %[f0],          %[f1]                   \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+      "sub.s      %[f9],      %[f9],          %[f11]                  \n\t"
+      "lwc1       %[f6],      4(%[yf0])                               \n\t"
+      "add.s      %[f4],      %[f4],          %[f12]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "addiu      %[aRe],     %[aRe],         8                       \n\t"
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "nmsub.s    %[f8],      %[f8],          %[f2],      %[f3]       \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "madd.s     %[f1],      %[f0],          %[f3],      %[f1]       \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+      "nmsub.s    %[f9],      %[f9],          %[f6],      %[f7]       \n\t"
+      "lwc1       %[f6],      4(%[yf0])                               \n\t"
+      "madd.s     %[f4],      %[f4],          %[f7],      %[f5]       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "lwc1       %[f5],      4(%[yf1])                               \n\t"
+      "add.s      %[f2],      %[f2],          %[f8]                   \n\t"
+      "addiu      %[bRe],     %[bRe],         8                       \n\t"
+      "addiu      %[bIm],     %[bIm],         8                       \n\t"
+      "add.s      %[f3],      %[f3],          %[f1]                   \n\t"
+      "add.s      %[f6],      %[f6],          %[f9]                   \n\t"
+      "add.s      %[f5],      %[f5],          %[f4]                   \n\t"
+      "swc1       %[f2],      0(%[yf0])                               \n\t"
+      "swc1       %[f3],      0(%[yf1])                               \n\t"
+      "swc1       %[f6],      4(%[yf0])                               \n\t"
+      "swc1       %[f5],      4(%[yf1])                               \n\t"
+      "addiu      %[yf0],     %[yf0],         8                       \n\t"
+      "bgtz       %[len],     1b                                      \n\t"
+      " addiu     %[yf1],     %[yf1],         8                       \n\t"
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+      "mul.s      %[f0],      %[f0],          %[f2]                   \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f12],     %[f2],          %[f3]                   \n\t"
+      "mul.s      %[f1],      %[f3],          %[f1]                   \n\t"
+      "sub.s      %[f8],      %[f8],          %[f12]                  \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "add.s      %[f1],      %[f0],          %[f1]                   \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "nmsub.s    %[f8],      %[f8],          %[f2],      %[f3]       \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "madd.s     %[f1],      %[f0],          %[f3],      %[f1]       \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "add.s      %[f2],      %[f2],          %[f8]                   \n\t"
+      "add.s      %[f3],      %[f3],          %[f1]                   \n\t"
+      "swc1       %[f2],      0(%[yf0])                               \n\t"
+      "swc1       %[f3],      0(%[yf1])                               \n\t"
+      ".set       pop                                                 \n\t"
+      : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+        [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+        [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+        [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
+        [f12] "=&f" (f12), [f13] "=&f" (f13), [aRe] "+r" (aRe),
+        [aIm] "+r" (aIm), [bRe] "+r" (bRe), [bIm] "+r" (bIm),
+        [yf0] "+r" (yf0), [yf1] "+r" (yf1), [len] "+r" (len)
+      :
+      : "memory"
+    );
+  }
+}
+
+void WebRtcAec_FilterAdaptation_mips(AecCore* aec,
+                                     float* fft,
+                                     float ef[2][PART_LEN1]) {
+  int i;
+  for (i = 0; i < aec->num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
+    int pos;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= aec->num_partitions) {
+      xPos -= aec->num_partitions * PART_LEN1;
+    }
+
+    pos = i * PART_LEN1;
+    float* aRe = aec->xfBuf[0] + xPos;
+    float* aIm = aec->xfBuf[1] + xPos;
+    float* bRe = ef[0];
+    float* bIm = ef[1];
+    float* fft_tmp;
+
+    float f0, f1, f2, f3, f4, f5, f6 ,f7, f8, f9, f10, f11, f12;
+    int len = PART_LEN >> 1;
+
+    __asm __volatile (
+      ".set       push                                                \n\t"
+      ".set       noreorder                                           \n\t"
+      "addiu      %[fft_tmp], %[fft],         0                       \n\t"
+     "1:                                                              \n\t"
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f4],      4(%[aRe])                               \n\t"
+      "lwc1       %[f5],      4(%[bRe])                               \n\t"
+      "lwc1       %[f6],      4(%[bIm])                               \n\t"
+      "addiu      %[aRe],     %[aRe],         8                       \n\t"
+      "addiu      %[bRe],     %[bRe],         8                       \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+      "mul.s      %[f0],      %[f0],          %[f2]                   \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "mul.s      %[f9],      %[f4],          %[f5]                   \n\t"
+      "lwc1       %[f7],      4(%[aIm])                               \n\t"
+      "mul.s      %[f4],      %[f4],          %[f6]                   \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f10],     %[f3],          %[f2]                   \n\t"
+      "mul.s      %[f1],      %[f3],          %[f1]                   \n\t"
+      "mul.s      %[f11],     %[f7],          %[f6]                   \n\t"
+      "mul.s      %[f5],      %[f7],          %[f5]                   \n\t"
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[bIm],     %[bIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "add.s      %[f8],      %[f8],          %[f10]                  \n\t"
+      "sub.s      %[f1],      %[f0],          %[f1]                   \n\t"
+      "add.s      %[f9],      %[f9],          %[f11]                  \n\t"
+      "sub.s      %[f5],      %[f4],          %[f5]                   \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[bIm],     %[bIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "madd.s     %[f8],      %[f8],          %[f3],      %[f2]       \n\t"
+      "nmsub.s    %[f1],      %[f0],          %[f3],      %[f1]       \n\t"
+      "madd.s     %[f9],      %[f9],          %[f7],      %[f6]       \n\t"
+      "nmsub.s    %[f5],      %[f4],          %[f7],      %[f5]       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "swc1       %[f8],      0(%[fft_tmp])                           \n\t"
+      "swc1       %[f1],      4(%[fft_tmp])                           \n\t"
+      "swc1       %[f9],      8(%[fft_tmp])                           \n\t"
+      "swc1       %[f5],      12(%[fft_tmp])                          \n\t"
+      "bgtz       %[len],     1b                                      \n\t"
+      " addiu     %[fft_tmp], %[fft_tmp],     16                      \n\t"
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f10],     %[f3],          %[f2]                   \n\t"
+      "add.s      %[f8],      %[f8],          %[f10]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "madd.s     %[f8],      %[f8],          %[f3],      %[f2]       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "swc1       %[f8],      4(%[fft])                               \n\t"
+      ".set       pop                                                 \n\t"
+      : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+        [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+        [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+        [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
+        [f12] "=&f" (f12), [aRe] "+r" (aRe), [aIm] "+r" (aIm),
+        [bRe] "+r" (bRe), [bIm] "+r" (bIm), [fft_tmp] "=&r" (fft_tmp),
+        [len] "+r" (len)
+      : [fft] "r" (fft)
+      : "memory"
+    );
+
+    aec_rdft_inverse_128(fft);
+    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
+
+    // fft scaling
+    {
+      float scale = 2.0f / PART_LEN2;
+      __asm __volatile (
+        ".set     push                                    \n\t"
+        ".set     noreorder                               \n\t"
+        "addiu    %[fft_tmp], %[fft],        0            \n\t"
+        "addiu    %[len],     $zero,         8            \n\t"
+       "1:                                                \n\t"
+        "addiu    %[len],     %[len],        -1           \n\t"
+        "lwc1     %[f0],      0(%[fft_tmp])               \n\t"
+        "lwc1     %[f1],      4(%[fft_tmp])               \n\t"
+        "lwc1     %[f2],      8(%[fft_tmp])               \n\t"
+        "lwc1     %[f3],      12(%[fft_tmp])              \n\t"
+        "mul.s    %[f0],      %[f0],         %[scale]     \n\t"
+        "mul.s    %[f1],      %[f1],         %[scale]     \n\t"
+        "mul.s    %[f2],      %[f2],         %[scale]     \n\t"
+        "mul.s    %[f3],      %[f3],         %[scale]     \n\t"
+        "lwc1     %[f4],      16(%[fft_tmp])              \n\t"
+        "lwc1     %[f5],      20(%[fft_tmp])              \n\t"
+        "lwc1     %[f6],      24(%[fft_tmp])              \n\t"
+        "lwc1     %[f7],      28(%[fft_tmp])              \n\t"
+        "mul.s    %[f4],      %[f4],         %[scale]     \n\t"
+        "mul.s    %[f5],      %[f5],         %[scale]     \n\t"
+        "mul.s    %[f6],      %[f6],         %[scale]     \n\t"
+        "mul.s    %[f7],      %[f7],         %[scale]     \n\t"
+        "swc1     %[f0],      0(%[fft_tmp])               \n\t"
+        "swc1     %[f1],      4(%[fft_tmp])               \n\t"
+        "swc1     %[f2],      8(%[fft_tmp])               \n\t"
+        "swc1     %[f3],      12(%[fft_tmp])              \n\t"
+        "swc1     %[f4],      16(%[fft_tmp])              \n\t"
+        "swc1     %[f5],      20(%[fft_tmp])              \n\t"
+        "swc1     %[f6],      24(%[fft_tmp])              \n\t"
+        "swc1     %[f7],      28(%[fft_tmp])              \n\t"
+        "bgtz     %[len],     1b                          \n\t"
+        " addiu   %[fft_tmp], %[fft_tmp],    32           \n\t"
+        ".set     pop                                     \n\t"
+        : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+          [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+          [f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
+          [fft_tmp] "=&r" (fft_tmp)
+        : [scale] "f" (scale), [fft] "r" (fft)
+        : "memory"
+      );
+    }
+    aec_rdft_forward_128(fft);
+    aRe = aec->wfBuf[0] + pos;
+    aIm = aec->wfBuf[1] + pos;
+    __asm __volatile (
+      ".set     push                                    \n\t"
+      ".set     noreorder                               \n\t"
+      "addiu    %[fft_tmp], %[fft],        0            \n\t"
+      "addiu    %[len],     $zero,         31           \n\t"
+      "lwc1     %[f0],      0(%[aRe])                   \n\t"
+      "lwc1     %[f1],      0(%[fft_tmp])               \n\t"
+      "lwc1     %[f2],      256(%[aRe])                 \n\t"
+      "lwc1     %[f3],      4(%[fft_tmp])               \n\t"
+      "lwc1     %[f4],      4(%[aRe])                   \n\t"
+      "lwc1     %[f5],      8(%[fft_tmp])               \n\t"
+      "lwc1     %[f6],      4(%[aIm])                   \n\t"
+      "lwc1     %[f7],      12(%[fft_tmp])              \n\t"
+      "add.s    %[f0],      %[f0],         %[f1]        \n\t"
+      "add.s    %[f2],      %[f2],         %[f3]        \n\t"
+      "add.s    %[f4],      %[f4],         %[f5]        \n\t"
+      "add.s    %[f6],      %[f6],         %[f7]        \n\t"
+      "addiu    %[fft_tmp], %[fft_tmp],    16           \n\t"
+      "swc1     %[f0],      0(%[aRe])                   \n\t"
+      "swc1     %[f2],      256(%[aRe])                 \n\t"
+      "swc1     %[f4],      4(%[aRe])                   \n\t"
+      "addiu    %[aRe],     %[aRe],        8            \n\t"
+      "swc1     %[f6],      4(%[aIm])                   \n\t"
+      "addiu    %[aIm],     %[aIm],        8            \n\t"
+     "1:                                                \n\t"
+      "lwc1     %[f0],      0(%[aRe])                   \n\t"
+      "lwc1     %[f1],      0(%[fft_tmp])               \n\t"
+      "lwc1     %[f2],      0(%[aIm])                   \n\t"
+      "lwc1     %[f3],      4(%[fft_tmp])               \n\t"
+      "lwc1     %[f4],      4(%[aRe])                   \n\t"
+      "lwc1     %[f5],      8(%[fft_tmp])               \n\t"
+      "lwc1     %[f6],      4(%[aIm])                   \n\t"
+      "lwc1     %[f7],      12(%[fft_tmp])              \n\t"
+      "add.s    %[f0],      %[f0],         %[f1]        \n\t"
+      "add.s    %[f2],      %[f2],         %[f3]        \n\t"
+      "add.s    %[f4],      %[f4],         %[f5]        \n\t"
+      "add.s    %[f6],      %[f6],         %[f7]        \n\t"
+      "addiu    %[len],     %[len],        -1           \n\t"
+      "addiu    %[fft_tmp], %[fft_tmp],    16           \n\t"
+      "swc1     %[f0],      0(%[aRe])                   \n\t"
+      "swc1     %[f2],      0(%[aIm])                   \n\t"
+      "swc1     %[f4],      4(%[aRe])                   \n\t"
+      "addiu    %[aRe],     %[aRe],        8            \n\t"
+      "swc1     %[f6],      4(%[aIm])                   \n\t"
+      "bgtz     %[len],     1b                          \n\t"
+      " addiu   %[aIm],     %[aIm],        8            \n\t"
+      ".set     pop                                     \n\t"
+      : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+        [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+        [f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
+        [fft_tmp] "=&r" (fft_tmp), [aRe] "+r" (aRe), [aIm] "+r" (aIm)
+      : [fft] "r" (fft)
+      : "memory"
+    );
+  }
+}
+
+void WebRtcAec_OverdriveAndSuppress_mips(AecCore* aec,
+                                         float hNl[PART_LEN1],
+                                         const float hNlFb,
+                                         float efw[2][PART_LEN1]) {
+  int i;
+  const float one = 1.0;
+  float* p_hNl;
+  float* p_efw0;
+  float* p_efw1;
+  float* p_WebRtcAec_wC;
+  float temp1, temp2, temp3, temp4;
+
+  p_hNl = &hNl[0];
+  p_efw0 = &efw[0][0];
+  p_efw1 = &efw[1][0];
+  p_WebRtcAec_wC = (float*)&WebRtcAec_weightCurve[0];
+
+  for (i = 0; i < PART_LEN1; i++) {
+    // Weight subbands
+    __asm __volatile (
+      ".set      push                                              \n\t"
+      ".set      noreorder                                         \n\t"
+      "lwc1      %[temp1],    0(%[p_hNl])                          \n\t"
+      "lwc1      %[temp2],    0(%[p_wC])                           \n\t"
+      "c.lt.s    %[hNlFb],    %[temp1]                             \n\t"
+      "bc1f      1f                                                \n\t"
+      " mul.s    %[temp3],    %[temp2],     %[hNlFb]               \n\t"
+      "sub.s     %[temp4],    %[one],       %[temp2]               \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s     %[temp1],    %[temp1],     %[temp4]               \n\t"
+      "add.s     %[temp1],    %[temp3],     %[temp1]               \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "madd.s    %[temp1],    %[temp3],     %[temp1],   %[temp4]   \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "swc1      %[temp1],    0(%[p_hNl])                          \n\t"
+     "1:                                                           \n\t"
+      "addiu     %[p_wC],     %[p_wC],      4                      \n\t"
+      ".set      pop                                               \n\t"
+      : [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
+        [temp4] "=&f" (temp4), [p_wC] "+r" (p_WebRtcAec_wC)
+      : [hNlFb] "f" (hNlFb), [one] "f" (one), [p_hNl] "r" (p_hNl)
+      : "memory"
+    );
+
+    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+
+    __asm __volatile (
+      "lwc1      %[temp1],    0(%[p_hNl])              \n\t"
+      "lwc1      %[temp3],    0(%[p_efw1])             \n\t"
+      "lwc1      %[temp2],    0(%[p_efw0])             \n\t"
+      "addiu     %[p_hNl],    %[p_hNl],     4          \n\t"
+      "mul.s     %[temp3],    %[temp3],     %[temp1]   \n\t"
+      "mul.s     %[temp2],    %[temp2],     %[temp1]   \n\t"
+      "addiu     %[p_efw0],   %[p_efw0],    4          \n\t"
+      "addiu     %[p_efw1],   %[p_efw1],    4          \n\t"
+      "neg.s     %[temp4],    %[temp3]                 \n\t"
+      "swc1      %[temp2],    -4(%[p_efw0])            \n\t"
+      "swc1      %[temp4],    -4(%[p_efw1])            \n\t"
+      : [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
+        [temp4] "=&f" (temp4), [p_efw0] "+r" (p_efw0), [p_efw1] "+r" (p_efw1),
+        [p_hNl] "+r" (p_hNl)
+      :
+      : "memory"
+    );
+  }
+}
+
+void WebRtcAec_ScaleErrorSignal_mips(AecCore* aec, float ef[2][PART_LEN1]) {
+  const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
+  const float error_threshold = aec->extended_filter_enabled
+                                    ? kExtendedErrorThreshold
+                                    : aec->normal_error_threshold;
+  int len = (PART_LEN1);
+  float* ef0 = ef[0];
+  float* ef1 = ef[1];
+  float* xPow = aec->xPow;
+  float fac1 = 1e-10f;
+  float err_th2 = error_threshold * error_threshold;
+  float f0, f1, f2;
+#if !defined(MIPS32_R2_LE)
+  float f3;
+#endif
+
+  __asm __volatile (
+    ".set       push                                   \n\t"
+    ".set       noreorder                              \n\t"
+   "1:                                                 \n\t"
+    "lwc1       %[f0],     0(%[xPow])                  \n\t"
+    "lwc1       %[f1],     0(%[ef0])                   \n\t"
+    "lwc1       %[f2],     0(%[ef1])                   \n\t"
+    "add.s      %[f0],     %[f0],       %[fac1]        \n\t"
+    "div.s      %[f1],     %[f1],       %[f0]          \n\t"
+    "div.s      %[f2],     %[f2],       %[f0]          \n\t"
+    "mul.s      %[f0],     %[f1],       %[f1]          \n\t"
+#if defined(MIPS32_R2_LE)
+    "madd.s     %[f0],     %[f0],       %[f2],   %[f2] \n\t"
+#else
+    "mul.s      %[f3],     %[f2],       %[f2]          \n\t"
+    "add.s      %[f0],     %[f0],       %[f3]          \n\t"
+#endif
+    "c.le.s     %[f0],     %[err_th2]                  \n\t"
+    "nop                                               \n\t"
+    "bc1t       2f                                     \n\t"
+    " nop                                              \n\t"
+    "sqrt.s     %[f0],     %[f0]                       \n\t"
+    "add.s      %[f0],     %[f0],       %[fac1]        \n\t"
+    "div.s      %[f0],     %[err_th],   %[f0]          \n\t"
+    "mul.s      %[f1],     %[f1],       %[f0]          \n\t"
+    "mul.s      %[f2],     %[f2],       %[f0]          \n\t"
+   "2:                                                 \n\t"
+    "mul.s      %[f1],     %[f1],       %[mu]          \n\t"
+    "mul.s      %[f2],     %[f2],       %[mu]          \n\t"
+    "swc1       %[f1],     0(%[ef0])                   \n\t"
+    "swc1       %[f2],     0(%[ef1])                   \n\t"
+    "addiu      %[len],    %[len],      -1             \n\t"
+    "addiu      %[xPow],   %[xPow],     4              \n\t"
+    "addiu      %[ef0],    %[ef0],      4              \n\t"
+    "bgtz       %[len],    1b                          \n\t"
+    " addiu     %[ef1],    %[ef1],      4              \n\t"
+    ".set       pop                                    \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+#if !defined(MIPS32_R2_LE)
+      [f3] "=&f" (f3),
+#endif
+      [xPow] "+r" (xPow), [ef0] "+r" (ef0), [ef1] "+r" (ef1),
+      [len] "+r" (len)
+    : [fac1] "f" (fac1), [err_th2] "f" (err_th2), [mu] "f" (mu),
+      [err_th] "f" (error_threshold)
+    : "memory"
+  );
+}
+
+void WebRtcAec_InitAec_mips(void) {
+  WebRtcAec_FilterFar = WebRtcAec_FilterFar_mips;
+  WebRtcAec_FilterAdaptation = WebRtcAec_FilterAdaptation_mips;
+  WebRtcAec_ScaleErrorSignal = WebRtcAec_ScaleErrorSignal_mips;
+  WebRtcAec_ComfortNoise = WebRtcAec_ComfortNoise_mips;
+  WebRtcAec_OverdriveAndSuppress = WebRtcAec_OverdriveAndSuppress_mips;
+}
+
--- a/webrtc/modules/audio_processing/aec/aec_core_neon.c
+++ b/webrtc/modules/audio_processing/aec/aec_core_neon.c
@@ -0,0 +1,736 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The core AEC algorithm, neon version of speed-critical functions.
+ *
+ * Based on aec_core_sse2.c.
+ */
+
+#include <arm_neon.h>
+#include <math.h>
+#include <string.h>  // memset
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+enum { kShiftExponentIntoTopMantissa = 8 };
+enum { kFloatExponentShift = 23 };
+
+__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
+  return aRe * bRe - aIm * bIm;
+}
+
+__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
+  return aRe * bIm + aIm * bRe;
+}
+
+static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) {
+  int i;
+  const int num_partitions = aec->num_partitions;
+  for (i = 0; i < num_partitions; i++) {
+    int j;
+    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
+    int pos = i * PART_LEN1;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= num_partitions) {
+      xPos -= num_partitions * PART_LEN1;
+    }
+
+    // vectorized code (four at once)
+    for (j = 0; j + 3 < PART_LEN1; j += 4) {
+      const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
+      const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
+      const float32x4_t wfBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
+      const float32x4_t wfBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
+      const float32x4_t yf_re = vld1q_f32(&yf[0][j]);
+      const float32x4_t yf_im = vld1q_f32(&yf[1][j]);
+      const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re);
+      const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im);
+      const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im);
+      const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re);
+      const float32x4_t g = vaddq_f32(yf_re, e);
+      const float32x4_t h = vaddq_f32(yf_im, f);
+      vst1q_f32(&yf[0][j], g);
+      vst1q_f32(&yf[1][j], h);
+    }
+    // scalar code for the remaining items.
+    for (; j < PART_LEN1; j++) {
+      yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
+                        aec->xfBuf[1][xPos + j],
+                        aec->wfBuf[0][pos + j],
+                        aec->wfBuf[1][pos + j]);
+      yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
+                        aec->xfBuf[1][xPos + j],
+                        aec->wfBuf[0][pos + j],
+                        aec->wfBuf[1][pos + j]);
+    }
+  }
+}
+
+// ARM64's arm_neon.h has already defined vdivq_f32 vsqrtq_f32.
+#if !defined (WEBRTC_ARCH_ARM64)
+static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
+  int i;
+  float32x4_t x = vrecpeq_f32(b);
+  // from arm documentation
+  // The Newton-Raphson iteration:
+  //     x[n+1] = x[n] * (2 - d * x[n])
+  // converges to (1/d) if x0 is the result of VRECPE applied to d.
+  //
+  // Note: The precision did not improve after 2 iterations.
+  for (i = 0; i < 2; i++) {
+    x = vmulq_f32(vrecpsq_f32(b, x), x);
+  }
+  // a/b = a*(1/b)
+  return vmulq_f32(a, x);
+}
+
+static float32x4_t vsqrtq_f32(float32x4_t s) {
+  int i;
+  float32x4_t x = vrsqrteq_f32(s);
+
+  // Code to handle sqrt(0).
+  // If the input to sqrtf() is zero, a zero will be returned.
+  // If the input to vrsqrteq_f32() is zero, positive infinity is returned.
+  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
+  // check for divide by zero
+  const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
+  // zero out the positive infinity results
+  x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero),
+                                      vreinterpretq_u32_f32(x)));
+  // from arm documentation
+  // The Newton-Raphson iteration:
+  //     x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2)
+  // converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
+  //
+  // Note: The precision did not improve after 2 iterations.
+  for (i = 0; i < 2; i++) {
+    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
+  }
+  // sqrt(s) = s * 1/sqrt(s)
+  return vmulq_f32(s, x);;
+}
+#endif  // WEBRTC_ARCH_ARM64
+
+static void ScaleErrorSignalNEON(AecCore* aec, float ef[2][PART_LEN1]) {
+  const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
+  const float error_threshold = aec->extended_filter_enabled ?
+      kExtendedErrorThreshold : aec->normal_error_threshold;
+  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
+  const float32x4_t kMu = vmovq_n_f32(mu);
+  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
+  int i;
+  // vectorized code (four at once)
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {
+    const float32x4_t xPow = vld1q_f32(&aec->xPow[i]);
+    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
+    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
+    const float32x4_t xPowPlus = vaddq_f32(xPow, k1e_10f);
+    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
+    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
+    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
+    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
+    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
+    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
+    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
+    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
+    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
+    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
+    uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger),
+                                     vreinterpretq_u32_f32(ef_re));
+    uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger),
+                                     vreinterpretq_u32_f32(ef_im));
+    ef_re_if = vandq_u32(bigger, ef_re_if);
+    ef_im_if = vandq_u32(bigger, ef_im_if);
+    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
+    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
+    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
+    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
+    vst1q_f32(&ef[0][i], ef_re);
+    vst1q_f32(&ef[1][i], ef_im);
+  }
+  // scalar code for the remaining items.
+  for (; i < PART_LEN1; i++) {
+    float abs_ef;
+    ef[0][i] /= (aec->xPow[i] + 1e-10f);
+    ef[1][i] /= (aec->xPow[i] + 1e-10f);
+    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
+
+    if (abs_ef > error_threshold) {
+      abs_ef = error_threshold / (abs_ef + 1e-10f);
+      ef[0][i] *= abs_ef;
+      ef[1][i] *= abs_ef;
+    }
+
+    // Stepsize factor
+    ef[0][i] *= mu;
+    ef[1][i] *= mu;
+  }
+}
+
+static void FilterAdaptationNEON(AecCore* aec,
+                                 float* fft,
+                                 float ef[2][PART_LEN1]) {
+  int i;
+  const int num_partitions = aec->num_partitions;
+  for (i = 0; i < num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
+    int pos = i * PART_LEN1;
+    int j;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= num_partitions) {
+      xPos -= num_partitions * PART_LEN1;
+    }
+
+    // Process the whole array...
+    for (j = 0; j < PART_LEN; j += 4) {
+      // Load xfBuf and ef.
+      const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
+      const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
+      const float32x4_t ef_re = vld1q_f32(&ef[0][j]);
+      const float32x4_t ef_im = vld1q_f32(&ef[1][j]);
+      // Calculate the product of conjugate(xfBuf) by ef.
+      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
+      //   im(conjugate(a) * b)=  aRe * bIm - aIm * bRe
+      const float32x4_t a = vmulq_f32(xfBuf_re, ef_re);
+      const float32x4_t e = vmlaq_f32(a, xfBuf_im, ef_im);
+      const float32x4_t c = vmulq_f32(xfBuf_re, ef_im);
+      const float32x4_t f = vmlsq_f32(c, xfBuf_im, ef_re);
+      // Interleave real and imaginary parts.
+      const float32x4x2_t g_n_h = vzipq_f32(e, f);
+      // Store
+      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
+      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
+    }
+    // ... and fixup the first imaginary entry.
+    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
+                   -aec->xfBuf[1][xPos + PART_LEN],
+                   ef[0][PART_LEN],
+                   ef[1][PART_LEN]);
+
+    aec_rdft_inverse_128(fft);
+    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
+
+    // fft scaling
+    {
+      const float scale = 2.0f / PART_LEN2;
+      const float32x4_t scale_ps = vmovq_n_f32(scale);
+      for (j = 0; j < PART_LEN; j += 4) {
+        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
+        const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
+        vst1q_f32(&fft[j], fft_scale);
+      }
+    }
+    aec_rdft_forward_128(fft);
+
+    {
+      const float wt1 = aec->wfBuf[1][pos];
+      aec->wfBuf[0][pos + PART_LEN] += fft[1];
+      for (j = 0; j < PART_LEN; j += 4) {
+        float32x4_t wtBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
+        float32x4_t wtBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
+        const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
+        const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
+        const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
+        wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
+        wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);
+
+        vst1q_f32(&aec->wfBuf[0][pos + j], wtBuf_re);
+        vst1q_f32(&aec->wfBuf[1][pos + j], wtBuf_im);
+      }
+      aec->wfBuf[1][pos] = wt1;
+    }
+  }
+}
+
+static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
+  // a^b = exp2(b * log2(a))
+  //   exp2(x) and log2(x) are calculated using polynomial approximations.
+  float32x4_t log2_a, b_log2_a, a_exp_b;
+
+  // Calculate log2(x), x = a.
+  {
+    // To calculate log2(x), we decompose x like this:
+    //   x = y * 2^n
+    //     n is an integer
+    //     y is in the [1.0, 2.0) range
+    //
+    //   log2(x) = log2(y) + n
+    //     n       can be evaluated by playing with float representation.
+    //     log2(y) in a small range can be approximated, this code uses an order
+    //             five polynomial approximation. The coefficients have been
+    //             estimated with the Remez algorithm and the resulting
+    //             polynomial has a maximum relative error of 0.00086%.
+
+    // Compute n.
+    //    This is done by masking the exponent, shifting it into the top bit of
+    //    the mantissa, putting eight into the biased exponent (to shift/
+    //    compensate the fact that the exponent has been shifted in the top/
+    //    fractional part and finally getting rid of the implicit leading one
+    //    from the mantissa by substracting it out.
+    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
+    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
+    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
+    const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
+                                       vec_float_exponent_mask);
+    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
+    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
+    const float32x4_t n =
+        vsubq_f32(vreinterpretq_f32_u32(n_0),
+                  vreinterpretq_f32_u32(vec_implicit_leading_one));
+    // Compute y.
+    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
+    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
+    const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
+                                          vec_mantissa_mask);
+    const float32x4_t y =
+        vreinterpretq_f32_u32(vorrq_u32(mantissa,
+                                        vec_zero_biased_exponent_is_one));
+    // Approximate log2(y) ~= (y - 1) * pol5(y).
+    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
+    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
+    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
+    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
+    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
+    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
+    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
+    float32x4_t pol5_y = C5;
+    pol5_y = vmlaq_f32(C4, y, pol5_y);
+    pol5_y = vmlaq_f32(C3, y, pol5_y);
+    pol5_y = vmlaq_f32(C2, y, pol5_y);
+    pol5_y = vmlaq_f32(C1, y, pol5_y);
+    pol5_y = vmlaq_f32(C0, y, pol5_y);
+    const float32x4_t y_minus_one =
+        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
+    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);
+
+    // Combine parts.
+    log2_a = vaddq_f32(n, log2_y);
+  }
+
+  // b * log2(a)
+  b_log2_a = vmulq_f32(b, log2_a);
+
+  // Calculate exp2(x), x = b * log2(a).
+  {
+    // To calculate 2^x, we decompose x like this:
+    //   x = n + y
+    //     n is an integer, the value of x - 0.5 rounded down, therefore
+    //     y is in the [0.5, 1.5) range
+    //
+    //   2^x = 2^n * 2^y
+    //     2^n can be evaluated by playing with float representation.
+    //     2^y in a small range can be approximated, this code uses an order two
+    //         polynomial approximation. The coefficients have been estimated
+    //         with the Remez algorithm and the resulting polynomial has a
+    //         maximum relative error of 0.17%.
+    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
+    const float32x4_t max_input = vdupq_n_f32(129.f);
+    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
+    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
+    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
+    // Compute n.
+    const float32x4_t half = vdupq_n_f32(0.5f);
+    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
+    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);
+
+    // Compute 2^n.
+    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
+    const int32x4_t two_n_exponent =
+        vaddq_s32(x_minus_half_floor, float_exponent_bias);
+    const float32x4_t two_n =
+        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
+    // Compute y.
+    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));
+
+    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
+    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
+    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
+    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
+    float32x4_t exp2_y = C2;
+    exp2_y = vmlaq_f32(C1, y, exp2_y);
+    exp2_y = vmlaq_f32(C0, y, exp2_y);
+
+    // Combine parts.
+    a_exp_b = vmulq_f32(exp2_y, two_n);
+  }
+
+  return a_exp_b;
+}
+
+static void OverdriveAndSuppressNEON(AecCore* aec,
+                                     float hNl[PART_LEN1],
+                                     const float hNlFb,
+                                     float efw[2][PART_LEN1]) {
+  int i;
+  const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
+  const float32x4_t vec_one = vdupq_n_f32(1.0f);
+  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
+  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);
+
+  // vectorized code (four at once)
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {
+    // Weight subbands
+    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
+    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
+    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
+    const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve,
+                                                        vec_hNlFb);
+    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
+    const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve,
+                                                          vec_hNl);
+    const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger),
+                                         vreinterpretq_u32_f32(vec_hNl));
+    const float32x4_t vec_one_weightCurve_add =
+        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
+    const uint32x4_t vec_if1 =
+        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));
+
+    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));
+
+    {
+      const float32x4_t vec_overDriveCurve =
+          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
+      const float32x4_t vec_overDriveSm_overDriveCurve =
+          vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
+      vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
+      vst1q_f32(&hNl[i], vec_hNl);
+    }
+
+    // Suppress error signal
+    {
+      float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
+      float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
+      vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
+      vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);
+
+      // Ooura fft returns incorrect sign on imaginary component. It matters
+      // here because we are making an additive change with comfort noise.
+      vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
+      vst1q_f32(&efw[0][i], vec_efw_re);
+      vst1q_f32(&efw[1][i], vec_efw_im);
+    }
+  }
+
+  // scalar code for the remaining items.
+  for (; i < PART_LEN1; i++) {
+    // Weight subbands
+    if (hNl[i] > hNlFb) {
+      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
+               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
+    }
+
+    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+
+    // Suppress error signal
+    efw[0][i] *= hNl[i];
+    efw[1][i] *= hNl[i];
+
+    // Ooura fft returns incorrect sign on imaginary component. It matters
+    // here because we are making an additive change with comfort noise.
+    efw[1][i] *= -1;
+  }
+}
+
+static int PartitionDelay(const AecCore* aec) {
+  // Measures the energy in each filter partition and returns the partition with
+  // highest energy.
+  // TODO(bjornv): Spread computational cost by computing one partition per
+  // block?
+  float wfEnMax = 0;
+  int i;
+  int delay = 0;
+
+  for (i = 0; i < aec->num_partitions; i++) {
+    int j;
+    int pos = i * PART_LEN1;
+    float wfEn = 0;
+    float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
+    // vectorized code (four at once)
+    for (j = 0; j + 3 < PART_LEN1; j += 4) {
+      const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
+      const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
+      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
+      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
+    }
+    {
+      float32x2_t vec_total;
+      // A B C D
+      vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
+      // A+B C+D
+      vec_total = vpadd_f32(vec_total, vec_total);
+      // A+B+C+D A+B+C+D
+      wfEn = vget_lane_f32(vec_total, 0);
+    }
+
+    // scalar code for the remaining items.
+    for (; j < PART_LEN1; j++) {
+      wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
+              aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
+    }
+
+    if (wfEn > wfEnMax) {
+      wfEnMax = wfEn;
+      delay = i;
+    }
+  }
+  return delay;
+}
+
+// Updates the following smoothed  Power Spectral Densities (PSD):
+//  - sd  : near-end
+//  - se  : residual echo
+//  - sx  : far-end
+//  - sde : cross-PSD of near-end and residual echo
+//  - sxd : cross-PSD of near-end and far-end
+//
+// In addition to updating the PSDs, also the filter diverge state is determined
+// upon actions are taken.
+static void SmoothedPSD(AecCore* aec,
+                        float efw[2][PART_LEN1],
+                        float dfw[2][PART_LEN1],
+                        float xfw[2][PART_LEN1]) {
+  // Power estimate smoothing coefficients.
+  const float* ptrGCoh = aec->extended_filter_enabled
+      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
+      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
+  int i;
+  float sdSum = 0, seSum = 0;
+  const float32x4_t vec_15 =  vdupq_n_f32(WebRtcAec_kMinFarendPSD);
+  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
+  float32x4_t vec_seSum = vdupq_n_f32(0.0f);
+
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {
+    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
+    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
+    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
+    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
+    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
+    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
+    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
+    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
+    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
+    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
+    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
+    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);
+
+    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
+    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
+    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
+    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
+    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
+    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
+    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);
+
+    vst1q_f32(&aec->sd[i], vec_sd);
+    vst1q_f32(&aec->se[i], vec_se);
+    vst1q_f32(&aec->sx[i], vec_sx);
+
+    {
+      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
+      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
+      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
+      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
+      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
+      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
+      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
+      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
+      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
+      vst2q_f32(&aec->sde[i][0], vec_sde);
+    }
+
+    {
+      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
+      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
+      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
+      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
+      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
+      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
+      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
+      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
+      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
+      vst2q_f32(&aec->sxd[i][0], vec_sxd);
+    }
+
+    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
+    vec_seSum = vaddq_f32(vec_seSum, vec_se);
+  }
+  {
+    float32x2_t vec_sdSum_total;
+    float32x2_t vec_seSum_total;
+    // A B C D
+    vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
+                                vget_high_f32(vec_sdSum));
+    vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
+                                vget_high_f32(vec_seSum));
+    // A+B C+D
+    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
+    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
+    // A+B+C+D A+B+C+D
+    sdSum = vget_lane_f32(vec_sdSum_total, 0);
+    seSum = vget_lane_f32(vec_seSum_total, 0);
+  }
+
+  // scalar code for the remaining items.
+  for (; i < PART_LEN1; i++) {
+    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
+                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
+    aec->se[i] = ptrGCoh[0] * aec->se[i] +
+                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
+    // We threshold here to protect against the ill-effects of a zero farend.
+    // The threshold is not arbitrarily chosen, but balances protection and
+    // adverse interaction with the algorithm's tuning.
+    // TODO(bjornv): investigate further why this is so sensitive.
+    aec->sx[i] =
+        ptrGCoh[0] * aec->sx[i] +
+        ptrGCoh[1] * WEBRTC_SPL_MAX(
+            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
+            WebRtcAec_kMinFarendPSD);
+
+    aec->sde[i][0] =
+        ptrGCoh[0] * aec->sde[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
+    aec->sde[i][1] =
+        ptrGCoh[0] * aec->sde[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
+
+    aec->sxd[i][0] =
+        ptrGCoh[0] * aec->sxd[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
+    aec->sxd[i][1] =
+        ptrGCoh[0] * aec->sxd[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
+
+    sdSum += aec->sd[i];
+    seSum += aec->se[i];
+  }
+
+  // Divergent filter safeguard.
+  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
+
+  if (aec->divergeState)
+    memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
+
+  // Reset if error is significantly larger than nearend (13 dB).
+  if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
+    memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
+}
+
+// Window time domain data to be used by the fft.
+__inline static void WindowData(float* x_windowed, const float* x) {
+  int i;
+  for (i = 0; i < PART_LEN; i += 4) {
+    const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
+    const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
+    const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
+    // A B C D
+    float32x4_t vec_sqrtHanning_rev =
+        vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
+    // B A D C
+    vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
+    // D C B A
+    vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
+                                       vget_low_f32(vec_sqrtHanning_rev));
+    vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
+    vst1q_f32(&x_windowed[PART_LEN + i],
+            vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
+  }
+}
+
+// Puts fft output data into a complex valued array.
+__inline static void StoreAsComplex(const float* data,
+                                    float data_complex[2][PART_LEN1]) {
+  int i;
+  for (i = 0; i < PART_LEN; i += 4) {
+    const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]);
+    vst1q_f32(&data_complex[0][i], vec_data.val[0]);
+    vst1q_f32(&data_complex[1][i], vec_data.val[1]);
+  }
+  // fix beginning/end values
+  data_complex[1][0] = 0;
+  data_complex[1][PART_LEN] = 0;
+  data_complex[0][0] = data[0];
+  data_complex[0][PART_LEN] = data[1];
+}
+
+static void SubbandCoherenceNEON(AecCore* aec,
+                                 float efw[2][PART_LEN1],
+                                 float xfw[2][PART_LEN1],
+                                 float* fft,
+                                 float* cohde,
+                                 float* cohxd) {
+  float dfw[2][PART_LEN1];
+  int i;
+
+  if (aec->delayEstCtr == 0)
+    aec->delayIdx = PartitionDelay(aec);
+
+  // Use delayed far.
+  memcpy(xfw,
+         aec->xfwBuf + aec->delayIdx * PART_LEN1,
+         sizeof(xfw[0][0]) * 2 * PART_LEN1);
+
+  // Windowed near fft
+  WindowData(fft, aec->dBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, dfw);
+
+  // Windowed error fft
+  WindowData(fft, aec->eBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, efw);
+
+  SmoothedPSD(aec, efw, dfw, xfw);
+
+  {
+    const float32x4_t vec_1eminus10 =  vdupq_n_f32(1e-10f);
+
+    // Subband coherence
+    for (i = 0; i + 3 < PART_LEN1; i += 4) {
+      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
+      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
+      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
+      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
+      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
+      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
+      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
+      float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
+      float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
+      vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
+      vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
+      vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
+      vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);
+
+      vst1q_f32(&cohde[i], vec_cohde);
+      vst1q_f32(&cohxd[i], vec_cohxd);
+    }
+  }
+  // scalar code for the remaining items.
+  for (; i < PART_LEN1; i++) {
+    cohde[i] =
+        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
+        (aec->sd[i] * aec->se[i] + 1e-10f);
+    cohxd[i] =
+        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
+        (aec->sx[i] * aec->sd[i] + 1e-10f);
+  }
+}
+
+void WebRtcAec_InitAec_neon(void) {
+  WebRtcAec_FilterFar = FilterFarNEON;
+  WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON;
+  WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
+  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
+  WebRtcAec_SubbandCoherence = SubbandCoherenceNEON;
+}
+
--- a/webrtc/modules/audio_processing/aec/aec_core_sse2.c
+++ b/webrtc/modules/audio_processing/aec/aec_core_sse2.c
@@ -12,35 +12,33 @@
 * The core AEC algorithm, SSE2 version of speed-critical functions.
 */

-#include "typedefs.h"
-
-#if defined(WEBRTC_USE_SSE2)
 #include <emmintrin.h>
 #include <math.h>
+#include <string.h>  // memset

-#include "aec_core.h"
-#include "aec_rdft.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"

-__inline static float MulRe(float aRe, float aIm, float bRe, float bIm)
-{
+__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bRe - aIm * bIm;
 }

-__inline static float MulIm(float aRe, float aIm, float bRe, float bIm)
-{
+__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bIm + aIm * bRe;
 }

-static void FilterFarSSE2(aec_t *aec, float yf[2][PART_LEN1])
-{
+static void FilterFarSSE2(AecCore* aec, float yf[2][PART_LEN1]) {
  int i;
-  for (i = 0; i < NR_PART; i++) {
+  const int num_partitions = aec->num_partitions;
+  for (i = 0; i < num_partitions; i++) {
    int j;
    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
    int pos = i * PART_LEN1;
    // Check for wrap
-    if (i + aec->xfBufBlockPos >= NR_PART) {
-      xPos -= NR_PART*(PART_LEN1);
+    if (i + aec->xfBufBlockPos >= num_partitions) {
+      xPos -= num_partitions * (PART_LEN1);
    }

    // vectorized code (four at once)
@@ -64,19 +62,25 @@ static void FilterFarSSE2(aec_t *aec, float yf[2][PART_LEN1])
    }
    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
-      yf[0][j] += MulRe(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j],
-                        aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]);
-      yf[1][j] += MulIm(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j],
-                        aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]);
+      yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
+                        aec->xfBuf[1][xPos + j],
+                        aec->wfBuf[0][pos + j],
+                        aec->wfBuf[1][pos + j]);
+      yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
+                        aec->xfBuf[1][xPos + j],
+                        aec->wfBuf[0][pos + j],
+                        aec->wfBuf[1][pos + j]);
    }
  }
 }

-static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
-{
+static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1]) {
  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
-  const __m128 kThresh = _mm_set1_ps(aec->errThresh);
-  const __m128 kMu = _mm_set1_ps(aec->mu);
+  const __m128 kMu = aec->extended_filter_enabled ? _mm_set1_ps(kExtendedMu)
+                                                  : _mm_set1_ps(aec->normal_mu);
+  const __m128 kThresh = aec->extended_filter_enabled
+                             ? _mm_set1_ps(kExtendedErrorThreshold)
+                             : _mm_set1_ps(aec->normal_error_threshold);

  int i;
  // vectorized code (four at once)
@@ -110,36 +114,46 @@ static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
    _mm_storeu_ps(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
-  for (; i < (PART_LEN1); i++) {
-    float absEf;
-    ef[0][i] /= (aec->xPow[i] + 1e-10f);
-    ef[1][i] /= (aec->xPow[i] + 1e-10f);
-    absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
+  {
+    const float mu =
+        aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
+    const float error_threshold = aec->extended_filter_enabled
+                                      ? kExtendedErrorThreshold
+                                      : aec->normal_error_threshold;
+    for (; i < (PART_LEN1); i++) {
+      float abs_ef;
+      ef[0][i] /= (aec->xPow[i] + 1e-10f);
+      ef[1][i] /= (aec->xPow[i] + 1e-10f);
+      abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

-    if (absEf > aec->errThresh) {
-      absEf = aec->errThresh / (absEf + 1e-10f);
-      ef[0][i] *= absEf;
-      ef[1][i] *= absEf;
+      if (abs_ef > error_threshold) {
+        abs_ef = error_threshold / (abs_ef + 1e-10f);
+        ef[0][i] *= abs_ef;
+        ef[1][i] *= abs_ef;
+      }
+
+      // Stepsize factor
+      ef[0][i] *= mu;
+      ef[1][i] *= mu;
    }
-
-    // Stepsize factor
-    ef[0][i] *= aec->mu;
-    ef[1][i] *= aec->mu;
  }
 }

-static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
+static void FilterAdaptationSSE2(AecCore* aec,
+                                 float* fft,
+                                 float ef[2][PART_LEN1]) {
  int i, j;
-  for (i = 0; i < NR_PART; i++) {
-    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
+  const int num_partitions = aec->num_partitions;
+  for (i = 0; i < num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
    int pos = i * PART_LEN1;
    // Check for wrap
-    if (i + aec->xfBufBlockPos >= NR_PART) {
-      xPos -= NR_PART * PART_LEN1;
+    if (i + aec->xfBufBlockPos >= num_partitions) {
+      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array...
-    for (j = 0; j < PART_LEN; j+= 4) {
+    for (j = 0; j < PART_LEN; j += 4) {
      // Load xfBuf and ef.
      const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
      const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
@@ -158,22 +172,23 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1])
      const __m128 g = _mm_unpacklo_ps(e, f);
      const __m128 h = _mm_unpackhi_ps(e, f);
      // Store
-      _mm_storeu_ps(&fft[2*j + 0], g);
-      _mm_storeu_ps(&fft[2*j + 4], h);
+      _mm_storeu_ps(&fft[2 * j + 0], g);
+      _mm_storeu_ps(&fft[2 * j + 4], h);
    }
    // ... and fixup the first imaginary entry.
    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
                   -aec->xfBuf[1][xPos + PART_LEN],
-                   ef[0][PART_LEN], ef[1][PART_LEN]);
+                   ef[0][PART_LEN],
+                   ef[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
-    memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN);
+    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      float scale = 2.0f / PART_LEN2;
      const __m128 scale_ps = _mm_load_ps1(&scale);
-      for (j = 0; j < PART_LEN; j+=4) {
+      for (j = 0; j < PART_LEN; j += 4) {
        const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
        const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
        _mm_storeu_ps(&fft[j], fft_scale);
@@ -184,13 +199,15 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1])
    {
      float wt1 = aec->wfBuf[1][pos];
      aec->wfBuf[0][pos + PART_LEN] += fft[1];
-      for (j = 0; j < PART_LEN; j+= 4) {
+      for (j = 0; j < PART_LEN; j += 4) {
        __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
        __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
        const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
        const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
-        const __m128 fft_re = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2 ,0));
-        const __m128 fft_im = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3 ,1));
+        const __m128 fft_re =
+            _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
+        const __m128 fft_im =
+            _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
        wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
        wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
        _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
@@ -201,8 +218,7 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1])
  }
 }

-static __m128 mm_pow_ps(__m128 a, __m128 b)
-{
+static __m128 mm_pow_ps(__m128 a, __m128 b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  __m128 log2_a, b_log2_a, a_exp_b;
@@ -227,55 +243,55 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
    //    compensate the fact that the exponent has been shifted in the top/
    //    fractional part and finally getting rid of the implicit leading one
    //    from the mantissa by substracting it out.
-    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END =
-        {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
-    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END =
-        {0x43800000, 0x43800000, 0x43800000, 0x43800000};
-    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END =
-        {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
+    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {
+        0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
+    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {
+        0x43800000, 0x43800000, 0x43800000, 0x43800000};
+    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {
+        0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
    static const int shift_exponent_into_top_mantissa = 8;
-    const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
-    const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(two_n),
-        shift_exponent_into_top_mantissa));
-    const __m128 n_0 = _mm_or_ps(n_1, *((__m128 *)eight_biased_exponent));
-    const __m128 n   = _mm_sub_ps(n_0,  *((__m128 *)implicit_leading_one));
+    const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask));
+    const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
+        _mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
+    const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent));
+    const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one));

    // Compute y.
-    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END =
-        {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
-    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END =
-        {0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
-    const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask));
-    const __m128 y        = _mm_or_ps(
-        mantissa,  *((__m128 *)zero_biased_exponent_is_one));
+    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {
+        0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
+    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {
+        0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
+    const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask));
+    const __m128 y =
+        _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one));

    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
-    static const ALIGN16_BEG float ALIGN16_END C5[4] =
-        {-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
-    static const ALIGN16_BEG float ALIGN16_END C4[4] =
-        {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
-    static const ALIGN16_BEG float ALIGN16_END C3[4] =
-        {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
-    static const ALIGN16_BEG float ALIGN16_END C2[4] =
-        {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
-    static const ALIGN16_BEG float ALIGN16_END C1[4] =
-        {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
-    static const ALIGN16_BEG float ALIGN16_END C0[4] =
-        {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
-    const __m128 pol5_y_0 = _mm_mul_ps(y,        *((__m128 *)C5));
-    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4));
+    static const ALIGN16_BEG float ALIGN16_END C5[4] = {
+        -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
+    const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5));
+    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4));
    const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
-    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3));
+    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3));
    const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
-    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2));
+    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2));
    const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
-    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1));
+    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1));
    const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
-    const __m128 pol5_y   = _mm_add_ps(pol5_y_8, *((__m128 *)C0));
-    const __m128 y_minus_one = _mm_sub_ps(
-        y, *((__m128 *)zero_biased_exponent_is_one));
-    const __m128 log2_y = _mm_mul_ps(y_minus_one ,  pol5_y);
+    const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0));
+    const __m128 y_minus_one =
+        _mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one));
+    const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = _mm_add_ps(n, log2_y);
@@ -299,38 +315,38 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
    //         maximum relative error of 0.17%.

    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
-    static const ALIGN16_BEG float max_input[4] ALIGN16_END =
-        {129.f, 129.f, 129.f, 129.f};
-    static const ALIGN16_BEG float min_input[4] ALIGN16_END =
-        {-126.99999f, -126.99999f, -126.99999f, -126.99999f};
-    const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input));
-    const __m128 x_max = _mm_max_ps(x_min,    *((__m128 *)min_input));
+    static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f,
+                                                               129.f, 129.f};
+    static const ALIGN16_BEG float min_input[4] ALIGN16_END = {
+        -126.99999f, -126.99999f, -126.99999f, -126.99999f};
+    const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input));
+    const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input));
    // Compute n.
-    static const ALIGN16_BEG float half[4] ALIGN16_END =
-        {0.5f, 0.5f, 0.5f, 0.5f};
-    const __m128  x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half));
+    static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f,
+                                                          0.5f, 0.5f};
+    const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half));
    const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
    // Compute 2^n.
-    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END =
-        {127, 127, 127, 127};
+    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {
+        127, 127, 127, 127};
    static const int float_exponent_shift = 23;
-    const __m128i two_n_exponent = _mm_add_epi32(
-        x_minus_half_floor, *((__m128i *)float_exponent_bias));
-    const __m128  two_n = _mm_castsi128_ps(_mm_slli_epi32(
-        two_n_exponent, float_exponent_shift));
+    const __m128i two_n_exponent =
+        _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias));
+    const __m128 two_n =
+        _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift));
    // Compute y.
    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
-    static const ALIGN16_BEG float C2[4] ALIGN16_END =
-        {3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
-    static const ALIGN16_BEG float C1[4] ALIGN16_END =
-        {6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
-    static const ALIGN16_BEG float C0[4] ALIGN16_END =
-        {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f};
-    const __m128 exp2_y_0 = _mm_mul_ps(y,        *((__m128 *)C2));
-    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1));
+    static const ALIGN16_BEG float C2[4] ALIGN16_END = {
+        3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
+    static const ALIGN16_BEG float C1[4] ALIGN16_END = {
+        6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
+    static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f,
+                                                        1.0017247f, 1.0017247f};
+    const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2));
+    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1));
    const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
-    const __m128 exp2_y   = _mm_add_ps(exp2_y_2, *((__m128 *)C0));
+    const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0));

    // Combine parts.
    a_exp_b = _mm_mul_ps(exp2_y, two_n);
@@ -338,10 +354,8 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
  return a_exp_b;
 }

-extern const float WebRtcAec_weightCurve[65];
-extern const float WebRtcAec_overDriveCurve[65];
-
-static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
+static void OverdriveAndSuppressSSE2(AecCore* aec,
+                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
@@ -350,26 +364,25 @@ static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
  const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
  const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
  // vectorized code (four at once)
-  for (i = 0; i + 3 < PART_LEN1; i+=4) {
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands
    __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
    const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
    const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
-    const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(
-        vec_weightCurve, vec_hNlFb);
+    const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb);
    const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
-    const __m128 vec_one_weightCurve_hNl = _mm_mul_ps(
-        vec_one_weightCurve, vec_hNl);
+    const __m128 vec_one_weightCurve_hNl =
+        _mm_mul_ps(vec_one_weightCurve, vec_hNl);
    const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
    const __m128 vec_if1 = _mm_and_ps(
        bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
    vec_hNl = _mm_or_ps(vec_if0, vec_if1);

    {
-      const __m128 vec_overDriveCurve = _mm_loadu_ps(
-          &WebRtcAec_overDriveCurve[i]);
-      const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps(
-          vec_overDriveSm, vec_overDriveCurve);
+      const __m128 vec_overDriveCurve =
+          _mm_loadu_ps(&WebRtcAec_overDriveCurve[i]);
+      const __m128 vec_overDriveSm_overDriveCurve =
+          _mm_mul_ps(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
      _mm_storeu_ps(&hNl[i], vec_hNl);
    }
@@ -393,7 +406,7 @@ static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
-          (1 - WebRtcAec_weightCurve[i]) * hNl[i];
+               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }
    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

@@ -407,11 +420,312 @@ static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
  }
 }

+__inline static void _mm_add_ps_4x1(__m128 sum, float *dst) {
+  // A+B C+D
+  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)));
+  // A+B+C+D A+B+C+D
+  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
+  _mm_store_ss(dst, sum);
+}
+static int PartitionDelay(const AecCore* aec) {
+  // Measures the energy in each filter partition and returns the partition with
+  // highest energy.
+  // TODO(bjornv): Spread computational cost by computing one partition per
+  // block?
+  float wfEnMax = 0;
+  int i;
+  int delay = 0;
+
+  for (i = 0; i < aec->num_partitions; i++) {
+    int j;
+    int pos = i * PART_LEN1;
+    float wfEn = 0;
+    __m128 vec_wfEn = _mm_set1_ps(0.0f);
+    // vectorized code (four at once)
+    for (j = 0; j + 3 < PART_LEN1; j += 4) {
+      const __m128 vec_wfBuf0 = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
+      const __m128 vec_wfBuf1 = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
+      vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0));
+      vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1));
+    }
+    _mm_add_ps_4x1(vec_wfEn, &wfEn);
+
+    // scalar code for the remaining items.
+    for (; j < PART_LEN1; j++) {
+      wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
+              aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
+    }
+
+    if (wfEn > wfEnMax) {
+      wfEnMax = wfEn;
+      delay = i;
+    }
+  }
+  return delay;
+}
+
+// Updates the following smoothed  Power Spectral Densities (PSD):
+//  - sd  : near-end
+//  - se  : residual echo
+//  - sx  : far-end
+//  - sde : cross-PSD of near-end and residual echo
+//  - sxd : cross-PSD of near-end and far-end
+//
+// In addition to updating the PSDs, also the filter diverge state is determined
+// upon actions are taken.
+static void SmoothedPSD(AecCore* aec,
+                        float efw[2][PART_LEN1],
+                        float dfw[2][PART_LEN1],
+                        float xfw[2][PART_LEN1]) {
+  // Power estimate smoothing coefficients.
+  const float* ptrGCoh = aec->extended_filter_enabled
+      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
+      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
+  int i;
+  float sdSum = 0, seSum = 0;
+  const __m128 vec_15 =  _mm_set1_ps(WebRtcAec_kMinFarendPSD);
+  const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]);
+  const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]);
+  __m128 vec_sdSum = _mm_set1_ps(0.0f);
+  __m128 vec_seSum = _mm_set1_ps(0.0f);
+
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {
+    const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]);
+    const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]);
+    const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]);
+    const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]);
+    const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]);
+    const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]);
+    __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0);
+    __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0);
+    __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0);
+    __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0);
+    __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0);
+    __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0);
+    vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1));
+    vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1));
+    vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1));
+    vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15);
+    vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1));
+    vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1));
+    vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1));
+    _mm_storeu_ps(&aec->sd[i], vec_sd);
+    _mm_storeu_ps(&aec->se[i], vec_se);
+    _mm_storeu_ps(&aec->sx[i], vec_sx);
+
+    {
+      const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]);
+      const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
+      __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
+                                    _MM_SHUFFLE(2, 0, 2, 0));
+      __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
+                                    _MM_SHUFFLE(3, 1, 3, 1));
+      __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0);
+      __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1);
+      vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
+      vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
+      vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011,
+                                  _mm_mul_ps(vec_dfw1, vec_efw1));
+      vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110,
+                                  _mm_mul_ps(vec_dfw1, vec_efw0));
+      vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1));
+      vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1));
+      _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b));
+      _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
+    }
+
+    {
+      const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
+      const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
+      __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
+                                    _MM_SHUFFLE(2, 0, 2, 0));
+      __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
+                                    _MM_SHUFFLE(3, 1, 3, 1));
+      __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0);
+      __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1);
+      vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
+      vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
+      vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011,
+                                  _mm_mul_ps(vec_dfw1, vec_xfw1));
+      vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110,
+                                  _mm_mul_ps(vec_dfw1, vec_xfw0));
+      vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1));
+      vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1));
+      _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b));
+      _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
+    }
+
+    vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd);
+    vec_seSum = _mm_add_ps(vec_seSum, vec_se);
+  }
+
+  _mm_add_ps_4x1(vec_sdSum, &sdSum);
+  _mm_add_ps_4x1(vec_seSum, &seSum);
+
+  for (; i < PART_LEN1; i++) {
+    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
+                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
+    aec->se[i] = ptrGCoh[0] * aec->se[i] +
+                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
+    // We threshold here to protect against the ill-effects of a zero farend.
+    // The threshold is not arbitrarily chosen, but balances protection and
+    // adverse interaction with the algorithm's tuning.
+    // TODO(bjornv): investigate further why this is so sensitive.
+    aec->sx[i] =
+        ptrGCoh[0] * aec->sx[i] +
+        ptrGCoh[1] * WEBRTC_SPL_MAX(
+            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
+            WebRtcAec_kMinFarendPSD);
+
+    aec->sde[i][0] =
+        ptrGCoh[0] * aec->sde[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
+    aec->sde[i][1] =
+        ptrGCoh[0] * aec->sde[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
+
+    aec->sxd[i][0] =
+        ptrGCoh[0] * aec->sxd[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
+    aec->sxd[i][1] =
+        ptrGCoh[0] * aec->sxd[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
+
+    sdSum += aec->sd[i];
+    seSum += aec->se[i];
+  }
+
+  // Divergent filter safeguard.
+  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
+
+  if (aec->divergeState)
+    memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
+
+  // Reset if error is significantly larger than nearend (13 dB).
+  if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
+    memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
+}
+
+// Window time domain data to be used by the fft.
+__inline static void WindowData(float* x_windowed, const float* x) {
+  int i;
+  for (i = 0; i < PART_LEN; i += 4) {
+    const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]);
+    const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]);
+    const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]);
+    // A B C D
+    __m128 vec_sqrtHanning_rev =
+        _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
+    // D C B A
+    vec_sqrtHanning_rev =
+        _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev,
+                       _MM_SHUFFLE(0, 1, 2, 3));
+    _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning));
+    _mm_storeu_ps(&x_windowed[PART_LEN + i],
+                  _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev));
+  }
+}
+
+// Puts fft output data into a complex valued array.
+__inline static void StoreAsComplex(const float* data,
+                                    float data_complex[2][PART_LEN1]) {
+  int i;
+  for (i = 0; i < PART_LEN; i += 4) {
+    const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]);
+    const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]);
+    const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4,
+                                        _MM_SHUFFLE(2, 0, 2, 0));
+    const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4,
+                                        _MM_SHUFFLE(3, 1, 3, 1));
+    _mm_storeu_ps(&data_complex[0][i], vec_a);
+    _mm_storeu_ps(&data_complex[1][i], vec_b);
+  }
+  // fix beginning/end values
+  data_complex[1][0] = 0;
+  data_complex[1][PART_LEN] = 0;
+  data_complex[0][0] = data[0];
+  data_complex[0][PART_LEN] = data[1];
+}
+
+static void SubbandCoherenceSSE2(AecCore* aec,
+                                 float efw[2][PART_LEN1],
+                                 float xfw[2][PART_LEN1],
+                                 float* fft,
+                                 float* cohde,
+                                 float* cohxd) {
+  float dfw[2][PART_LEN1];
+  int i;
+
+  if (aec->delayEstCtr == 0)
+    aec->delayIdx = PartitionDelay(aec);
+
+  // Use delayed far.
+  memcpy(xfw,
+         aec->xfwBuf + aec->delayIdx * PART_LEN1,
+         sizeof(xfw[0][0]) * 2 * PART_LEN1);
+
+  // Windowed near fft
+  WindowData(fft, aec->dBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, dfw);
+
+  // Windowed error fft
+  WindowData(fft, aec->eBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, efw);
+
+  SmoothedPSD(aec, efw, dfw, xfw);
+
+  {
+    const __m128 vec_1eminus10 =  _mm_set1_ps(1e-10f);
+
+    // Subband coherence
+    for (i = 0; i + 3 < PART_LEN1; i += 4) {
+      const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]);
+      const __m128 vec_se = _mm_loadu_ps(&aec->se[i]);
+      const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]);
+      const __m128 vec_sdse = _mm_add_ps(vec_1eminus10,
+                                         _mm_mul_ps(vec_sd, vec_se));
+      const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10,
+                                         _mm_mul_ps(vec_sd, vec_sx));
+      const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]);
+      const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
+      const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
+      const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
+      const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
+                                              _MM_SHUFFLE(2, 0, 2, 0));
+      const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
+                                              _MM_SHUFFLE(3, 1, 3, 1));
+      const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
+                                              _MM_SHUFFLE(2, 0, 2, 0));
+      const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
+                                              _MM_SHUFFLE(3, 1, 3, 1));
+      __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0);
+      __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0);
+      vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1));
+      vec_cohde = _mm_div_ps(vec_cohde, vec_sdse);
+      vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1));
+      vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx);
+      _mm_storeu_ps(&cohde[i], vec_cohde);
+      _mm_storeu_ps(&cohxd[i], vec_cohxd);
+    }
+
+    // scalar code for the remaining items.
+    for (; i < PART_LEN1; i++) {
+      cohde[i] =
+          (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
+          (aec->sd[i] * aec->se[i] + 1e-10f);
+      cohxd[i] =
+          (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
+          (aec->sx[i] * aec->sd[i] + 1e-10f);
+    }
+  }
+}
+
 void WebRtcAec_InitAec_SSE2(void) {
  WebRtcAec_FilterFar = FilterFarSSE2;
  WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
  WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
+  WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2;
 }
-
-#endif   // WEBRTC_USE_SSE2
--- a/webrtc/modules/audio_processing/aec/aec_rdft.c
+++ b/webrtc/modules/audio_processing/aec/aec_rdft.c
@@ -19,200 +19,193 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "aec_rdft.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"

 #include <math.h>

-#include "system_wrappers/interface/cpu_features_wrapper.h"
-#include "typedefs.h"
+#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
+#include "webrtc/typedefs.h"

-// constants shared by all paths (C, SSE2).
-float rdft_w[64];
-// constants used by the C path.
-float rdft_wk3ri_first[32];
-float rdft_wk3ri_second[32];
-// constants used by SSE2 but initialized in C path.
-ALIGN16_BEG float ALIGN16_END rdft_wk1r[32];
-ALIGN16_BEG float ALIGN16_END rdft_wk2r[32];
-ALIGN16_BEG float ALIGN16_END rdft_wk3r[32];
-ALIGN16_BEG float ALIGN16_END rdft_wk1i[32];
-ALIGN16_BEG float ALIGN16_END rdft_wk2i[32];
-ALIGN16_BEG float ALIGN16_END rdft_wk3i[32];
-ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4];
+// These tables used to be computed at run-time. For example, refer to:
+// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
+// to see the initialization code.
+const float rdft_w[64] = {
+    1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
+    0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
+    0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
+    0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
+    0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
+    0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
+    0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
+    0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
+    0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
+    0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
+    0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
+    0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
+    0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
+    0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
+    0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
+    0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
+};
+const float rdft_wk3ri_first[16] = {
+    1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
+    0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
+    0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
+    0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
+};
+const float rdft_wk3ri_second[16] = {
+    -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
+    -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
+    -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
+    -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
+    1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
+    0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
+    0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
+    0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
+    0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
+    0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
+    0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
+    0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
+    1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
+    0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
+    0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
+    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
+    0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
+    0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
+    0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
+    0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
+    1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
+    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
+    0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
+    -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
+    0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
+    0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
+    0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
+    -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
+    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
+    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
+    -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
+    -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
+    -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
+    -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
+    -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
+    -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
+    -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
+    -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
+    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
+    -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
+    -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
+    -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
+    -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
+    -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
+    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
+    -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
+    -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
+    -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
+    -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
+    -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
+    -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
+    -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
+};
+ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
+    0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
+};

-static int ip[16];
+static void bitrv2_128_C(float* a) {
+  /*
+      Following things have been attempted but are no faster:
+      (a) Storing the swap indexes in a LUT (index calculations are done
+          for 'free' while waiting on memory/L1).
+      (b) Consolidate the load/store of two consecutive floats by a 64 bit
+          integer (execution is memory/L1 bound).
+      (c) Do a mix of floats and 64 bit integer to maximize register
+          utilization (execution is memory/L1 bound).
+      (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
+      (e) Hard-coding of the offsets to completely eliminates index
+          calculations.
+  */

-static void bitrv2_32or128(int n, int *ip, float *a) {
-  // n is 32 or 128
-  int j, j1, k, k1, m, m2;
+  unsigned int j, j1, k, k1;
  float xr, xi, yr, yi;

-  ip[0] = 0;
-  {
-    int l = n;
-    m = 1;
-    while ((m << 3) < l) {
-      l >>= 1;
-      for (j = 0; j < m; j++) {
-        ip[m + j] = ip[j] + l;
-      }
-      m <<= 1;
-    }
-  }
-  m2 = 2 * m;
-  for (k = 0; k < m; k++) {
+  static const int ip[4] = {0, 64, 32, 96};
+  for (k = 0; k < 4; k++) {
    for (j = 0; j < k; j++) {
      j1 = 2 * j + ip[k];
      k1 = 2 * k + ip[j];
-      xr = a[j1];
+      xr = a[j1 + 0];
      xi = a[j1 + 1];
-      yr = a[k1];
+      yr = a[k1 + 0];
      yi = a[k1 + 1];
-      a[j1] = yr;
+      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
-      a[k1] = xr;
+      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
-      j1 += m2;
-      k1 += 2 * m2;
-      xr = a[j1];
+      j1 += 8;
+      k1 += 16;
+      xr = a[j1 + 0];
      xi = a[j1 + 1];
-      yr = a[k1];
+      yr = a[k1 + 0];
      yi = a[k1 + 1];
-      a[j1] = yr;
+      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
-      a[k1] = xr;
+      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
-      j1 += m2;
-      k1 -= m2;
-      xr = a[j1];
+      j1 += 8;
+      k1 -= 8;
+      xr = a[j1 + 0];
      xi = a[j1 + 1];
-      yr = a[k1];
+      yr = a[k1 + 0];
      yi = a[k1 + 1];
-      a[j1] = yr;
+      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
-      a[k1] = xr;
+      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
-      j1 += m2;
-      k1 += 2 * m2;
-      xr = a[j1];
+      j1 += 8;
+      k1 += 16;
+      xr = a[j1 + 0];
      xi = a[j1 + 1];
-      yr = a[k1];
+      yr = a[k1 + 0];
      yi = a[k1 + 1];
-      a[j1] = yr;
+      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
-      a[k1] = xr;
+      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
    }
-    j1 = 2 * k + m2 + ip[k];
-    k1 = j1 + m2;
-    xr = a[j1];
+    j1 = 2 * k + 8 + ip[k];
+    k1 = j1 + 8;
+    xr = a[j1 + 0];
    xi = a[j1 + 1];
-    yr = a[k1];
+    yr = a[k1 + 0];
    yi = a[k1 + 1];
-    a[j1] = yr;
+    a[j1 + 0] = yr;
    a[j1 + 1] = yi;
-    a[k1] = xr;
+    a[k1 + 0] = xr;
    a[k1 + 1] = xi;
  }
 }

-static void makewt_32(void) {
-  const int nw = 32;
-  int j, nwh;
-  float delta, x, y;
-
-  ip[0] = nw;
-  ip[1] = 1;
-  nwh = nw >> 1;
-  delta = atanf(1.0f) / nwh;
-  rdft_w[0] = 1;
-  rdft_w[1] = 0;
-  rdft_w[nwh] = cosf(delta * nwh);
-  rdft_w[nwh + 1] = rdft_w[nwh];
-  for (j = 2; j < nwh; j += 2) {
-    x = cosf(delta * j);
-    y = sinf(delta * j);
-    rdft_w[j] = x;
-    rdft_w[j + 1] = y;
-    rdft_w[nw - j] = y;
-    rdft_w[nw - j + 1] = x;
-  }
-  bitrv2_32or128(nw, ip + 2, rdft_w);
-
-  // pre-calculate constants used by cft1st_128 and cftmdl_128...
-  cftmdl_wk1r[0] = rdft_w[2];
-  cftmdl_wk1r[1] = rdft_w[2];
-  cftmdl_wk1r[2] = rdft_w[2];
-  cftmdl_wk1r[3] = -rdft_w[2];
-  {
-    int k1;
-
-    for (k1 = 0, j = 0; j < 128; j += 16, k1 += 2) {
-      const int k2 = 2 * k1;
-      const float wk2r = rdft_w[k1 + 0];
-      const float wk2i = rdft_w[k1 + 1];
-      float wk1r, wk1i;
-      // ... scalar version.
-      wk1r = rdft_w[k2 + 0];
-      wk1i = rdft_w[k2 + 1];
-      rdft_wk3ri_first[k1 + 0] = wk1r - 2 * wk2i * wk1i;
-      rdft_wk3ri_first[k1 + 1] = 2 * wk2i * wk1r - wk1i;
-      wk1r = rdft_w[k2 + 2];
-      wk1i = rdft_w[k2 + 3];
-      rdft_wk3ri_second[k1 + 0] = wk1r - 2 * wk2r * wk1i;
-      rdft_wk3ri_second[k1 + 1] = 2 * wk2r * wk1r - wk1i;
-      // ... vector version.
-      rdft_wk1r[k2 + 0] = rdft_w[k2 + 0];
-      rdft_wk1r[k2 + 1] = rdft_w[k2 + 0];
-      rdft_wk1r[k2 + 2] = rdft_w[k2 + 2];
-      rdft_wk1r[k2 + 3] = rdft_w[k2 + 2];
-      rdft_wk2r[k2 + 0] = rdft_w[k1 + 0];
-      rdft_wk2r[k2 + 1] = rdft_w[k1 + 0];
-      rdft_wk2r[k2 + 2] = -rdft_w[k1 + 1];
-      rdft_wk2r[k2 + 3] = -rdft_w[k1 + 1];
-      rdft_wk3r[k2 + 0] = rdft_wk3ri_first[k1 + 0];
-      rdft_wk3r[k2 + 1] = rdft_wk3ri_first[k1 + 0];
-      rdft_wk3r[k2 + 2] = rdft_wk3ri_second[k1 + 0];
-      rdft_wk3r[k2 + 3] = rdft_wk3ri_second[k1 + 0];
-      rdft_wk1i[k2 + 0] = -rdft_w[k2 + 1];
-      rdft_wk1i[k2 + 1] = rdft_w[k2 + 1];
-      rdft_wk1i[k2 + 2] = -rdft_w[k2 + 3];
-      rdft_wk1i[k2 + 3] = rdft_w[k2 + 3];
-      rdft_wk2i[k2 + 0] = -rdft_w[k1 + 1];
-      rdft_wk2i[k2 + 1] = rdft_w[k1 + 1];
-      rdft_wk2i[k2 + 2] = -rdft_w[k1 + 0];
-      rdft_wk2i[k2 + 3] = rdft_w[k1 + 0];
-      rdft_wk3i[k2 + 0] = -rdft_wk3ri_first[k1 + 1];
-      rdft_wk3i[k2 + 1] = rdft_wk3ri_first[k1 + 1];
-      rdft_wk3i[k2 + 2] = -rdft_wk3ri_second[k1 + 1];
-      rdft_wk3i[k2 + 3] = rdft_wk3ri_second[k1 + 1];
-    }
-  }
-}
-
-static void makect_32(void) {
-  float *c = rdft_w + 32;
-  const int nc = 32;
-  int j, nch;
-  float delta;
-
-  ip[1] = nc;
-  nch = nc >> 1;
-  delta = atanf(1.0f) / nch;
-  c[0] = cosf(delta * nch);
-  c[nch] = 0.5f * c[0];
-  for (j = 1; j < nch; j++) {
-    c[j] = 0.5f * cosf(delta * j);
-    c[nc - j] = 0.5f * sinf(delta * j);
-  }
-}
-
-static void cft1st_128_C(float *a) {
+static void cft1st_128_C(float* a) {
  const int n = 128;
  int j, k1, k2;
  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

+  // The processing of the first set of elements was simplified in C to avoid
+  // some operations (multiplication by zero or one, addition of two elements
+  // multiplied by the same weight, ...).
  x0r = a[0] + a[2];
  x0i = a[1] + a[3];
  x1r = a[0] - a[2];
@@ -311,7 +304,7 @@ static void cft1st_128_C(float *a) {
  }
 }

-static void cftmdl_128_C(float *a) {
+static void cftmdl_128_C(float* a) {
  const int l = 8;
  const int n = 128;
  const int m = 32;
@@ -320,7 +313,7 @@ static void cftmdl_128_C(float *a) {
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  for (j0 = 0; j0 < l; j0 += 2) {
-    j1 = j0 +  8;
+    j1 = j0 + 8;
    j2 = j0 + 16;
    j3 = j0 + 24;
    x0r = a[j0 + 0] + a[j1 + 0];
@@ -342,7 +335,7 @@ static void cftmdl_128_C(float *a) {
  }
  wk1r = rdft_w[2];
  for (j0 = m; j0 < l + m; j0 += 2) {
-    j1 = j0 +  8;
+    j1 = j0 + 8;
    j2 = j0 + 16;
    j3 = j0 + 24;
    x0r = a[j0 + 0] + a[j1 + 0];
@@ -378,7 +371,7 @@ static void cftmdl_128_C(float *a) {
    wk3r = rdft_wk3ri_first[k1 + 0];
    wk3i = rdft_wk3ri_first[k1 + 1];
    for (j0 = k; j0 < l + k; j0 += 2) {
-      j1 = j0 +  8;
+      j1 = j0 + 8;
      j2 = j0 + 16;
      j3 = j0 + 24;
      x0r = a[j0 + 0] + a[j1 + 0];
@@ -409,7 +402,7 @@ static void cftmdl_128_C(float *a) {
    wk3r = rdft_wk3ri_second[k1 + 0];
    wk3i = rdft_wk3ri_second[k1 + 1];
    for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
-      j1 = j0 +  8;
+      j1 = j0 + 8;
      j2 = j0 + 16;
      j3 = j0 + 24;
      x0r = a[j0 + 0] + a[j1 + 0];
@@ -438,7 +431,7 @@ static void cftmdl_128_C(float *a) {
  }
 }

-static void cftfsub_128(float *a) {
+static void cftfsub_128_C(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

@@ -468,7 +461,7 @@ static void cftfsub_128(float *a) {
  }
 }

-static void cftbsub_128(float *a) {
+static void cftbsub_128_C(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

@@ -499,14 +492,14 @@ static void cftbsub_128(float *a) {
  }
 }

-static void rftfsub_128_C(float *a) {
-  const float *c = rdft_w + 32;
+static void rftfsub_128_C(float* a) {
+  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
-    k1 =  32 - j1;
+    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
@@ -520,15 +513,15 @@ static void rftfsub_128_C(float *a) {
  }
 }

-static void rftbsub_128_C(float *a) {
-  const float *c = rdft_w + 32;
+static void rftbsub_128_C(float* a) {
+  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  a[1] = -a[1];
  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
-    k1 =  32 - j1;
+    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
@@ -543,11 +536,9 @@ static void rftbsub_128_C(float *a) {
  a[65] = -a[65];
 }

-void aec_rdft_forward_128(float *a) {
-  const int n = 128;
+void aec_rdft_forward_128(float* a) {
  float xi;
-
-  bitrv2_32or128(n, ip + 2, a);
+  bitrv2_128(a);
  cftfsub_128(a);
  rftfsub_128(a);
  xi = a[0] - a[1];
@@ -555,33 +546,44 @@ void aec_rdft_forward_128(float *a) {
  a[1] = xi;
 }

-void aec_rdft_inverse_128(float *a) {
-  const int n = 128;
-
+void aec_rdft_inverse_128(float* a) {
  a[1] = 0.5f * (a[0] - a[1]);
  a[0] -= a[1];
  rftbsub_128(a);
-  bitrv2_32or128(n, ip + 2, a);
+  bitrv2_128(a);
  cftbsub_128(a);
 }

 // code path selection
-rft_sub_128_t cft1st_128;
-rft_sub_128_t cftmdl_128;
-rft_sub_128_t rftfsub_128;
-rft_sub_128_t rftbsub_128;
+RftSub128 cft1st_128;
+RftSub128 cftmdl_128;
+RftSub128 rftfsub_128;
+RftSub128 rftbsub_128;
+RftSub128 cftfsub_128;
+RftSub128 cftbsub_128;
+RftSub128 bitrv2_128;

 void aec_rdft_init(void) {
  cft1st_128 = cft1st_128_C;
  cftmdl_128 = cftmdl_128_C;
  rftfsub_128 = rftfsub_128_C;
  rftbsub_128 = rftbsub_128_C;
+  cftfsub_128 = cftfsub_128_C;
+  cftbsub_128 = cftbsub_128_C;
+  bitrv2_128 = bitrv2_128_C;
+#if defined(WEBRTC_ARCH_X86_FAMILY)
  if (WebRtc_GetCPUInfo(kSSE2)) {
-#if defined(WEBRTC_USE_SSE2)
    aec_rdft_init_sse2();
-#endif
  }
-  // init library constants.
-  makewt_32();
-  makect_32();
+#endif
+#if defined(MIPS_FPU_LE)
+  aec_rdft_init_mips();
+#endif
+#if defined(WEBRTC_HAS_NEON)
+  aec_rdft_init_neon();
+#elif defined(WEBRTC_DETECT_NEON)
+  if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
+    aec_rdft_init_neon();
+  }
+#endif
 }
--- a/webrtc/modules/audio_processing/aec/aec_rdft.h
+++ b/webrtc/modules/audio_processing/aec/aec_rdft.h
@@ -11,6 +11,8 @@
 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
 #define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_

+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+
 // These intrinsics were unavailable before VS 2008.
 // TODO(andrew): move to a common file.
 #if defined(_MSC_VER) && _MSC_VER < 1500
@@ -19,39 +21,41 @@ static __inline __m128 _mm_castsi128_ps(__m128i a) { return *(__m128*)&a; }
 static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
 #endif

-#ifdef _MSC_VER /* visual c++ */
-# define ALIGN16_BEG __declspec(align(16))
-# define ALIGN16_END
-#else /* gcc or icc */
-# define ALIGN16_BEG
-# define ALIGN16_END __attribute__((aligned(16)))
-#endif
-
-// constants shared by all paths (C, SSE2).
-extern float rdft_w[64];
-// constants used by the C path.
-extern float rdft_wk3ri_first[32];
-extern float rdft_wk3ri_second[32];
-// constants used by SSE2 but initialized in C path.
-extern float rdft_wk1r[32];
-extern float rdft_wk2r[32];
-extern float rdft_wk3r[32];
-extern float rdft_wk1i[32];
-extern float rdft_wk2i[32];
-extern float rdft_wk3i[32];
-extern float cftmdl_wk1r[4];
+// Constants shared by all paths (C, SSE2, NEON).
+extern const float rdft_w[64];
+// Constants used by the C path.
+extern const float rdft_wk3ri_first[16];
+extern const float rdft_wk3ri_second[16];
+// Constants used by SSE2 and NEON but initialized in the C path.
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32];
+extern ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4];

 // code path selection function pointers
-typedef void (*rft_sub_128_t)(float *a);
-extern rft_sub_128_t rftfsub_128;
-extern rft_sub_128_t rftbsub_128;
-extern rft_sub_128_t cft1st_128;
-extern rft_sub_128_t cftmdl_128;
+typedef void (*RftSub128)(float* a);
+extern RftSub128 rftfsub_128;
+extern RftSub128 rftbsub_128;
+extern RftSub128 cft1st_128;
+extern RftSub128 cftmdl_128;
+extern RftSub128 cftfsub_128;
+extern RftSub128 cftbsub_128;
+extern RftSub128 bitrv2_128;

 // entry points
 void aec_rdft_init(void);
 void aec_rdft_init_sse2(void);
-void aec_rdft_forward_128(float *a);
-void aec_rdft_inverse_128(float *a);
+void aec_rdft_forward_128(float* a);
+void aec_rdft_inverse_128(float* a);
+
+#if defined(MIPS_FPU_LE)
+void aec_rdft_init_mips(void);
+#endif
+#if defined(WEBRTC_DETECT_NEON) || defined(WEBRTC_HAS_NEON)
+void aec_rdft_init_neon(void);
+#endif

 #endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
--- a/webrtc/modules/audio_processing/aec/aec_rdft_mips.c
+++ b/webrtc/modules/audio_processing/aec/aec_rdft_mips.c
--- a/webrtc/modules/audio_processing/aec/aec_rdft_neon.c
+++ b/webrtc/modules/audio_processing/aec/aec_rdft_neon.c
@@ -0,0 +1,355 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The rdft AEC algorithm, neon version of speed-critical functions.
+ *
+ * Based on the sse2 version.
+ */
+
+
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+#include <arm_neon.h>
+
+static const ALIGN16_BEG float ALIGN16_END
+    k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
+
+static void cft1st_128_neon(float* a) {
+  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
+  int j, k2;
+
+  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
+    float32x4_t a00v = vld1q_f32(&a[j + 0]);
+    float32x4_t a04v = vld1q_f32(&a[j + 4]);
+    float32x4_t a08v = vld1q_f32(&a[j + 8]);
+    float32x4_t a12v = vld1q_f32(&a[j + 12]);
+    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
+    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
+    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
+    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
+    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
+    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
+    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
+    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
+    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
+    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
+    float32x4_t x0v = vaddq_f32(a01v, a23v);
+    const float32x4_t x1v = vsubq_f32(a01v, a23v);
+    const float32x4_t x2v = vaddq_f32(a45v, a67v);
+    const float32x4_t x3v = vsubq_f32(a45v, a67v);
+    const float32x4_t x3w = vrev64q_f32(x3v);
+    float32x4_t x0w;
+    a01v = vaddq_f32(x0v, x2v);
+    x0v = vsubq_f32(x0v, x2v);
+    x0w = vrev64q_f32(x0v);
+    a45v = vmulq_f32(wk2rv, x0v);
+    a45v = vmlaq_f32(a45v, wk2iv, x0w);
+    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
+    x0w = vrev64q_f32(x0v);
+    a23v = vmulq_f32(wk1rv, x0v);
+    a23v = vmlaq_f32(a23v, wk1iv, x0w);
+    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
+    x0w = vrev64q_f32(x0v);
+    a67v = vmulq_f32(wk3rv, x0v);
+    a67v = vmlaq_f32(a67v, wk3iv, x0w);
+    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
+    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
+    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
+    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
+    vst1q_f32(&a[j + 0], a00v);
+    vst1q_f32(&a[j + 4], a04v);
+    vst1q_f32(&a[j + 8], a08v);
+    vst1q_f32(&a[j + 12], a12v);
+  }
+}
+
+static void cftmdl_128_neon(float* a) {
+  int j;
+  const int l = 8;
+  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
+  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
+
+  for (j = 0; j < l; j += 2) {
+    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
+    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
+    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
+    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
+    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
+    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
+    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
+    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
+    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
+    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
+    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
+    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
+    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
+    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
+    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
+    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
+    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
+    const float32x4_t x1_x3_add =
+        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+    const float32x4_t x1_x3_sub =
+        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
+    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
+    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
+    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
+    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
+    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
+    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
+    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
+    const float32x4_t xx1_rev = vrev64q_f32(xx1);
+    const float32x4_t yy4_rev = vrev64q_f32(yy4);
+
+    vst1_f32(&a[j + 0], vget_low_f32(xx0));
+    vst1_f32(&a[j + 32], vget_high_f32(xx0));
+    vst1_f32(&a[j + 16], vget_low_f32(xx1));
+    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
+
+    a[j + 48] = -a[j + 48];
+
+    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
+    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
+    vst1_f32(&a[j + 40], vget_low_f32(yy4));
+    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
+  }
+
+  {
+    const int k = 64;
+    const int k1 = 2;
+    const int k2 = 2 * k1;
+    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
+    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
+    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
+    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
+    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
+    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
+    for (j = k; j < l + k; j += 2) {
+      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
+      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
+      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
+      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
+      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
+      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
+      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
+      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
+      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
+      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
+      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
+      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
+      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
+      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
+      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
+      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
+      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
+      const float32x4_t x1_x3_add =
+          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+      const float32x4_t x1_x3_sub =
+          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
+      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
+      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
+      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
+      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
+      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
+
+      vst1_f32(&a[j + 0], vget_low_f32(xx));
+      vst1_f32(&a[j + 32], vget_high_f32(xx));
+      vst1_f32(&a[j + 16], vget_low_f32(xx4));
+      vst1_f32(&a[j + 48], vget_high_f32(xx4));
+      vst1_f32(&a[j + 8], vget_low_f32(xx12));
+      vst1_f32(&a[j + 40], vget_high_f32(xx12));
+      vst1_f32(&a[j + 24], vget_low_f32(xx22));
+      vst1_f32(&a[j + 56], vget_high_f32(xx22));
+    }
+  }
+}
+
+__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
+  // A B C D -> C D A B
+  const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
+  // C D A B -> D C B A
+  return vrev64q_f32(rev);
+}
+
+static void rftfsub_128_neon(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2;
+  const float32x4_t mm_half = vdupq_n_f32(0.5f);
+
+  // Vectorized code (four at once).
+  // Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
+    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
+    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
+    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
+    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    //   2,   4,   6,   8,   3,   5,   7,   9
+    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
+    // 120, 122, 124, 126, 121, 123, 125, 127,
+    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
+    // 126, 124, 122, 120
+    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
+    // 127, 125, 123, 121
+    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
+    // Calculate 'x'.
+    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
+    // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr - wki * xi;
+    //    yi = wkr * xi + wki * xr;
+    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
+    const float32x4_t b_ = vmulq_f32(wki_, xi_);
+    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
+    const float32x4_t d_ = vmulq_f32(wki_, xr_);
+    const float32x4_t yr_ = vsubq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t yi_ = vaddq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+                                                // Update 'a'.
+                                                //    a[j2 + 0] -= yr;
+                                                //    a[j2 + 1] -= yi;
+                                                //    a[k2 + 0] += yr;
+                                                //    a[k2 + 1] -= yi;
+    // 126, 124, 122, 120,
+    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
+    // 127, 125, 123, 121,
+    const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
+    // Shuffle in right order and store.
+    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
+    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
+    // 124, 125, 126, 127, 120, 121, 122, 123
+    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
+    //   2,   4,   6,   8,
+    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
+    //   3,   5,   7,   9,
+    a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
+    //   2,   3,   4,   5,   6,   7,   8,   9,
+    vst2q_f32(&a[0 + j2], a_j2_p);
+
+    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
+    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
+  }
+
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    const int k2 = 128 - j2;
+    const int k1 = 32 - j1;
+    const float wkr = 0.5f - c[k1];
+    const float wki = c[j1];
+    const float xr = a[j2 + 0] - a[k2 + 0];
+    const float xi = a[j2 + 1] + a[k2 + 1];
+    const float yr = wkr * xr - wki * xi;
+    const float yi = wkr * xi + wki * xr;
+    a[j2 + 0] -= yr;
+    a[j2 + 1] -= yi;
+    a[k2 + 0] += yr;
+    a[k2 + 1] -= yi;
+  }
+}
+
+static void rftbsub_128_neon(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2;
+  const float32x4_t mm_half = vdupq_n_f32(0.5f);
+
+  a[1] = -a[1];
+  // Vectorized code (four at once).
+  //    Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const float32x4_t c_j1 = vld1q_f32(&c[j1]);         //  1,  2,  3,  4,
+    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);    // 28, 29, 30, 31,
+    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);  // 28, 29, 30, 31,
+    const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
+    const float32x4_t wki_ = c_j1;                      //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    //   2,   4,   6,   8,   3,   5,   7,   9
+    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
+    // 120, 122, 124, 126, 121, 123, 125, 127,
+    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
+    // 126, 124, 122, 120
+    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
+    // 127, 125, 123, 121
+    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
+    // Calculate 'x'.
+    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
+    // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr - wki * xi;
+    //    yi = wkr * xi + wki * xr;
+    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
+    const float32x4_t b_ = vmulq_f32(wki_, xi_);
+    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
+    const float32x4_t d_ = vmulq_f32(wki_, xr_);
+    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+                                                // Update 'a'.
+                                                //    a[j2 + 0] -= yr;
+                                                //    a[j2 + 1] -= yi;
+                                                //    a[k2 + 0] += yr;
+                                                //    a[k2 + 1] -= yi;
+    // 126, 124, 122, 120,
+    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
+    // 127, 125, 123, 121,
+    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
+    // Shuffle in right order and store.
+    //   2,   3,   4,   5,   6,   7,   8,   9,
+    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
+    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
+    // 124, 125, 126, 127, 120, 121, 122, 123
+    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
+    //   2,   4,   6,   8,
+    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
+    //   3,   5,   7,   9,
+    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
+    //   2,   3,   4,   5,   6,   7,   8,   9,
+    vst2q_f32(&a[0 + j2], a_j2_p);
+
+    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
+    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
+  }
+
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    const int k2 = 128 - j2;
+    const int k1 = 32 - j1;
+    const float wkr = 0.5f - c[k1];
+    const float wki = c[j1];
+    const float xr = a[j2 + 0] - a[k2 + 0];
+    const float xi = a[j2 + 1] + a[k2 + 1];
+    const float yr = wkr * xr + wki * xi;
+    const float yi = wkr * xi - wki * xr;
+    a[j2 + 0] = a[j2 + 0] - yr;
+    a[j2 + 1] = yi - a[j2 + 1];
+    a[k2 + 0] = yr + a[k2 + 0];
+    a[k2 + 1] = yi - a[k2 + 1];
+  }
+  a[65] = -a[65];
+}
+
+void aec_rdft_init_neon(void) {
+  cft1st_128 = cft1st_128_neon;
+  cftmdl_128 = cftmdl_128_neon;
+  rftfsub_128 = rftfsub_128_neon;
+  rftbsub_128 = rftbsub_128_neon;
+}
+
--- a/webrtc/modules/audio_processing/aec/aec_rdft_sse2.c
+++ b/webrtc/modules/audio_processing/aec/aec_rdft_sse2.c
@@ -8,172 +8,168 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "typedefs.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"

-#if defined(WEBRTC_USE_SSE2)
 #include <emmintrin.h>

-#include "aec_rdft.h"
+static const ALIGN16_BEG float ALIGN16_END
+    k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};

-static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] =
-  {-1.f, 1.f, -1.f, 1.f};
-
-static void cft1st_128_SSE2(float *a) {
+static void cft1st_128_SSE2(float* a) {
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
  int j, k2;

  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
-          __m128 a00v   = _mm_loadu_ps(&a[j +  0]);
-          __m128 a04v   = _mm_loadu_ps(&a[j +  4]);
-          __m128 a08v   = _mm_loadu_ps(&a[j +  8]);
-          __m128 a12v   = _mm_loadu_ps(&a[j + 12]);
-          __m128 a01v   = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1 ,0));
-          __m128 a23v   = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3 ,2));
-          __m128 a45v   = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1 ,0));
-          __m128 a67v   = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3 ,2));
+    __m128 a00v = _mm_loadu_ps(&a[j + 0]);
+    __m128 a04v = _mm_loadu_ps(&a[j + 4]);
+    __m128 a08v = _mm_loadu_ps(&a[j + 8]);
+    __m128 a12v = _mm_loadu_ps(&a[j + 12]);
+    __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
+    __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
+    __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
+    __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));

-    const __m128 wk1rv  = _mm_load_ps(&rdft_wk1r[k2]);
-    const __m128 wk1iv  = _mm_load_ps(&rdft_wk1i[k2]);
-    const __m128 wk2rv  = _mm_load_ps(&rdft_wk2r[k2]);
-    const __m128 wk2iv  = _mm_load_ps(&rdft_wk2i[k2]);
-    const __m128 wk3rv  = _mm_load_ps(&rdft_wk3r[k2]);
-    const __m128 wk3iv  = _mm_load_ps(&rdft_wk3i[k2]);
-          __m128 x0v    = _mm_add_ps(a01v, a23v);
-    const __m128 x1v    = _mm_sub_ps(a01v, a23v);
-    const __m128 x2v    = _mm_add_ps(a45v, a67v);
-    const __m128 x3v    = _mm_sub_ps(a45v, a67v);
-          __m128 x0w;
-                 a01v   = _mm_add_ps(x0v, x2v);
-                 x0v    = _mm_sub_ps(x0v, x2v);
-                 x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
+    const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
+    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
+    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
+    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
+    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
+    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
+    __m128 x0v = _mm_add_ps(a01v, a23v);
+    const __m128 x1v = _mm_sub_ps(a01v, a23v);
+    const __m128 x2v = _mm_add_ps(a45v, a67v);
+    const __m128 x3v = _mm_sub_ps(a45v, a67v);
+    __m128 x0w;
+    a01v = _mm_add_ps(x0v, x2v);
+    x0v = _mm_sub_ps(x0v, x2v);
+    x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
    {
      const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
      const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
-                   a45v   = _mm_add_ps(a45_0v, a45_1v);
+      a45v = _mm_add_ps(a45_0v, a45_1v);
    }
    {
-            __m128 a23_0v, a23_1v;
-      const __m128 x3w    = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1));
-      const __m128 x3s    = _mm_mul_ps(mm_swap_sign, x3w);
-                   x0v    = _mm_add_ps(x1v, x3s);
-                   x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
-                   a23_0v = _mm_mul_ps(wk1rv, x0v);
-                   a23_1v = _mm_mul_ps(wk1iv, x0w);
-                   a23v   = _mm_add_ps(a23_0v, a23_1v);
+      __m128 a23_0v, a23_1v;
+      const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
+      const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
+      x0v = _mm_add_ps(x1v, x3s);
+      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
+      a23_0v = _mm_mul_ps(wk1rv, x0v);
+      a23_1v = _mm_mul_ps(wk1iv, x0w);
+      a23v = _mm_add_ps(a23_0v, a23_1v);

-                   x0v    = _mm_sub_ps(x1v, x3s);
-                   x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
+      x0v = _mm_sub_ps(x1v, x3s);
+      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
    }
    {
      const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
      const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
-                   a67v   = _mm_add_ps(a67_0v, a67_1v);
+      a67v = _mm_add_ps(a67_0v, a67_1v);
    }

-                 a00v   = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1 ,0));
-                 a04v   = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1 ,0));
-                 a08v   = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3 ,2));
-                 a12v   = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3 ,2));
-    _mm_storeu_ps(&a[j +  0], a00v);
-    _mm_storeu_ps(&a[j +  4], a04v);
-    _mm_storeu_ps(&a[j +  8], a08v);
+    a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
+    a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
+    a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
+    a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
+    _mm_storeu_ps(&a[j + 0], a00v);
+    _mm_storeu_ps(&a[j + 4], a04v);
+    _mm_storeu_ps(&a[j + 8], a08v);
    _mm_storeu_ps(&a[j + 12], a12v);
  }
 }

-static void cftmdl_128_SSE2(float *a) {
+static void cftmdl_128_SSE2(float* a) {
  const int l = 8;
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
  int j0;

  __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
  for (j0 = 0; j0 < l; j0 += 2) {
-    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 +  0]);
-    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 +  8]);
+    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
+    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
-    const __m128  a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
-                                           _mm_castsi128_ps(a_32),
-                                           _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128  a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
-                                           _mm_castsi128_ps(a_40),
-                                           _MM_SHUFFLE(1, 0, 1 ,0));
-          __m128  x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
-    const __m128  x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
+    const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
+                                          _mm_castsi128_ps(a_32),
+                                          _MM_SHUFFLE(1, 0, 1, 0));
+    const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
+                                          _mm_castsi128_ps(a_40),
+                                          _MM_SHUFFLE(1, 0, 1, 0));
+    __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+    const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
-    const __m128  a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
-                                           _mm_castsi128_ps(a_48),
-                                           _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128  a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
-                                           _mm_castsi128_ps(a_56),
-                                           _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128  x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
-    const __m128  x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
+    const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
+                                          _mm_castsi128_ps(a_48),
+                                          _MM_SHUFFLE(1, 0, 1, 0));
+    const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
+                                          _mm_castsi128_ps(a_56),
+                                          _MM_SHUFFLE(1, 0, 1, 0));
+    const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
+    const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

-    const __m128  xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-    const __m128  xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);

-    const __m128  x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
-        _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
-                          _MM_SHUFFLE(2, 3, 0, 1)));
-    const __m128  x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
-    const __m128  x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
-    const __m128  x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+    const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
+        _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
+    const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+    const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+    const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

-    const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
-                                      _MM_SHUFFLE(2, 2, 2 ,2));
-    const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
-                                      _MM_SHUFFLE(3, 3, 3 ,3));
+    const __m128 yy0 =
+        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
+    const __m128 yy1 =
+        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
    const __m128 yy3 = _mm_add_ps(yy0, yy2);
    const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);

-    _mm_storel_epi64((__m128i*)&a[j0 +  0], _mm_castps_si128(xx0));
-    _mm_storel_epi64((__m128i*)&a[j0 + 32],
-                     _mm_shuffle_epi32(_mm_castps_si128(xx0),
-                                       _MM_SHUFFLE(3, 2, 3, 2)));
+    _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
+    _mm_storel_epi64(
+        (__m128i*)&a[j0 + 32],
+        _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));

    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
-    _mm_storel_epi64((__m128i*)&a[j0 + 48],
-                     _mm_shuffle_epi32(_mm_castps_si128(xx1),
-                                       _MM_SHUFFLE(2, 3, 2, 3)));
+    _mm_storel_epi64(
+        (__m128i*)&a[j0 + 48],
+        _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
    a[j0 + 48] = -a[j0 + 48];

-    _mm_storel_epi64((__m128i*)&a[j0 +  8], _mm_castps_si128(x1_x3_add));
+    _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));

    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
-    _mm_storel_epi64((__m128i*)&a[j0 + 56],
-                     _mm_shuffle_epi32(_mm_castps_si128(yy4),
-                     _MM_SHUFFLE(2, 3, 2, 3)));
+    _mm_storel_epi64(
+        (__m128i*)&a[j0 + 56],
+        _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
  }

  {
    int k = 64;
    int k1 = 2;
    int k2 = 2 * k1;
-    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]);
-    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]);
-    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]);
-    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]);
-    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]);
-                 wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]);
+    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
+    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
+    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
+    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
+    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
+    wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
    for (j0 = k; j0 < l + k; j0 += 2) {
-      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 +  0]);
-      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 +  8]);
+      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
+      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
      const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
                                            _mm_castsi128_ps(a_32),
-                                            _MM_SHUFFLE(1, 0, 1 ,0));
+                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
                                            _mm_castsi128_ps(a_40),
-                                            _MM_SHUFFLE(1, 0, 1 ,0));
-            __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+                                            _MM_SHUFFLE(1, 0, 1, 0));
+      __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
@@ -182,100 +178,102 @@ static void cftmdl_128_SSE2(float *a) {
      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
      const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
                                            _mm_castsi128_ps(a_48),
-                                            _MM_SHUFFLE(1, 0, 1 ,0));
+                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
                                            _mm_castsi128_ps(a_56),
-                                            _MM_SHUFFLE(1, 0, 1 ,0));
+                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-      const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv);
-      const __m128 xx3 = _mm_mul_ps(wk2iv,
-          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
-                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
+      const __m128 xx3 =
+          _mm_mul_ps(wk2iv,
+                     _mm_castsi128_ps(_mm_shuffle_epi32(
+                         _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx4 = _mm_add_ps(xx2, xx3);

-      const __m128  x3i0_3r0_3i1_x3r1 =  _mm_castsi128_ps(
-          _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
-                            _MM_SHUFFLE(2, 3, 0, 1)));
-      const __m128  x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
-      const __m128  x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
-      const __m128  x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+      const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
+          _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
+      const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+      const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+      const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
-      const __m128 xx11 = _mm_mul_ps(wk1iv,
+      const __m128 xx11 = _mm_mul_ps(
+          wk1iv,
          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
                                             _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx12 = _mm_add_ps(xx10, xx11);

      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
-      const __m128 xx21 = _mm_mul_ps(wk3iv,
+      const __m128 xx21 = _mm_mul_ps(
+          wk3iv,
          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
-                           _MM_SHUFFLE(2, 3, 0, 1))));
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx22 = _mm_add_ps(xx20, xx21);

-      _mm_storel_epi64((__m128i*)&a[j0 +  0], _mm_castps_si128(xx));
-      _mm_storel_epi64((__m128i*)&a[j0 + 32],
-                         _mm_shuffle_epi32(_mm_castps_si128(xx),
-                                           _MM_SHUFFLE(3, 2, 3, 2)));
+      _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 32],
+          _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
-      _mm_storel_epi64((__m128i*)&a[j0 + 48],
-                        _mm_shuffle_epi32(_mm_castps_si128(xx4),
-                                          _MM_SHUFFLE(3, 2, 3, 2)));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 48],
+          _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));

-      _mm_storel_epi64((__m128i*)&a[j0 +  8], _mm_castps_si128(xx12));
-      _mm_storel_epi64((__m128i*)&a[j0 + 40],
-                       _mm_shuffle_epi32(_mm_castps_si128(xx12),
-                                         _MM_SHUFFLE(3, 2, 3, 2)));
+      _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 40],
+          _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
-      _mm_storel_epi64((__m128i*)&a[j0 + 56],
-                       _mm_shuffle_epi32(_mm_castps_si128(xx22),
-                                         _MM_SHUFFLE(3, 2, 3, 2)));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 56],
+          _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
    }
  }
 }

-static void rftfsub_128_SSE2(float *a) {
-  const float *c = rdft_w + 32;
+static void rftfsub_128_SSE2(float* a) {
+  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

-  static const ALIGN16_BEG float ALIGN16_END k_half[4] =
-    {0.5f, 0.5f, 0.5f, 0.5f};
+  static const ALIGN16_BEG float ALIGN16_END
+      k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  // Vectorized code (four at once).
  //    Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
-    const __m128 c_j1 = _mm_loadu_ps(&c[     j1]);         //  1,  2,  3,  4,
-    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);         // 28, 29, 30, 31,
-    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);         // 28, 29, 30, 31,
+    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
+    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
+    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
    const __m128 wkr_ =
-      _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
-    const __m128 wki_ = c_j1;                              //  1,  2,  3,  4,
+        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
+    const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
    // Load and shuffle 'a'.
-    const __m128 a_j2_0 = _mm_loadu_ps(&a[0   + j2]);  //   2,   3,   4,   5,
-    const __m128 a_j2_4 = _mm_loadu_ps(&a[4   + j2]);  //   6,   7,   8,   9,
+    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
+    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
-    const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
-                            _MM_SHUFFLE(2, 0, 2 ,0));  //   2,   4,   6,   8,
-    const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
-                            _MM_SHUFFLE(3, 1, 3 ,1));  //   3,   5,   7,   9,
-    const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
-                            _MM_SHUFFLE(0, 2, 0 ,2));  // 126, 124, 122, 120,
-    const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
-                            _MM_SHUFFLE(1, 3, 1 ,3));  // 127, 125, 123, 121,
+    const __m128 a_j2_p0 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
+    const __m128 a_j2_p1 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
+    const __m128 a_k2_p0 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
+    const __m128 a_k2_p1 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
-                                               // 2-126, 4-124, 6-122, 8-120,
+    // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
-                                               // 3-127, 5-125, 7-123, 9-121,
+    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr - wki * xi;
    //    yi = wkr * xi + wki * xr;
@@ -283,12 +281,12 @@ static void rftfsub_128_SSE2(float *a) {
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
-    const __m128 yr_ = _mm_sub_ps(a_, b_);     // 2-126, 4-124, 6-122, 8-120,
-    const __m128 yi_ = _mm_add_ps(c_, d_);     // 3-127, 5-125, 7-123, 9-121,
-    // Update 'a'.
-    //    a[j2 + 0] -= yr;
-    //    a[j2 + 1] -= yi;
-    //    a[k2 + 0] += yr;
+    const __m128 yr_ = _mm_sub_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const __m128 yi_ = _mm_add_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+                                            // Update 'a'.
+                                            //    a[j2 + 0] -= yr;
+                                            //    a[j2 + 1] -= yi;
+                                            //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
@@ -296,26 +294,26 @@ static void rftfsub_128_SSE2(float *a) {
    const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
    // Shuffle in right order and store.
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
-                                                       //   2,   3,   4,   5,
+    //   2,   3,   4,   5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
-                                                       //   6,   7,   8,   9,
+    //   6,   7,   8,   9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
-                                                       // 122, 123, 120, 121,
+    // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
-                                                       // 126, 127, 124, 125,
-    const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
-                            _MM_SHUFFLE(1, 0, 3 ,2));  // 120, 121, 122, 123,
-    const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
-                            _MM_SHUFFLE(1, 0, 3 ,2));  // 124, 125, 126, 127,
-    _mm_storeu_ps(&a[0   + j2], a_j2_0n);
-    _mm_storeu_ps(&a[4   + j2], a_j2_4n);
+    // 126, 127, 124, 125,
+    const __m128 a_k2_0n = _mm_shuffle_ps(
+        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
+    const __m128 a_k2_4n = _mm_shuffle_ps(
+        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
+    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
+    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
-    k1 =  32 - j1;
+    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
@@ -329,13 +327,13 @@ static void rftfsub_128_SSE2(float *a) {
  }
 }

-static void rftbsub_128_SSE2(float *a) {
-  const float *c = rdft_w + 32;
+static void rftbsub_128_SSE2(float* a) {
+  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

-  static const ALIGN16_BEG float ALIGN16_END k_half[4] =
-    {0.5f, 0.5f, 0.5f, 0.5f};
+  static const ALIGN16_BEG float ALIGN16_END
+      k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  a[1] = -a[1];
@@ -343,30 +341,30 @@ static void rftbsub_128_SSE2(float *a) {
  //    Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
-    const __m128 c_j1 = _mm_loadu_ps(&c[     j1]);         //  1,  2,  3,  4,
-    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);         // 28, 29, 30, 31,
-    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);         // 28, 29, 30, 31,
+    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
+    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
+    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
    const __m128 wkr_ =
-      _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
-    const __m128 wki_ = c_j1;                              //  1,  2,  3,  4,
+        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
+    const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
    // Load and shuffle 'a'.
-    const __m128 a_j2_0 = _mm_loadu_ps(&a[0   + j2]);  //   2,   3,   4,   5,
-    const __m128 a_j2_4 = _mm_loadu_ps(&a[4   + j2]);  //   6,   7,   8,   9,
+    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
+    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
-    const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
-                            _MM_SHUFFLE(2, 0, 2 ,0));  //   2,   4,   6,   8,
-    const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
-                            _MM_SHUFFLE(3, 1, 3 ,1));  //   3,   5,   7,   9,
-    const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
-                            _MM_SHUFFLE(0, 2, 0 ,2));  // 126, 124, 122, 120,
-    const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
-                            _MM_SHUFFLE(1, 3, 1 ,3));  // 127, 125, 123, 121,
+    const __m128 a_j2_p0 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
+    const __m128 a_j2_p1 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
+    const __m128 a_k2_p0 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
+    const __m128 a_k2_p1 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
-                                               // 2-126, 4-124, 6-122, 8-120,
+    // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
-                                               // 3-127, 5-125, 7-123, 9-121,
+    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
@@ -374,12 +372,12 @@ static void rftbsub_128_SSE2(float *a) {
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
-    const __m128 yr_ = _mm_add_ps(a_, b_);     // 2-126, 4-124, 6-122, 8-120,
-    const __m128 yi_ = _mm_sub_ps(c_, d_);     // 3-127, 5-125, 7-123, 9-121,
-    // Update 'a'.
-    //    a[j2 + 0] = a[j2 + 0] - yr;
-    //    a[j2 + 1] = yi - a[j2 + 1];
-    //    a[k2 + 0] = yr + a[k2 + 0];
+    const __m128 yr_ = _mm_add_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const __m128 yi_ = _mm_sub_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+                                            // Update 'a'.
+                                            //    a[j2 + 0] = a[j2 + 0] - yr;
+                                            //    a[j2 + 1] = yi - a[j2 + 1];
+                                            //    a[k2 + 0] = yr + a[k2 + 0];
    //    a[k2 + 1] = yi - a[k2 + 1];
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
@@ -387,26 +385,26 @@ static void rftbsub_128_SSE2(float *a) {
    const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
    // Shuffle in right order and store.
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
-                                                       //   2,   3,   4,   5,
+    //   2,   3,   4,   5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
-                                                       //   6,   7,   8,   9,
+    //   6,   7,   8,   9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
-                                                       // 122, 123, 120, 121,
+    // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
-                                                       // 126, 127, 124, 125,
-    const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
-                            _MM_SHUFFLE(1, 0, 3 ,2));  // 120, 121, 122, 123,
-    const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
-                            _MM_SHUFFLE(1, 0, 3 ,2));  // 124, 125, 126, 127,
-    _mm_storeu_ps(&a[0   + j2], a_j2_0n);
-    _mm_storeu_ps(&a[4   + j2], a_j2_4n);
+    // 126, 127, 124, 125,
+    const __m128 a_k2_0n = _mm_shuffle_ps(
+        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
+    const __m128 a_k2_4n = _mm_shuffle_ps(
+        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
+    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
+    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
-    k1 =  32 - j1;
+    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
@@ -427,5 +425,3 @@ void aec_rdft_init_sse2(void) {
  rftfsub_128 = rftfsub_128_SSE2;
  rftbsub_128 = rftbsub_128_SSE2;
 }
-
-#endif  // WEBRTC_USE_SS2
--- a/webrtc/modules/audio_processing/aec/aec_resampler.c
+++ b/webrtc/modules/audio_processing/aec/aec_resampler.c
@@ -0,0 +1,209 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for
+ * clock skew by resampling the farend signal.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_resampler.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+enum {
+  kEstimateLengthFrames = 400
+};
+
+typedef struct {
+  float buffer[kResamplerBufferSize];
+  float position;
+
+  int deviceSampleRateHz;
+  int skewData[kEstimateLengthFrames];
+  int skewDataIndex;
+  float skewEstimate;
+} AecResampler;
+
+static int EstimateSkew(const int* rawSkew,
+                        int size,
+                        int absLimit,
+                        float* skewEst);
+
+void* WebRtcAec_CreateResampler() {
+  return malloc(sizeof(AecResampler));
+}
+
+int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz) {
+  AecResampler* obj = (AecResampler*)resampInst;
+  memset(obj->buffer, 0, sizeof(obj->buffer));
+  obj->position = 0.0;
+
+  obj->deviceSampleRateHz = deviceSampleRateHz;
+  memset(obj->skewData, 0, sizeof(obj->skewData));
+  obj->skewDataIndex = 0;
+  obj->skewEstimate = 0.0;
+
+  return 0;
+}
+
+void WebRtcAec_FreeResampler(void* resampInst) {
+  AecResampler* obj = (AecResampler*)resampInst;
+  free(obj);
+}
+
+void WebRtcAec_ResampleLinear(void* resampInst,
+                              const float* inspeech,
+                              size_t size,
+                              float skew,
+                              float* outspeech,
+                              size_t* size_out) {
+  AecResampler* obj = (AecResampler*)resampInst;
+
+  float* y;
+  float be, tnew;
+  size_t tn, mm;
+
+  assert(size <= 2 * FRAME_LEN);
+  assert(resampInst != NULL);
+  assert(inspeech != NULL);
+  assert(outspeech != NULL);
+  assert(size_out != NULL);
+
+  // Add new frame data in lookahead
+  memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
+         inspeech,
+         size * sizeof(inspeech[0]));
+
+  // Sample rate ratio
+  be = 1 + skew;
+
+  // Loop over input frame
+  mm = 0;
+  y = &obj->buffer[FRAME_LEN];  // Point at current frame
+
+  tnew = be * mm + obj->position;
+  tn = (size_t)tnew;
+
+  while (tn < size) {
+
+    // Interpolation
+    outspeech[mm] = y[tn] + (tnew - tn) * (y[tn + 1] - y[tn]);
+    mm++;
+
+    tnew = be * mm + obj->position;
+    tn = (int)tnew;
+  }
+
+  *size_out = mm;
+  obj->position += (*size_out) * be - size;
+
+  // Shift buffer
+  memmove(obj->buffer,
+          &obj->buffer[size],
+          (kResamplerBufferSize - size) * sizeof(obj->buffer[0]));
+}
+
+int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst) {
+  AecResampler* obj = (AecResampler*)resampInst;
+  int err = 0;
+
+  if (obj->skewDataIndex < kEstimateLengthFrames) {
+    obj->skewData[obj->skewDataIndex] = rawSkew;
+    obj->skewDataIndex++;
+  } else if (obj->skewDataIndex == kEstimateLengthFrames) {
+    err = EstimateSkew(
+        obj->skewData, kEstimateLengthFrames, obj->deviceSampleRateHz, skewEst);
+    obj->skewEstimate = *skewEst;
+    obj->skewDataIndex++;
+  } else {
+    *skewEst = obj->skewEstimate;
+  }
+
+  return err;
+}
+
+int EstimateSkew(const int* rawSkew,
+                 int size,
+                 int deviceSampleRateHz,
+                 float* skewEst) {
+  const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
+  const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
+  int i = 0;
+  int n = 0;
+  float rawAvg = 0;
+  float err = 0;
+  float rawAbsDev = 0;
+  int upperLimit = 0;
+  int lowerLimit = 0;
+  float cumSum = 0;
+  float x = 0;
+  float x2 = 0;
+  float y = 0;
+  float xy = 0;
+  float xAvg = 0;
+  float denom = 0;
+  float skew = 0;
+
+  *skewEst = 0;  // Set in case of error below.
+  for (i = 0; i < size; i++) {
+    if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
+      n++;
+      rawAvg += rawSkew[i];
+    }
+  }
+
+  if (n == 0) {
+    return -1;
+  }
+  assert(n > 0);
+  rawAvg /= n;
+
+  for (i = 0; i < size; i++) {
+    if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
+      err = rawSkew[i] - rawAvg;
+      rawAbsDev += err >= 0 ? err : -err;
+    }
+  }
+  assert(n > 0);
+  rawAbsDev /= n;
+  upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1);  // +1 for ceiling.
+  lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1);  // -1 for floor.
+
+  n = 0;
+  for (i = 0; i < size; i++) {
+    if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
+        (rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
+      n++;
+      cumSum += rawSkew[i];
+      x += n;
+      x2 += n * n;
+      y += cumSum;
+      xy += n * cumSum;
+    }
+  }
+
+  if (n == 0) {
+    return -1;
+  }
+  assert(n > 0);
+  xAvg = x / n;
+  denom = x2 - xAvg * x;
+
+  if (denom != 0) {
+    skew = (xy - xAvg * y) / denom;
+  }
+
+  *skewEst = skew;
+  return 0;
+}
--- a/webrtc/modules/audio_processing/aec/aec_resampler.h
+++ b/webrtc/modules/audio_processing/aec/aec_resampler.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
+
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+enum {
+  kResamplingDelay = 1
+};
+enum {
+  kResamplerBufferSize = FRAME_LEN * 4
+};
+
+// Unless otherwise specified, functions return 0 on success and -1 on error.
+void* WebRtcAec_CreateResampler();  // Returns NULL on error.
+int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz);
+void WebRtcAec_FreeResampler(void* resampInst);
+
+// Estimates skew from raw measurement.
+int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst);
+
+// Resamples input using linear interpolation.
+void WebRtcAec_ResampleLinear(void* resampInst,
+                              const float* inspeech,
+                              size_t size,
+                              float skew,
+                              float* outspeech,
+                              size_t* size_out);
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
--- a/webrtc/modules/audio_processing/aec/echo_cancellation.c
+++ b/webrtc/modules/audio_processing/aec/echo_cancellation.c
--- a/webrtc/modules/audio_processing/aec/echo_cancellation_internal.h
+++ b/webrtc/modules/audio_processing/aec/echo_cancellation_internal.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
+
+#include "webrtc/common_audio/ring_buffer.h"
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+typedef struct {
+  int delayCtr;
+  int sampFreq;
+  int splitSampFreq;
+  int scSampFreq;
+  float sampFactor;  // scSampRate / sampFreq
+  short skewMode;
+  int bufSizeStart;
+  int knownDelay;
+  int rate_factor;
+
+  short initFlag;  // indicates if AEC has been initialized
+
+  // Variables used for averaging far end buffer size
+  short counter;
+  int sum;
+  short firstVal;
+  short checkBufSizeCtr;
+
+  // Variables used for delay shifts
+  short msInSndCardBuf;
+  short filtDelay;  // Filtered delay estimate.
+  int timeForDelayChange;
+  int startup_phase;
+  int checkBuffSize;
+  short lastDelayDiff;
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  FILE* bufFile;
+  FILE* delayFile;
+  FILE* skewFile;
+#endif
+
+  // Structures
+  void* resampler;
+
+  int skewFrCtr;
+  int resample;  // if the skew is small enough we don't resample
+  int highSkewCtr;
+  float skew;
+
+  RingBuffer* far_pre_buf;  // Time domain far-end pre-buffer.
+
+  int lastError;
+
+  int farend_started;
+
+  AecCore* aec;
+} Aec;
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
--- a/webrtc/modules/audio_processing/aec/include/echo_cancellation.h
+++ b/webrtc/modules/audio_processing/aec/include/echo_cancellation.h
@@ -0,0 +1,245 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
+
+#include <stddef.h>
+
+#include "webrtc/typedefs.h"
+
+// Errors
+#define AEC_UNSPECIFIED_ERROR 12000
+#define AEC_UNSUPPORTED_FUNCTION_ERROR 12001
+#define AEC_UNINITIALIZED_ERROR 12002
+#define AEC_NULL_POINTER_ERROR 12003
+#define AEC_BAD_PARAMETER_ERROR 12004
+
+// Warnings
+#define AEC_BAD_PARAMETER_WARNING 12050
+
+enum {
+  kAecNlpConservative = 0,
+  kAecNlpModerate,
+  kAecNlpAggressive
+};
+
+enum {
+  kAecFalse = 0,
+  kAecTrue
+};
+
+typedef struct {
+  int16_t nlpMode;      // default kAecNlpModerate
+  int16_t skewMode;     // default kAecFalse
+  int16_t metricsMode;  // default kAecFalse
+  int delay_logging;    // default kAecFalse
+  // float realSkew;
+} AecConfig;
+
+typedef struct {
+  int instant;
+  int average;
+  int max;
+  int min;
+} AecLevel;
+
+typedef struct {
+  AecLevel rerl;
+  AecLevel erl;
+  AecLevel erle;
+  AecLevel aNlp;
+} AecMetrics;
+
+struct AecCore;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Allocates the memory needed by the AEC. The memory needs to be initialized
+ * separately using the WebRtcAec_Init() function. Returns a pointer to the
+ * object or NULL on error.
+ */
+void* WebRtcAec_Create();
+
+/*
+ * This function releases the memory allocated by WebRtcAec_Create().
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*        aecInst         Pointer to the AEC instance
+ */
+void WebRtcAec_Free(void* aecInst);
+
+/*
+ * Initializes an AEC instance.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          aecInst       Pointer to the AEC instance
+ * int32_t        sampFreq      Sampling frequency of data
+ * int32_t        scSampFreq    Soundcard sampling frequency
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int32_t        return        0: OK
+ *                             -1: error
+ */
+int32_t WebRtcAec_Init(void* aecInst, int32_t sampFreq, int32_t scSampFreq);
+
+/*
+ * Inserts an 80 or 160 sample block of data into the farend buffer.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          aecInst       Pointer to the AEC instance
+ * const float*   farend        In buffer containing one frame of
+ *                              farend signal for L band
+ * int16_t        nrOfSamples   Number of samples in farend buffer
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int32_t        return        0: OK
+ *                             -1: error
+ */
+int32_t WebRtcAec_BufferFarend(void* aecInst,
+                               const float* farend,
+                               size_t nrOfSamples);
+
+/*
+ * Runs the echo canceller on an 80 or 160 sample blocks of data.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*         aecInst        Pointer to the AEC instance
+ * float* const* nearend        In buffer containing one frame of
+ *                              nearend+echo signal for each band
+ * int           num_bands      Number of bands in nearend buffer
+ * int16_t       nrOfSamples    Number of samples in nearend buffer
+ * int16_t       msInSndCardBuf Delay estimate for sound card and
+ *                              system buffers
+ * int16_t       skew           Difference between number of samples played
+ *                              and recorded at the soundcard (for clock skew
+ *                              compensation)
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * float* const* out            Out buffer, one frame of processed nearend
+ *                              for each band
+ * int32_t       return         0: OK
+ *                             -1: error
+ */
+int32_t WebRtcAec_Process(void* aecInst,
+                          const float* const* nearend,
+                          size_t num_bands,
+                          float* const* out,
+                          size_t nrOfSamples,
+                          int16_t msInSndCardBuf,
+                          int32_t skew);
+
+/*
+ * This function enables the user to set certain parameters on-the-fly.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          handle        Pointer to the AEC instance
+ * AecConfig      config        Config instance that contains all
+ *                              properties to be set
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int            return         0: OK
+ *                              -1: error
+ */
+int WebRtcAec_set_config(void* handle, AecConfig config);
+
+/*
+ * Gets the current echo status of the nearend signal.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          handle        Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int*           status        0: Almost certainly nearend single-talk
+ *                              1: Might not be neared single-talk
+ * int            return         0: OK
+ *                              -1: error
+ */
+int WebRtcAec_get_echo_status(void* handle, int* status);
+
+/*
+ * Gets the current echo metrics for the session.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          handle        Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * AecMetrics*    metrics       Struct which will be filled out with the
+ *                              current echo metrics.
+ * int            return         0: OK
+ *                              -1: error
+ */
+int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics);
+
+/*
+ * Gets the current delay metrics for the session.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*   handle               Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int*    median               Delay median value.
+ * int*    std                  Delay standard deviation.
+ * float*  fraction_poor_delays Fraction of the delay estimates that may
+ *                              cause the AEC to perform poorly.
+ *
+ * int     return                0: OK
+ *                              -1: error
+ */
+int WebRtcAec_GetDelayMetrics(void* handle,
+                              int* median,
+                              int* std,
+                              float* fraction_poor_delays);
+
+/*
+ * Gets the last error code.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          aecInst       Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int32_t        return        11000-11100: error code
+ */
+int32_t WebRtcAec_get_error_code(void* aecInst);
+
+// Returns a pointer to the low level AEC handle.
+//
+// Input:
+//  - handle                    : Pointer to the AEC instance.
+//
+// Return value:
+//  - AecCore pointer           : NULL for error.
+//
+struct AecCore* WebRtcAec_aec_core(void* handle);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
--- a/webrtc/modules/audio_processing/aec/interface/echo_cancellation.h
+++ b/webrtc/modules/audio_processing/aec/interface/echo_cancellation.h
@@ -1,278 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_
-#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_
-
-#include "typedefs.h"
-
-// Errors
-#define AEC_UNSPECIFIED_ERROR           12000
-#define AEC_UNSUPPORTED_FUNCTION_ERROR  12001
-#define AEC_UNINITIALIZED_ERROR         12002
-#define AEC_NULL_POINTER_ERROR          12003
-#define AEC_BAD_PARAMETER_ERROR         12004
-
-// Warnings
-#define AEC_BAD_PARAMETER_WARNING       12050
-
-enum {
-    kAecNlpConservative = 0,
-    kAecNlpModerate,
-    kAecNlpAggressive
-};
-
-enum {
-    kAecFalse = 0,
-    kAecTrue
-};
-
-typedef struct {
-    WebRtc_Word16 nlpMode;        // default kAecNlpModerate
-    WebRtc_Word16 skewMode;       // default kAecFalse
-    WebRtc_Word16 metricsMode;    // default kAecFalse
-    int delay_logging;            // default kAecFalse
-    //float realSkew;
-} AecConfig;
-
-typedef struct {
-    WebRtc_Word16 instant;
-    WebRtc_Word16 average;
-    WebRtc_Word16 max;
-    WebRtc_Word16 min;
-} AecLevel;
-
-typedef struct {
-    AecLevel rerl;
-    AecLevel erl;
-    AecLevel erle;
-    AecLevel aNlp;
-} AecMetrics;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Allocates the memory needed by the AEC. The memory needs to be initialized
- * separately using the WebRtcAec_Init() function.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void **aecInst               Pointer to the AEC instance to be created
- *                              and initialized
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * WebRtc_Word32 return          0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_Create(void **aecInst);
-
-/*
- * This function releases the memory allocated by WebRtcAec_Create().
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void         *aecInst        Pointer to the AEC instance
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * WebRtc_Word32  return         0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_Free(void *aecInst);
-
-/*
- * Initializes an AEC instance.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void           *aecInst      Pointer to the AEC instance
- * WebRtc_Word32  sampFreq      Sampling frequency of data
- * WebRtc_Word32  scSampFreq    Soundcard sampling frequency
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * WebRtc_Word32 return          0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_Init(void *aecInst,
-                             WebRtc_Word32 sampFreq,
-                             WebRtc_Word32 scSampFreq);
-
-/*
- * Inserts an 80 or 160 sample block of data into the farend buffer.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void           *aecInst      Pointer to the AEC instance
- * WebRtc_Word16  *farend       In buffer containing one frame of
- *                              farend signal for L band
- * WebRtc_Word16  nrOfSamples   Number of samples in farend buffer
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * WebRtc_Word32  return         0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_BufferFarend(void *aecInst,
-                                     const WebRtc_Word16 *farend,
-                                     WebRtc_Word16 nrOfSamples);
-
-/*
- * Runs the echo canceller on an 80 or 160 sample blocks of data.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void          *aecInst       Pointer to the AEC instance
- * WebRtc_Word16 *nearend       In buffer containing one frame of
- *                              nearend+echo signal for L band
- * WebRtc_Word16 *nearendH      In buffer containing one frame of
- *                              nearend+echo signal for H band
- * WebRtc_Word16 nrOfSamples    Number of samples in nearend buffer
- * WebRtc_Word16 msInSndCardBuf Delay estimate for sound card and
- *                              system buffers
- * WebRtc_Word16 skew           Difference between number of samples played
- *                              and recorded at the soundcard (for clock skew
- *                              compensation)
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * WebRtc_Word16  *out          Out buffer, one frame of processed nearend
- *                              for L band
- * WebRtc_Word16  *outH         Out buffer, one frame of processed nearend
- *                              for H band
- * WebRtc_Word32  return         0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_Process(void *aecInst,
-                                const WebRtc_Word16 *nearend,
-                                const WebRtc_Word16 *nearendH,
-                                WebRtc_Word16 *out,
-                                WebRtc_Word16 *outH,
-                                WebRtc_Word16 nrOfSamples,
-                                WebRtc_Word16 msInSndCardBuf,
-                                WebRtc_Word32 skew);
-
-/*
- * This function enables the user to set certain parameters on-the-fly.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void           *aecInst      Pointer to the AEC instance
- * AecConfig      config        Config instance that contains all
- *                              properties to be set
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * WebRtc_Word32  return         0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_set_config(void *aecInst, AecConfig config);
-
-/*
- * Gets the on-the-fly paramters.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void           *aecInst      Pointer to the AEC instance
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * AecConfig      *config       Pointer to the config instance that
- *                              all properties will be written to
- * WebRtc_Word32  return         0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_get_config(void *aecInst, AecConfig *config);
-
-/*
- * Gets the current echo status of the nearend signal.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void           *aecInst      Pointer to the AEC instance
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * WebRtc_Word16  *status       0: Almost certainly nearend single-talk
- *                              1: Might not be neared single-talk
- * WebRtc_Word32  return         0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_get_echo_status(void *aecInst, WebRtc_Word16 *status);
-
-/*
- * Gets the current echo metrics for the session.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void           *aecInst      Pointer to the AEC instance
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * AecMetrics     *metrics      Struct which will be filled out with the
- *                              current echo metrics.
- * WebRtc_Word32  return         0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_GetMetrics(void *aecInst, AecMetrics *metrics);
-
-/*
- * Gets the current delay metrics for the session.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void*      handle            Pointer to the AEC instance
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * int*       median            Delay median value.
- * int*       std               Delay standard deviation.
- *
- * int        return             0: OK
- *                              -1: error
- */
-int WebRtcAec_GetDelayMetrics(void* handle, int* median, int* std);
-
-/*
- * Gets the last error code.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * void           *aecInst      Pointer to the AEC instance
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * WebRtc_Word32  return        11000-11100: error code
- */
-WebRtc_Word32 WebRtcAec_get_error_code(void *aecInst);
-
-/*
- * Gets a version string.
- *
- * Inputs                       Description
- * -------------------------------------------------------------------
- * char           *versionStr   Pointer to a string array
- * WebRtc_Word16  len           The maximum length of the string
- *
- * Outputs                      Description
- * -------------------------------------------------------------------
- * WebRtc_Word8   *versionStr   Pointer to a string array
- * WebRtc_Word32  return         0: OK
- *                              -1: error
- */
-WebRtc_Word32 WebRtcAec_get_version(WebRtc_Word8 *versionStr, WebRtc_Word16 len);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  /* WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_ */
--- a/webrtc/modules/audio_processing/aec/resampler.c
+++ b/webrtc/modules/audio_processing/aec/resampler.c
@@ -1,233 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for clock
- * skew by resampling the farend signal.
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-
-#include "resampler.h"
-#include "aec_core.h"
-
-enum { kFrameBufferSize = FRAME_LEN * 4 };
-enum { kEstimateLengthFrames = 400 };
-
-typedef struct {
-    short buffer[kFrameBufferSize];
-    float position;
-
-    int deviceSampleRateHz;
-    int skewData[kEstimateLengthFrames];
-    int skewDataIndex;
-    float skewEstimate;
-} resampler_t;
-
-static int EstimateSkew(const int* rawSkew,
-                        int size,
-                        int absLimit,
-                        float *skewEst);
-
-int WebRtcAec_CreateResampler(void **resampInst)
-{
-    resampler_t *obj = malloc(sizeof(resampler_t));
-    *resampInst = obj;
-    if (obj == NULL) {
-        return -1;
-    }
-
-    return 0;
-}
-
-int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz)
-{
-    resampler_t *obj = (resampler_t*) resampInst;
-    memset(obj->buffer, 0, sizeof(obj->buffer));
-    obj->position = 0.0;
-
-    obj->deviceSampleRateHz = deviceSampleRateHz;
-    memset(obj->skewData, 0, sizeof(obj->skewData));
-    obj->skewDataIndex = 0;
-    obj->skewEstimate = 0.0;
-
-    return 0;
-}
-
-int WebRtcAec_FreeResampler(void *resampInst)
-{
-    resampler_t *obj = (resampler_t*) resampInst;
-    free(obj);
-
-    return 0;
-}
-
-int WebRtcAec_ResampleLinear(void *resampInst,
-                             const short *inspeech,
-                             int size,
-                             float skew,
-                             short *outspeech)
-{
-    resampler_t *obj = (resampler_t*) resampInst;
-
-    short *y;
-    float be, tnew, interp;
-    int tn, outsize, mm;
-
-    if (size < 0 || size > 2 * FRAME_LEN) {
-        return -1;
-    }
-
-    // Add new frame data in lookahead
-    memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
-           inspeech,
-           size * sizeof(short));
-
-    // Sample rate ratio
-    be = 1 + skew;
-
-    // Loop over input frame
-    mm = 0;
-    y = &obj->buffer[FRAME_LEN]; // Point at current frame
-
-    tnew = be * mm + obj->position;
-    tn = (int) tnew;
-
-    while (tn < size) {
-
-        // Interpolation
-        interp = y[tn] + (tnew - tn) * (y[tn+1] - y[tn]);
-
-        if (interp > 32767) {
-            interp = 32767;
-        }
-        else if (interp < -32768) {
-            interp = -32768;
-        }
-
-        outspeech[mm] = (short) interp;
-        mm++;
-
-        tnew = be * mm + obj->position;
-        tn = (int) tnew;
-    }
-
-    outsize = mm;
-    obj->position += outsize * be - size;
-
-    // Shift buffer
-    memmove(obj->buffer,
-            &obj->buffer[size],
-            (kFrameBufferSize - size) * sizeof(short));
-
-    return outsize;
-}
-
-int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst)
-{
-    resampler_t *obj = (resampler_t*)resampInst;
-    int err = 0;
-
-    if (obj->skewDataIndex < kEstimateLengthFrames) {
-        obj->skewData[obj->skewDataIndex] = rawSkew;
-        obj->skewDataIndex++;
-    }
-    else if (obj->skewDataIndex == kEstimateLengthFrames) {
-        err = EstimateSkew(obj->skewData,
-                           kEstimateLengthFrames,
-                           obj->deviceSampleRateHz,
-                           skewEst);
-        obj->skewEstimate = *skewEst;
-        obj->skewDataIndex++;
-    }
-    else {
-        *skewEst = obj->skewEstimate;
-    }
-
-    return err;
-}
-
-int EstimateSkew(const int* rawSkew,
-                 int size,
-                 int deviceSampleRateHz,
-                 float *skewEst)
-{
-    const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
-    const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
-    int i = 0;
-    int n = 0;
-    float rawAvg = 0;
-    float err = 0;
-    float rawAbsDev = 0;
-    int upperLimit = 0;
-    int lowerLimit = 0;
-    float cumSum = 0;
-    float x = 0;
-    float x2 = 0;
-    float y = 0;
-    float xy = 0;
-    float xAvg = 0;
-    float denom = 0;
-    float skew = 0;
-
-    *skewEst = 0; // Set in case of error below.
-    for (i = 0; i < size; i++) {
-      if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
-        n++;
-        rawAvg += rawSkew[i];
-      }
-    }
-
-    if (n == 0) {
-      return -1;
-    }
-    assert(n > 0);
-    rawAvg /= n;
-
-    for (i = 0; i < size; i++) {
-      if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
-        err = rawSkew[i] - rawAvg;
-        rawAbsDev += err >= 0 ? err : -err;
-      }
-    }
-    assert(n > 0);
-    rawAbsDev /= n;
-    upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1); // +1 for ceiling.
-    lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1); // -1 for floor.
-
-    n = 0;
-    for (i = 0; i < size; i++) {
-        if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
-            (rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
-            n++;
-            cumSum += rawSkew[i];
-            x += n;
-            x2 += n*n;
-            y += cumSum;
-            xy += n * cumSum;
-        }
-    }
-
-    if (n == 0) {
-      return -1;
-    }
-    assert(n > 0);
-    xAvg = x / n;
-    denom = x2 - xAvg*x;
-
-    if (denom != 0) {
-        skew = (xy - xAvg*y) / denom;
-    }
-
-    *skewEst = skew;
-    return 0;
-}
--- a/webrtc/modules/audio_processing/aec/resampler.h
+++ b/webrtc/modules/audio_processing/aec/resampler.h
@@ -1,32 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_
-#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_
-
-enum { kResamplingDelay = 1 };
-
-// Unless otherwise specified, functions return 0 on success and -1 on error
-int WebRtcAec_CreateResampler(void **resampInst);
-int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz);
-int WebRtcAec_FreeResampler(void *resampInst);
-
-// Estimates skew from raw measurement.
-int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst);
-
-// Resamples input using linear interpolation.
-// Returns size of resampled array.
-int WebRtcAec_ResampleLinear(void *resampInst,
-                             const short *inspeech,
-                             int size,
-                             float skew,
-                             short *outspeech);
-
-#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_