Update common_audio

Corresponds to upstream commit 524e9b043e7e86fd72353b987c9d5f6a1ebf83e1 Update notes: * Moved src/ to webrtc/ to easily diff against the third_party/webrtc in the chromium tree * ARM/NEON/MIPS support is not yet hooked up * Tests have not been copied
2015-10-13 12:16:16 +05:30
parent 9413986e79
commit c4fb4e38de
230 changed files with 11201 additions and 8656 deletions
--- a/webrtc/modules/audio_processing/aec/Makefile.am
+++ b/webrtc/modules/audio_processing/aec/Makefile.am
@@ -0,0 +1,16 @@
+noinst_LTLIBRARIES = libaec.la
+
+libaec_la_SOURCES = interface/echo_cancellation.h \
+		    echo_cancellation.c \
+		    aec_core.h \
+		    aec_core.c \
+		    aec_core_sse2.c \
+		    aec_rdft.h \
+		    aec_rdft.c \
+		    aec_rdft_sse2.c \
+		    resampler.h \
+		    resampler.c
+libaec_la_CFLAGS = $(AM_CFLAGS) $(COMMON_CFLAGS) \
+		   -I$(top_srcdir)/src/common_audio/signal_processing_library/main/interface \
+		   -I$(top_srcdir)/src/system_wrappers/interface \
+		   -I$(top_srcdir)/src/modules/audio_processing/utility
--- a/webrtc/modules/audio_processing/aec/aec.gypi
+++ b/webrtc/modules/audio_processing/aec/aec.gypi
@@ -0,0 +1,40 @@
+# Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+  'targets': [
+    {
+      'target_name': 'aec',
+      'type': '<(library)',
+      'dependencies': [
+        '<(webrtc_root)/common_audio/common_audio.gyp:spl',
+        'apm_util'
+      ],
+      'include_dirs': [
+        'interface',
+      ],
+      'direct_dependent_settings': {
+        'include_dirs': [
+          'interface',
+        ],
+      },
+      'sources': [
+        'interface/echo_cancellation.h',
+        'echo_cancellation.c',
+        'aec_core.h',
+        'aec_core.c',
+        'aec_core_sse2.c',
+        'aec_rdft.h',
+        'aec_rdft.c',
+        'aec_rdft_sse2.c',
+        'resampler.h',
+        'resampler.c',
+      ],
+    },
+  ],
+}
--- a/webrtc/modules/audio_processing/aec/aec_core.c
+++ b/webrtc/modules/audio_processing/aec/aec_core.c
--- a/webrtc/modules/audio_processing/aec/aec_core.h
+++ b/webrtc/modules/audio_processing/aec/aec_core.h
@@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Specifies the interface for the AEC core.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
+
+#include <stdio.h>
+
+#include "signal_processing_library.h"
+#include "typedefs.h"
+
+//#define AEC_DEBUG // for recording files
+
+#define FRAME_LEN 80
+#define PART_LEN 64 // Length of partition
+#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
+#define PART_LEN2 (PART_LEN * 2) // Length of partition * 2
+#define NR_PART 12 // Number of partitions
+#define FILT_LEN (PART_LEN * NR_PART) // Filter length
+#define FILT_LEN2 (FILT_LEN * 2) // Double filter length
+#define FAR_BUF_LEN (FILT_LEN2 * 2)
+#define PREF_BAND_SIZE 24
+
+#define BLOCKL_MAX FRAME_LEN
+// Maximum delay in fixed point delay estimator, used for logging
+enum {kMaxDelay = 100};
+
+typedef float complex_t[2];
+// For performance reasons, some arrays of complex numbers are replaced by twice
+// as long arrays of float, all the real parts followed by all the imaginary
+// ones (complex_t[SIZE] -> float[2][SIZE]). This allows SIMD optimizations and
+// is better than two arrays (one for the real parts and one for the imaginary
+// parts) as this other way would require two pointers instead of one and cause
+// extra register spilling. This also allows the offsets to be calculated at
+// compile time.
+
+// Metrics
+enum {offsetLevel = -100};
+
+typedef struct {
+    float sfrsum;
+    int sfrcounter;
+    float framelevel;
+    float frsum;
+    int frcounter;
+    float minlevel;
+    float averagelevel;
+} power_level_t;
+
+typedef struct {
+    float instant;
+    float average;
+    float min;
+    float max;
+    float sum;
+    float hisum;
+    float himean;
+    int counter;
+    int hicounter;
+} stats_t;
+
+typedef struct {
+    int farBufWritePos, farBufReadPos;
+
+    int knownDelay;
+    int inSamples, outSamples;
+    int delayEstCtr;
+
+    void *farFrBuf, *nearFrBuf, *outFrBuf;
+
+    void *nearFrBufH;
+    void *outFrBufH;
+
+    float xBuf[PART_LEN2]; // farend
+    float dBuf[PART_LEN2]; // nearend
+    float eBuf[PART_LEN2]; // error
+
+    float dBufH[PART_LEN2]; // nearend
+
+    float xPow[PART_LEN1];
+    float dPow[PART_LEN1];
+    float dMinPow[PART_LEN1];
+    float dInitMinPow[PART_LEN1];
+    float *noisePow;
+
+    float xfBuf[2][NR_PART * PART_LEN1]; // farend fft buffer
+    float wfBuf[2][NR_PART * PART_LEN1]; // filter fft
+    complex_t sde[PART_LEN1]; // cross-psd of nearend and error
+    complex_t sxd[PART_LEN1]; // cross-psd of farend and nearend
+    complex_t xfwBuf[NR_PART * PART_LEN1]; // farend windowed fft buffer
+
+    float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1]; // far, near and error psd
+    float hNs[PART_LEN1];
+    float hNlFbMin, hNlFbLocalMin;
+    float hNlXdAvgMin;
+    int hNlNewMin, hNlMinCtr;
+    float overDrive, overDriveSm;
+    float targetSupp, minOverDrive;
+    float outBuf[PART_LEN];
+    int delayIdx;
+
+    short stNearState, echoState;
+    short divergeState;
+
+    int xfBufBlockPos;
+
+    short farBuf[FILT_LEN2 * 2];
+
+    short mult; // sampling frequency multiple
+    int sampFreq;
+    WebRtc_UWord32 seed;
+
+    float mu; // stepsize
+    float errThresh; // error threshold
+
+    int noiseEstCtr;
+
+    power_level_t farlevel;
+    power_level_t nearlevel;
+    power_level_t linoutlevel;
+    power_level_t nlpoutlevel;
+
+    int metricsMode;
+    int stateCounter;
+    stats_t erl;
+    stats_t erle;
+    stats_t aNlp;
+    stats_t rerl;
+
+    // Quantities to control H band scaling for SWB input
+    int freq_avg_ic;         //initial bin for averaging nlp gain
+    int flag_Hband_cn;      //for comfort noise
+    float cn_scale_Hband;   //scale for comfort noise in H band
+
+    int delay_histogram[kMaxDelay];
+    int delay_logging_enabled;
+    void* delay_estimator;
+
+#ifdef AEC_DEBUG
+    FILE *farFile;
+    FILE *nearFile;
+    FILE *outFile;
+    FILE *outLpFile;
+#endif
+} aec_t;
+
+typedef void (*WebRtcAec_FilterFar_t)(aec_t *aec, float yf[2][PART_LEN1]);
+extern WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
+typedef void (*WebRtcAec_ScaleErrorSignal_t)(aec_t *aec, float ef[2][PART_LEN1]);
+extern WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
+typedef void (*WebRtcAec_FilterAdaptation_t)
+  (aec_t *aec, float *fft, float ef[2][PART_LEN1]);
+extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
+typedef void (*WebRtcAec_OverdriveAndSuppress_t)
+  (aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]);
+extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
+
+int WebRtcAec_CreateAec(aec_t **aec);
+int WebRtcAec_FreeAec(aec_t *aec);
+int WebRtcAec_InitAec(aec_t *aec, int sampFreq);
+void WebRtcAec_InitAec_SSE2(void);
+
+void WebRtcAec_InitMetrics(aec_t *aec);
+void WebRtcAec_ProcessFrame(aec_t *aec, const short *farend,
+                       const short *nearend, const short *nearendH,
+                       short *out, short *outH,
+                       int knownDelay);
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
+
--- a/webrtc/modules/audio_processing/aec/aec_core_sse2.c
+++ b/webrtc/modules/audio_processing/aec/aec_core_sse2.c
@@ -0,0 +1,417 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The core AEC algorithm, SSE2 version of speed-critical functions.
+ */
+
+#include "typedefs.h"
+
+#if defined(WEBRTC_USE_SSE2)
+#include <emmintrin.h>
+#include <math.h>
+
+#include "aec_core.h"
+#include "aec_rdft.h"
+
+__inline static float MulRe(float aRe, float aIm, float bRe, float bIm)
+{
+  return aRe * bRe - aIm * bIm;
+}
+
+__inline static float MulIm(float aRe, float aIm, float bRe, float bIm)
+{
+  return aRe * bIm + aIm * bRe;
+}
+
+static void FilterFarSSE2(aec_t *aec, float yf[2][PART_LEN1])
+{
+  int i;
+  for (i = 0; i < NR_PART; i++) {
+    int j;
+    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
+    int pos = i * PART_LEN1;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= NR_PART) {
+      xPos -= NR_PART*(PART_LEN1);
+    }
+
+    // vectorized code (four at once)
+    for (j = 0; j + 3 < PART_LEN1; j += 4) {
+      const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
+      const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
+      const __m128 wfBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
+      const __m128 wfBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
+      const __m128 yf_re = _mm_loadu_ps(&yf[0][j]);
+      const __m128 yf_im = _mm_loadu_ps(&yf[1][j]);
+      const __m128 a = _mm_mul_ps(xfBuf_re, wfBuf_re);
+      const __m128 b = _mm_mul_ps(xfBuf_im, wfBuf_im);
+      const __m128 c = _mm_mul_ps(xfBuf_re, wfBuf_im);
+      const __m128 d = _mm_mul_ps(xfBuf_im, wfBuf_re);
+      const __m128 e = _mm_sub_ps(a, b);
+      const __m128 f = _mm_add_ps(c, d);
+      const __m128 g = _mm_add_ps(yf_re, e);
+      const __m128 h = _mm_add_ps(yf_im, f);
+      _mm_storeu_ps(&yf[0][j], g);
+      _mm_storeu_ps(&yf[1][j], h);
+    }
+    // scalar code for the remaining items.
+    for (; j < PART_LEN1; j++) {
+      yf[0][j] += MulRe(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j],
+                        aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]);
+      yf[1][j] += MulIm(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j],
+                        aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]);
+    }
+  }
+}
+
+static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
+{
+  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
+  const __m128 kThresh = _mm_set1_ps(aec->errThresh);
+  const __m128 kMu = _mm_set1_ps(aec->mu);
+
+  int i;
+  // vectorized code (four at once)
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {
+    const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
+    const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
+    const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);
+
+    const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
+    __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
+    __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
+    const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
+    const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
+    const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
+    const __m128 absEf = _mm_sqrt_ps(ef_sum2);
+    const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
+    __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
+    const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
+    __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
+    __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
+    ef_re_if = _mm_and_ps(bigger, ef_re_if);
+    ef_im_if = _mm_and_ps(bigger, ef_im_if);
+    ef_re = _mm_andnot_ps(bigger, ef_re);
+    ef_im = _mm_andnot_ps(bigger, ef_im);
+    ef_re = _mm_or_ps(ef_re, ef_re_if);
+    ef_im = _mm_or_ps(ef_im, ef_im_if);
+    ef_re = _mm_mul_ps(ef_re, kMu);
+    ef_im = _mm_mul_ps(ef_im, kMu);
+
+    _mm_storeu_ps(&ef[0][i], ef_re);
+    _mm_storeu_ps(&ef[1][i], ef_im);
+  }
+  // scalar code for the remaining items.
+  for (; i < (PART_LEN1); i++) {
+    float absEf;
+    ef[0][i] /= (aec->xPow[i] + 1e-10f);
+    ef[1][i] /= (aec->xPow[i] + 1e-10f);
+    absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
+
+    if (absEf > aec->errThresh) {
+      absEf = aec->errThresh / (absEf + 1e-10f);
+      ef[0][i] *= absEf;
+      ef[1][i] *= absEf;
+    }
+
+    // Stepsize factor
+    ef[0][i] *= aec->mu;
+    ef[1][i] *= aec->mu;
+  }
+}
+
+static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
+  int i, j;
+  for (i = 0; i < NR_PART; i++) {
+    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
+    int pos = i * PART_LEN1;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= NR_PART) {
+      xPos -= NR_PART * PART_LEN1;
+    }
+
+    // Process the whole array...
+    for (j = 0; j < PART_LEN; j+= 4) {
+      // Load xfBuf and ef.
+      const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
+      const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
+      const __m128 ef_re = _mm_loadu_ps(&ef[0][j]);
+      const __m128 ef_im = _mm_loadu_ps(&ef[1][j]);
+      // Calculate the product of conjugate(xfBuf) by ef.
+      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
+      //   im(conjugate(a) * b)=  aRe * bIm - aIm * bRe
+      const __m128 a = _mm_mul_ps(xfBuf_re, ef_re);
+      const __m128 b = _mm_mul_ps(xfBuf_im, ef_im);
+      const __m128 c = _mm_mul_ps(xfBuf_re, ef_im);
+      const __m128 d = _mm_mul_ps(xfBuf_im, ef_re);
+      const __m128 e = _mm_add_ps(a, b);
+      const __m128 f = _mm_sub_ps(c, d);
+      // Interleave real and imaginary parts.
+      const __m128 g = _mm_unpacklo_ps(e, f);
+      const __m128 h = _mm_unpackhi_ps(e, f);
+      // Store
+      _mm_storeu_ps(&fft[2*j + 0], g);
+      _mm_storeu_ps(&fft[2*j + 4], h);
+    }
+    // ... and fixup the first imaginary entry.
+    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
+                   -aec->xfBuf[1][xPos + PART_LEN],
+                   ef[0][PART_LEN], ef[1][PART_LEN]);
+
+    aec_rdft_inverse_128(fft);
+    memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN);
+
+    // fft scaling
+    {
+      float scale = 2.0f / PART_LEN2;
+      const __m128 scale_ps = _mm_load_ps1(&scale);
+      for (j = 0; j < PART_LEN; j+=4) {
+        const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
+        const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
+        _mm_storeu_ps(&fft[j], fft_scale);
+      }
+    }
+    aec_rdft_forward_128(fft);
+
+    {
+      float wt1 = aec->wfBuf[1][pos];
+      aec->wfBuf[0][pos + PART_LEN] += fft[1];
+      for (j = 0; j < PART_LEN; j+= 4) {
+        __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
+        __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
+        const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
+        const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
+        const __m128 fft_re = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2 ,0));
+        const __m128 fft_im = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3 ,1));
+        wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
+        wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
+        _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
+        _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im);
+      }
+      aec->wfBuf[1][pos] = wt1;
+    }
+  }
+}
+
+static __m128 mm_pow_ps(__m128 a, __m128 b)
+{
+  // a^b = exp2(b * log2(a))
+  //   exp2(x) and log2(x) are calculated using polynomial approximations.
+  __m128 log2_a, b_log2_a, a_exp_b;
+
+  // Calculate log2(x), x = a.
+  {
+    // To calculate log2(x), we decompose x like this:
+    //   x = y * 2^n
+    //     n is an integer
+    //     y is in the [1.0, 2.0) range
+    //
+    //   log2(x) = log2(y) + n
+    //     n       can be evaluated by playing with float representation.
+    //     log2(y) in a small range can be approximated, this code uses an order
+    //             five polynomial approximation. The coefficients have been
+    //             estimated with the Remez algorithm and the resulting
+    //             polynomial has a maximum relative error of 0.00086%.
+
+    // Compute n.
+    //    This is done by masking the exponent, shifting it into the top bit of
+    //    the mantissa, putting eight into the biased exponent (to shift/
+    //    compensate the fact that the exponent has been shifted in the top/
+    //    fractional part and finally getting rid of the implicit leading one
+    //    from the mantissa by substracting it out.
+    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END =
+        {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
+    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END =
+        {0x43800000, 0x43800000, 0x43800000, 0x43800000};
+    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END =
+        {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
+    static const int shift_exponent_into_top_mantissa = 8;
+    const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
+    const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(two_n),
+        shift_exponent_into_top_mantissa));
+    const __m128 n_0 = _mm_or_ps(n_1, *((__m128 *)eight_biased_exponent));
+    const __m128 n   = _mm_sub_ps(n_0,  *((__m128 *)implicit_leading_one));
+
+    // Compute y.
+    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END =
+        {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
+    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END =
+        {0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
+    const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask));
+    const __m128 y        = _mm_or_ps(
+        mantissa,  *((__m128 *)zero_biased_exponent_is_one));
+
+    // Approximate log2(y) ~= (y - 1) * pol5(y).
+    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
+    static const ALIGN16_BEG float ALIGN16_END C5[4] =
+        {-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
+    static const ALIGN16_BEG float ALIGN16_END C4[4] =
+        {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
+    static const ALIGN16_BEG float ALIGN16_END C3[4] =
+        {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
+    static const ALIGN16_BEG float ALIGN16_END C2[4] =
+        {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
+    static const ALIGN16_BEG float ALIGN16_END C1[4] =
+        {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
+    static const ALIGN16_BEG float ALIGN16_END C0[4] =
+        {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
+    const __m128 pol5_y_0 = _mm_mul_ps(y,        *((__m128 *)C5));
+    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4));
+    const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
+    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3));
+    const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
+    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2));
+    const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
+    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1));
+    const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
+    const __m128 pol5_y   = _mm_add_ps(pol5_y_8, *((__m128 *)C0));
+    const __m128 y_minus_one = _mm_sub_ps(
+        y, *((__m128 *)zero_biased_exponent_is_one));
+    const __m128 log2_y = _mm_mul_ps(y_minus_one ,  pol5_y);
+
+    // Combine parts.
+    log2_a = _mm_add_ps(n, log2_y);
+  }
+
+  // b * log2(a)
+  b_log2_a = _mm_mul_ps(b, log2_a);
+
+  // Calculate exp2(x), x = b * log2(a).
+  {
+    // To calculate 2^x, we decompose x like this:
+    //   x = n + y
+    //     n is an integer, the value of x - 0.5 rounded down, therefore
+    //     y is in the [0.5, 1.5) range
+    //
+    //   2^x = 2^n * 2^y
+    //     2^n can be evaluated by playing with float representation.
+    //     2^y in a small range can be approximated, this code uses an order two
+    //         polynomial approximation. The coefficients have been estimated
+    //         with the Remez algorithm and the resulting polynomial has a
+    //         maximum relative error of 0.17%.
+
+    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
+    static const ALIGN16_BEG float max_input[4] ALIGN16_END =
+        {129.f, 129.f, 129.f, 129.f};
+    static const ALIGN16_BEG float min_input[4] ALIGN16_END =
+        {-126.99999f, -126.99999f, -126.99999f, -126.99999f};
+    const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input));
+    const __m128 x_max = _mm_max_ps(x_min,    *((__m128 *)min_input));
+    // Compute n.
+    static const ALIGN16_BEG float half[4] ALIGN16_END =
+        {0.5f, 0.5f, 0.5f, 0.5f};
+    const __m128  x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half));
+    const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
+    // Compute 2^n.
+    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END =
+        {127, 127, 127, 127};
+    static const int float_exponent_shift = 23;
+    const __m128i two_n_exponent = _mm_add_epi32(
+        x_minus_half_floor, *((__m128i *)float_exponent_bias));
+    const __m128  two_n = _mm_castsi128_ps(_mm_slli_epi32(
+        two_n_exponent, float_exponent_shift));
+    // Compute y.
+    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
+    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
+    static const ALIGN16_BEG float C2[4] ALIGN16_END =
+        {3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
+    static const ALIGN16_BEG float C1[4] ALIGN16_END =
+        {6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
+    static const ALIGN16_BEG float C0[4] ALIGN16_END =
+        {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f};
+    const __m128 exp2_y_0 = _mm_mul_ps(y,        *((__m128 *)C2));
+    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1));
+    const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
+    const __m128 exp2_y   = _mm_add_ps(exp2_y_2, *((__m128 *)C0));
+
+    // Combine parts.
+    a_exp_b = _mm_mul_ps(exp2_y, two_n);
+  }
+  return a_exp_b;
+}
+
+extern const float WebRtcAec_weightCurve[65];
+extern const float WebRtcAec_overDriveCurve[65];
+
+static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
+                                     const float hNlFb,
+                                     float efw[2][PART_LEN1]) {
+  int i;
+  const __m128 vec_hNlFb = _mm_set1_ps(hNlFb);
+  const __m128 vec_one = _mm_set1_ps(1.0f);
+  const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
+  const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
+  // vectorized code (four at once)
+  for (i = 0; i + 3 < PART_LEN1; i+=4) {
+    // Weight subbands
+    __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
+    const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
+    const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
+    const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(
+        vec_weightCurve, vec_hNlFb);
+    const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
+    const __m128 vec_one_weightCurve_hNl = _mm_mul_ps(
+        vec_one_weightCurve, vec_hNl);
+    const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
+    const __m128 vec_if1 = _mm_and_ps(
+        bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
+    vec_hNl = _mm_or_ps(vec_if0, vec_if1);
+
+    {
+      const __m128 vec_overDriveCurve = _mm_loadu_ps(
+          &WebRtcAec_overDriveCurve[i]);
+      const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps(
+          vec_overDriveSm, vec_overDriveCurve);
+      vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
+      _mm_storeu_ps(&hNl[i], vec_hNl);
+    }
+
+    // Suppress error signal
+    {
+      __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]);
+      __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]);
+      vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl);
+      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl);
+
+      // Ooura fft returns incorrect sign on imaginary component. It matters
+      // here because we are making an additive change with comfort noise.
+      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one);
+      _mm_storeu_ps(&efw[0][i], vec_efw_re);
+      _mm_storeu_ps(&efw[1][i], vec_efw_im);
+    }
+  }
+  // scalar code for the remaining items.
+  for (; i < PART_LEN1; i++) {
+    // Weight subbands
+    if (hNl[i] > hNlFb) {
+      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
+          (1 - WebRtcAec_weightCurve[i]) * hNl[i];
+    }
+    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+
+    // Suppress error signal
+    efw[0][i] *= hNl[i];
+    efw[1][i] *= hNl[i];
+
+    // Ooura fft returns incorrect sign on imaginary component. It matters
+    // here because we are making an additive change with comfort noise.
+    efw[1][i] *= -1;
+  }
+}
+
+void WebRtcAec_InitAec_SSE2(void) {
+  WebRtcAec_FilterFar = FilterFarSSE2;
+  WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
+  WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
+  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
+}
+
+#endif   // WEBRTC_USE_SSE2
--- a/webrtc/modules/audio_processing/aec/aec_rdft.c
+++ b/webrtc/modules/audio_processing/aec/aec_rdft.c
@@ -0,0 +1,587 @@
+/*
+ * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+ * Copyright Takuya OOURA, 1996-2001
+ *
+ * You may use, copy, modify and distribute this code for any purpose (include
+ * commercial use) and without fee. Please refer to this package when you modify
+ * this code.
+ *
+ * Changes by the WebRTC authors:
+ *    - Trivial type modifications.
+ *    - Minimal code subset to do rdft of length 128.
+ *    - Optimizations because of known length.
+ *
+ *  All changes are covered by the WebRTC license and IP grant:
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "aec_rdft.h"
+
+#include <math.h>
+
+#include "system_wrappers/interface/cpu_features_wrapper.h"
+#include "typedefs.h"
+
+// constants shared by all paths (C, SSE2).
+float rdft_w[64];
+// constants used by the C path.
+float rdft_wk3ri_first[32];
+float rdft_wk3ri_second[32];
+// constants used by SSE2 but initialized in C path.
+ALIGN16_BEG float ALIGN16_END rdft_wk1r[32];
+ALIGN16_BEG float ALIGN16_END rdft_wk2r[32];
+ALIGN16_BEG float ALIGN16_END rdft_wk3r[32];
+ALIGN16_BEG float ALIGN16_END rdft_wk1i[32];
+ALIGN16_BEG float ALIGN16_END rdft_wk2i[32];
+ALIGN16_BEG float ALIGN16_END rdft_wk3i[32];
+ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4];
+
+static int ip[16];
+
+static void bitrv2_32or128(int n, int *ip, float *a) {
+  // n is 32 or 128
+  int j, j1, k, k1, m, m2;
+  float xr, xi, yr, yi;
+
+  ip[0] = 0;
+  {
+    int l = n;
+    m = 1;
+    while ((m << 3) < l) {
+      l >>= 1;
+      for (j = 0; j < m; j++) {
+        ip[m + j] = ip[j] + l;
+      }
+      m <<= 1;
+    }
+  }
+  m2 = 2 * m;
+  for (k = 0; k < m; k++) {
+    for (j = 0; j < k; j++) {
+      j1 = 2 * j + ip[k];
+      k1 = 2 * k + ip[j];
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += m2;
+      k1 += 2 * m2;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += m2;
+      k1 -= m2;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += m2;
+      k1 += 2 * m2;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+    }
+    j1 = 2 * k + m2 + ip[k];
+    k1 = j1 + m2;
+    xr = a[j1];
+    xi = a[j1 + 1];
+    yr = a[k1];
+    yi = a[k1 + 1];
+    a[j1] = yr;
+    a[j1 + 1] = yi;
+    a[k1] = xr;
+    a[k1 + 1] = xi;
+  }
+}
+
+static void makewt_32(void) {
+  const int nw = 32;
+  int j, nwh;
+  float delta, x, y;
+
+  ip[0] = nw;
+  ip[1] = 1;
+  nwh = nw >> 1;
+  delta = atanf(1.0f) / nwh;
+  rdft_w[0] = 1;
+  rdft_w[1] = 0;
+  rdft_w[nwh] = cosf(delta * nwh);
+  rdft_w[nwh + 1] = rdft_w[nwh];
+  for (j = 2; j < nwh; j += 2) {
+    x = cosf(delta * j);
+    y = sinf(delta * j);
+    rdft_w[j] = x;
+    rdft_w[j + 1] = y;
+    rdft_w[nw - j] = y;
+    rdft_w[nw - j + 1] = x;
+  }
+  bitrv2_32or128(nw, ip + 2, rdft_w);
+
+  // pre-calculate constants used by cft1st_128 and cftmdl_128...
+  cftmdl_wk1r[0] = rdft_w[2];
+  cftmdl_wk1r[1] = rdft_w[2];
+  cftmdl_wk1r[2] = rdft_w[2];
+  cftmdl_wk1r[3] = -rdft_w[2];
+  {
+    int k1;
+
+    for (k1 = 0, j = 0; j < 128; j += 16, k1 += 2) {
+      const int k2 = 2 * k1;
+      const float wk2r = rdft_w[k1 + 0];
+      const float wk2i = rdft_w[k1 + 1];
+      float wk1r, wk1i;
+      // ... scalar version.
+      wk1r = rdft_w[k2 + 0];
+      wk1i = rdft_w[k2 + 1];
+      rdft_wk3ri_first[k1 + 0] = wk1r - 2 * wk2i * wk1i;
+      rdft_wk3ri_first[k1 + 1] = 2 * wk2i * wk1r - wk1i;
+      wk1r = rdft_w[k2 + 2];
+      wk1i = rdft_w[k2 + 3];
+      rdft_wk3ri_second[k1 + 0] = wk1r - 2 * wk2r * wk1i;
+      rdft_wk3ri_second[k1 + 1] = 2 * wk2r * wk1r - wk1i;
+      // ... vector version.
+      rdft_wk1r[k2 + 0] = rdft_w[k2 + 0];
+      rdft_wk1r[k2 + 1] = rdft_w[k2 + 0];
+      rdft_wk1r[k2 + 2] = rdft_w[k2 + 2];
+      rdft_wk1r[k2 + 3] = rdft_w[k2 + 2];
+      rdft_wk2r[k2 + 0] = rdft_w[k1 + 0];
+      rdft_wk2r[k2 + 1] = rdft_w[k1 + 0];
+      rdft_wk2r[k2 + 2] = -rdft_w[k1 + 1];
+      rdft_wk2r[k2 + 3] = -rdft_w[k1 + 1];
+      rdft_wk3r[k2 + 0] = rdft_wk3ri_first[k1 + 0];
+      rdft_wk3r[k2 + 1] = rdft_wk3ri_first[k1 + 0];
+      rdft_wk3r[k2 + 2] = rdft_wk3ri_second[k1 + 0];
+      rdft_wk3r[k2 + 3] = rdft_wk3ri_second[k1 + 0];
+      rdft_wk1i[k2 + 0] = -rdft_w[k2 + 1];
+      rdft_wk1i[k2 + 1] = rdft_w[k2 + 1];
+      rdft_wk1i[k2 + 2] = -rdft_w[k2 + 3];
+      rdft_wk1i[k2 + 3] = rdft_w[k2 + 3];
+      rdft_wk2i[k2 + 0] = -rdft_w[k1 + 1];
+      rdft_wk2i[k2 + 1] = rdft_w[k1 + 1];
+      rdft_wk2i[k2 + 2] = -rdft_w[k1 + 0];
+      rdft_wk2i[k2 + 3] = rdft_w[k1 + 0];
+      rdft_wk3i[k2 + 0] = -rdft_wk3ri_first[k1 + 1];
+      rdft_wk3i[k2 + 1] = rdft_wk3ri_first[k1 + 1];
+      rdft_wk3i[k2 + 2] = -rdft_wk3ri_second[k1 + 1];
+      rdft_wk3i[k2 + 3] = rdft_wk3ri_second[k1 + 1];
+    }
+  }
+}
+
+static void makect_32(void) {
+  float *c = rdft_w + 32;
+  const int nc = 32;
+  int j, nch;
+  float delta;
+
+  ip[1] = nc;
+  nch = nc >> 1;
+  delta = atanf(1.0f) / nch;
+  c[0] = cosf(delta * nch);
+  c[nch] = 0.5f * c[0];
+  for (j = 1; j < nch; j++) {
+    c[j] = 0.5f * cosf(delta * j);
+    c[nc - j] = 0.5f * sinf(delta * j);
+  }
+}
+
+static void cft1st_128_C(float *a) {
+  const int n = 128;
+  int j, k1, k2;
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  x0r = a[0] + a[2];
+  x0i = a[1] + a[3];
+  x1r = a[0] - a[2];
+  x1i = a[1] - a[3];
+  x2r = a[4] + a[6];
+  x2i = a[5] + a[7];
+  x3r = a[4] - a[6];
+  x3i = a[5] - a[7];
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[4] = x0r - x2r;
+  a[5] = x0i - x2i;
+  a[2] = x1r - x3i;
+  a[3] = x1i + x3r;
+  a[6] = x1r + x3i;
+  a[7] = x1i - x3r;
+  wk1r = rdft_w[2];
+  x0r = a[8] + a[10];
+  x0i = a[9] + a[11];
+  x1r = a[8] - a[10];
+  x1i = a[9] - a[11];
+  x2r = a[12] + a[14];
+  x2i = a[13] + a[15];
+  x3r = a[12] - a[14];
+  x3i = a[13] - a[15];
+  a[8] = x0r + x2r;
+  a[9] = x0i + x2i;
+  a[12] = x2i - x0i;
+  a[13] = x0r - x2r;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[10] = wk1r * (x0r - x0i);
+  a[11] = wk1r * (x0r + x0i);
+  x0r = x3i + x1r;
+  x0i = x3r - x1i;
+  a[14] = wk1r * (x0i - x0r);
+  a[15] = wk1r * (x0i + x0r);
+  k1 = 0;
+  for (j = 16; j < n; j += 16) {
+    k1 += 2;
+    k2 = 2 * k1;
+    wk2r = rdft_w[k1 + 0];
+    wk2i = rdft_w[k1 + 1];
+    wk1r = rdft_w[k2 + 0];
+    wk1i = rdft_w[k2 + 1];
+    wk3r = rdft_wk3ri_first[k1 + 0];
+    wk3i = rdft_wk3ri_first[k1 + 1];
+    x0r = a[j + 0] + a[j + 2];
+    x0i = a[j + 1] + a[j + 3];
+    x1r = a[j + 0] - a[j + 2];
+    x1i = a[j + 1] - a[j + 3];
+    x2r = a[j + 4] + a[j + 6];
+    x2i = a[j + 5] + a[j + 7];
+    x3r = a[j + 4] - a[j + 6];
+    x3i = a[j + 5] - a[j + 7];
+    a[j + 0] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    x0r -= x2r;
+    x0i -= x2i;
+    a[j + 4] = wk2r * x0r - wk2i * x0i;
+    a[j + 5] = wk2r * x0i + wk2i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j + 2] = wk1r * x0r - wk1i * x0i;
+    a[j + 3] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j + 6] = wk3r * x0r - wk3i * x0i;
+    a[j + 7] = wk3r * x0i + wk3i * x0r;
+    wk1r = rdft_w[k2 + 2];
+    wk1i = rdft_w[k2 + 3];
+    wk3r = rdft_wk3ri_second[k1 + 0];
+    wk3i = rdft_wk3ri_second[k1 + 1];
+    x0r = a[j + 8] + a[j + 10];
+    x0i = a[j + 9] + a[j + 11];
+    x1r = a[j + 8] - a[j + 10];
+    x1i = a[j + 9] - a[j + 11];
+    x2r = a[j + 12] + a[j + 14];
+    x2i = a[j + 13] + a[j + 15];
+    x3r = a[j + 12] - a[j + 14];
+    x3i = a[j + 13] - a[j + 15];
+    a[j + 8] = x0r + x2r;
+    a[j + 9] = x0i + x2i;
+    x0r -= x2r;
+    x0i -= x2i;
+    a[j + 12] = -wk2i * x0r - wk2r * x0i;
+    a[j + 13] = -wk2i * x0i + wk2r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j + 10] = wk1r * x0r - wk1i * x0i;
+    a[j + 11] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j + 14] = wk3r * x0r - wk3i * x0i;
+    a[j + 15] = wk3r * x0i + wk3i * x0r;
+  }
+}
+
+static void cftmdl_128_C(float *a) {
+  const int l = 8;
+  const int n = 128;
+  const int m = 32;
+  int j0, j1, j2, j3, k, k1, k2, m2;
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  for (j0 = 0; j0 < l; j0 += 2) {
+    j1 = j0 +  8;
+    j2 = j0 + 16;
+    j3 = j0 + 24;
+    x0r = a[j0 + 0] + a[j1 + 0];
+    x0i = a[j0 + 1] + a[j1 + 1];
+    x1r = a[j0 + 0] - a[j1 + 0];
+    x1i = a[j0 + 1] - a[j1 + 1];
+    x2r = a[j2 + 0] + a[j3 + 0];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2 + 0] - a[j3 + 0];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j0 + 0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j2 + 0] = x0r - x2r;
+    a[j2 + 1] = x0i - x2i;
+    a[j1 + 0] = x1r - x3i;
+    a[j1 + 1] = x1i + x3r;
+    a[j3 + 0] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+  }
+  wk1r = rdft_w[2];
+  for (j0 = m; j0 < l + m; j0 += 2) {
+    j1 = j0 +  8;
+    j2 = j0 + 16;
+    j3 = j0 + 24;
+    x0r = a[j0 + 0] + a[j1 + 0];
+    x0i = a[j0 + 1] + a[j1 + 1];
+    x1r = a[j0 + 0] - a[j1 + 0];
+    x1i = a[j0 + 1] - a[j1 + 1];
+    x2r = a[j2 + 0] + a[j3 + 0];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2 + 0] - a[j3 + 0];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j0 + 0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j2 + 0] = x2i - x0i;
+    a[j2 + 1] = x0r - x2r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j1 + 0] = wk1r * (x0r - x0i);
+    a[j1 + 1] = wk1r * (x0r + x0i);
+    x0r = x3i + x1r;
+    x0i = x3r - x1i;
+    a[j3 + 0] = wk1r * (x0i - x0r);
+    a[j3 + 1] = wk1r * (x0i + x0r);
+  }
+  k1 = 0;
+  m2 = 2 * m;
+  for (k = m2; k < n; k += m2) {
+    k1 += 2;
+    k2 = 2 * k1;
+    wk2r = rdft_w[k1 + 0];
+    wk2i = rdft_w[k1 + 1];
+    wk1r = rdft_w[k2 + 0];
+    wk1i = rdft_w[k2 + 1];
+    wk3r = rdft_wk3ri_first[k1 + 0];
+    wk3i = rdft_wk3ri_first[k1 + 1];
+    for (j0 = k; j0 < l + k; j0 += 2) {
+      j1 = j0 +  8;
+      j2 = j0 + 16;
+      j3 = j0 + 24;
+      x0r = a[j0 + 0] + a[j1 + 0];
+      x0i = a[j0 + 1] + a[j1 + 1];
+      x1r = a[j0 + 0] - a[j1 + 0];
+      x1i = a[j0 + 1] - a[j1 + 1];
+      x2r = a[j2 + 0] + a[j3 + 0];
+      x2i = a[j2 + 1] + a[j3 + 1];
+      x3r = a[j2 + 0] - a[j3 + 0];
+      x3i = a[j2 + 1] - a[j3 + 1];
+      a[j0 + 0] = x0r + x2r;
+      a[j0 + 1] = x0i + x2i;
+      x0r -= x2r;
+      x0i -= x2i;
+      a[j2 + 0] = wk2r * x0r - wk2i * x0i;
+      a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
+      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+    }
+    wk1r = rdft_w[k2 + 2];
+    wk1i = rdft_w[k2 + 3];
+    wk3r = rdft_wk3ri_second[k1 + 0];
+    wk3i = rdft_wk3ri_second[k1 + 1];
+    for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
+      j1 = j0 +  8;
+      j2 = j0 + 16;
+      j3 = j0 + 24;
+      x0r = a[j0 + 0] + a[j1 + 0];
+      x0i = a[j0 + 1] + a[j1 + 1];
+      x1r = a[j0 + 0] - a[j1 + 0];
+      x1i = a[j0 + 1] - a[j1 + 1];
+      x2r = a[j2 + 0] + a[j3 + 0];
+      x2i = a[j2 + 1] + a[j3 + 1];
+      x3r = a[j2 + 0] - a[j3 + 0];
+      x3i = a[j2 + 1] - a[j3 + 1];
+      a[j0 + 0] = x0r + x2r;
+      a[j0 + 1] = x0i + x2i;
+      x0r -= x2r;
+      x0i -= x2i;
+      a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
+      a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
+      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+    }
+  }
+}
+
+static void cftfsub_128(float *a) {
+  int j, j1, j2, j3, l;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  cft1st_128(a);
+  cftmdl_128(a);
+  l = 32;
+  for (j = 0; j < l; j += 2) {
+    j1 = j + l;
+    j2 = j1 + l;
+    j3 = j2 + l;
+    x0r = a[j] + a[j1];
+    x0i = a[j + 1] + a[j1 + 1];
+    x1r = a[j] - a[j1];
+    x1i = a[j + 1] - a[j1 + 1];
+    x2r = a[j2] + a[j3];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2] - a[j3];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    a[j2] = x0r - x2r;
+    a[j2 + 1] = x0i - x2i;
+    a[j1] = x1r - x3i;
+    a[j1 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+  }
+}
+
+static void cftbsub_128(float *a) {
+  int j, j1, j2, j3, l;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  cft1st_128(a);
+  cftmdl_128(a);
+  l = 32;
+
+  for (j = 0; j < l; j += 2) {
+    j1 = j + l;
+    j2 = j1 + l;
+    j3 = j2 + l;
+    x0r = a[j] + a[j1];
+    x0i = -a[j + 1] - a[j1 + 1];
+    x1r = a[j] - a[j1];
+    x1i = -a[j + 1] + a[j1 + 1];
+    x2r = a[j2] + a[j3];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2] - a[j3];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i - x2i;
+    a[j2] = x0r - x2r;
+    a[j2 + 1] = x0i + x2i;
+    a[j1] = x1r - x3i;
+    a[j1 + 1] = x1i - x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i + x3r;
+  }
+}
+
+static void rftfsub_128_C(float *a) {
+  const float *c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 =  32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr - wki * xi;
+    yi = wkr * xi + wki * xr;
+    a[j2 + 0] -= yr;
+    a[j2 + 1] -= yi;
+    a[k2 + 0] += yr;
+    a[k2 + 1] -= yi;
+  }
+}
+
+static void rftbsub_128_C(float *a) {
+  const float *c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  a[1] = -a[1];
+  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 =  32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr + wki * xi;
+    yi = wkr * xi - wki * xr;
+    a[j2 + 0] = a[j2 + 0] - yr;
+    a[j2 + 1] = yi - a[j2 + 1];
+    a[k2 + 0] = yr + a[k2 + 0];
+    a[k2 + 1] = yi - a[k2 + 1];
+  }
+  a[65] = -a[65];
+}
+
+void aec_rdft_forward_128(float *a) {
+  const int n = 128;
+  float xi;
+
+  bitrv2_32or128(n, ip + 2, a);
+  cftfsub_128(a);
+  rftfsub_128(a);
+  xi = a[0] - a[1];
+  a[0] += a[1];
+  a[1] = xi;
+}
+
+void aec_rdft_inverse_128(float *a) {
+  const int n = 128;
+
+  a[1] = 0.5f * (a[0] - a[1]);
+  a[0] -= a[1];
+  rftbsub_128(a);
+  bitrv2_32or128(n, ip + 2, a);
+  cftbsub_128(a);
+}
+
+// code path selection
+rft_sub_128_t cft1st_128;
+rft_sub_128_t cftmdl_128;
+rft_sub_128_t rftfsub_128;
+rft_sub_128_t rftbsub_128;
+
+void aec_rdft_init(void) {
+  cft1st_128 = cft1st_128_C;
+  cftmdl_128 = cftmdl_128_C;
+  rftfsub_128 = rftfsub_128_C;
+  rftbsub_128 = rftbsub_128_C;
+  if (WebRtc_GetCPUInfo(kSSE2)) {
+#if defined(WEBRTC_USE_SSE2)
+    aec_rdft_init_sse2();
+#endif
+  }
+  // init library constants.
+  makewt_32();
+  makect_32();
+}
--- a/webrtc/modules/audio_processing/aec/aec_rdft.h
+++ b/webrtc/modules/audio_processing/aec/aec_rdft.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
+
+// These intrinsics were unavailable before VS 2008.
+// TODO(andrew): move to a common file.
+#if defined(_MSC_VER) && _MSC_VER < 1500
+#include <emmintrin.h>
+static __inline __m128 _mm_castsi128_ps(__m128i a) { return *(__m128*)&a; }
+static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
+#endif
+
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+// constants shared by all paths (C, SSE2).
+extern float rdft_w[64];
+// constants used by the C path.
+extern float rdft_wk3ri_first[32];
+extern float rdft_wk3ri_second[32];
+// constants used by SSE2 but initialized in C path.
+extern float rdft_wk1r[32];
+extern float rdft_wk2r[32];
+extern float rdft_wk3r[32];
+extern float rdft_wk1i[32];
+extern float rdft_wk2i[32];
+extern float rdft_wk3i[32];
+extern float cftmdl_wk1r[4];
+
+// code path selection function pointers
+typedef void (*rft_sub_128_t)(float *a);
+extern rft_sub_128_t rftfsub_128;
+extern rft_sub_128_t rftbsub_128;
+extern rft_sub_128_t cft1st_128;
+extern rft_sub_128_t cftmdl_128;
+
+// entry points
+void aec_rdft_init(void);
+void aec_rdft_init_sse2(void);
+void aec_rdft_forward_128(float *a);
+void aec_rdft_inverse_128(float *a);
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
--- a/webrtc/modules/audio_processing/aec/aec_rdft_sse2.c
+++ b/webrtc/modules/audio_processing/aec/aec_rdft_sse2.c
@@ -0,0 +1,431 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "typedefs.h"
+
+#if defined(WEBRTC_USE_SSE2)
+#include <emmintrin.h>
+
+#include "aec_rdft.h"
+
+static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] =
+  {-1.f, 1.f, -1.f, 1.f};
+
+static void cft1st_128_SSE2(float *a) {
+  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
+  int j, k2;
+
+  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
+          __m128 a00v   = _mm_loadu_ps(&a[j +  0]);
+          __m128 a04v   = _mm_loadu_ps(&a[j +  4]);
+          __m128 a08v   = _mm_loadu_ps(&a[j +  8]);
+          __m128 a12v   = _mm_loadu_ps(&a[j + 12]);
+          __m128 a01v   = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1 ,0));
+          __m128 a23v   = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3 ,2));
+          __m128 a45v   = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1 ,0));
+          __m128 a67v   = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3 ,2));
+
+    const __m128 wk1rv  = _mm_load_ps(&rdft_wk1r[k2]);
+    const __m128 wk1iv  = _mm_load_ps(&rdft_wk1i[k2]);
+    const __m128 wk2rv  = _mm_load_ps(&rdft_wk2r[k2]);
+    const __m128 wk2iv  = _mm_load_ps(&rdft_wk2i[k2]);
+    const __m128 wk3rv  = _mm_load_ps(&rdft_wk3r[k2]);
+    const __m128 wk3iv  = _mm_load_ps(&rdft_wk3i[k2]);
+          __m128 x0v    = _mm_add_ps(a01v, a23v);
+    const __m128 x1v    = _mm_sub_ps(a01v, a23v);
+    const __m128 x2v    = _mm_add_ps(a45v, a67v);
+    const __m128 x3v    = _mm_sub_ps(a45v, a67v);
+          __m128 x0w;
+                 a01v   = _mm_add_ps(x0v, x2v);
+                 x0v    = _mm_sub_ps(x0v, x2v);
+                 x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
+    {
+      const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
+      const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
+                   a45v   = _mm_add_ps(a45_0v, a45_1v);
+    }
+    {
+            __m128 a23_0v, a23_1v;
+      const __m128 x3w    = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1));
+      const __m128 x3s    = _mm_mul_ps(mm_swap_sign, x3w);
+                   x0v    = _mm_add_ps(x1v, x3s);
+                   x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
+                   a23_0v = _mm_mul_ps(wk1rv, x0v);
+                   a23_1v = _mm_mul_ps(wk1iv, x0w);
+                   a23v   = _mm_add_ps(a23_0v, a23_1v);
+
+                   x0v    = _mm_sub_ps(x1v, x3s);
+                   x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
+    }
+    {
+      const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
+      const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
+                   a67v   = _mm_add_ps(a67_0v, a67_1v);
+    }
+
+                 a00v   = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1 ,0));
+                 a04v   = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1 ,0));
+                 a08v   = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3 ,2));
+                 a12v   = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3 ,2));
+    _mm_storeu_ps(&a[j +  0], a00v);
+    _mm_storeu_ps(&a[j +  4], a04v);
+    _mm_storeu_ps(&a[j +  8], a08v);
+    _mm_storeu_ps(&a[j + 12], a12v);
+  }
+}
+
+static void cftmdl_128_SSE2(float *a) {
+  const int l = 8;
+  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
+  int j0;
+
+  __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
+  for (j0 = 0; j0 < l; j0 += 2) {
+    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 +  0]);
+    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 +  8]);
+    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
+    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
+    const __m128  a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
+                                           _mm_castsi128_ps(a_32),
+                                           _MM_SHUFFLE(1, 0, 1 ,0));
+    const __m128  a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
+                                           _mm_castsi128_ps(a_40),
+                                           _MM_SHUFFLE(1, 0, 1 ,0));
+          __m128  x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+    const __m128  x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
+
+    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
+    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
+    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
+    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
+    const __m128  a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
+                                           _mm_castsi128_ps(a_48),
+                                           _MM_SHUFFLE(1, 0, 1 ,0));
+    const __m128  a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
+                                           _mm_castsi128_ps(a_56),
+                                           _MM_SHUFFLE(1, 0, 1 ,0));
+    const __m128  x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
+    const __m128  x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
+
+    const __m128  xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const __m128  xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+
+    const __m128  x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
+        _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
+                          _MM_SHUFFLE(2, 3, 0, 1)));
+    const __m128  x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+    const __m128  x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+    const __m128  x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+
+    const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
+                                      _MM_SHUFFLE(2, 2, 2 ,2));
+    const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
+                                      _MM_SHUFFLE(3, 3, 3 ,3));
+    const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
+    const __m128 yy3 = _mm_add_ps(yy0, yy2);
+    const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
+
+    _mm_storel_epi64((__m128i*)&a[j0 +  0], _mm_castps_si128(xx0));
+    _mm_storel_epi64((__m128i*)&a[j0 + 32],
+                     _mm_shuffle_epi32(_mm_castps_si128(xx0),
+                                       _MM_SHUFFLE(3, 2, 3, 2)));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
+    _mm_storel_epi64((__m128i*)&a[j0 + 48],
+                     _mm_shuffle_epi32(_mm_castps_si128(xx1),
+                                       _MM_SHUFFLE(2, 3, 2, 3)));
+    a[j0 + 48] = -a[j0 + 48];
+
+    _mm_storel_epi64((__m128i*)&a[j0 +  8], _mm_castps_si128(x1_x3_add));
+    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
+    _mm_storel_epi64((__m128i*)&a[j0 + 56],
+                     _mm_shuffle_epi32(_mm_castps_si128(yy4),
+                     _MM_SHUFFLE(2, 3, 2, 3)));
+  }
+
+  {
+    int k = 64;
+    int k1 = 2;
+    int k2 = 2 * k1;
+    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]);
+    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]);
+    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]);
+    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]);
+    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]);
+                 wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]);
+    for (j0 = k; j0 < l + k; j0 += 2) {
+      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 +  0]);
+      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 +  8]);
+      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
+      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
+      const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
+                                            _mm_castsi128_ps(a_32),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
+                                            _mm_castsi128_ps(a_40),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+            __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
+
+      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
+      const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
+      const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
+      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
+      const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
+                                            _mm_castsi128_ps(a_48),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
+                                            _mm_castsi128_ps(a_56),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
+      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
+
+      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv);
+      const __m128 xx3 = _mm_mul_ps(wk2iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx4 = _mm_add_ps(xx2, xx3);
+
+      const __m128  x3i0_3r0_3i1_x3r1 =  _mm_castsi128_ps(
+          _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
+                            _MM_SHUFFLE(2, 3, 0, 1)));
+      const __m128  x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+      const __m128  x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+      const __m128  x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+
+      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
+      const __m128 xx11 = _mm_mul_ps(wk1iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx12 = _mm_add_ps(xx10, xx11);
+
+      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
+      const __m128 xx21 = _mm_mul_ps(wk3iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
+                           _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx22 = _mm_add_ps(xx20, xx21);
+
+      _mm_storel_epi64((__m128i*)&a[j0 +  0], _mm_castps_si128(xx));
+      _mm_storel_epi64((__m128i*)&a[j0 + 32],
+                         _mm_shuffle_epi32(_mm_castps_si128(xx),
+                                           _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
+      _mm_storel_epi64((__m128i*)&a[j0 + 48],
+                        _mm_shuffle_epi32(_mm_castps_si128(xx4),
+                                          _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 +  8], _mm_castps_si128(xx12));
+      _mm_storel_epi64((__m128i*)&a[j0 + 40],
+                       _mm_shuffle_epi32(_mm_castps_si128(xx12),
+                                         _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
+      _mm_storel_epi64((__m128i*)&a[j0 + 56],
+                       _mm_shuffle_epi32(_mm_castps_si128(xx22),
+                                         _MM_SHUFFLE(3, 2, 3, 2)));
+    }
+  }
+}
+
+static void rftfsub_128_SSE2(float *a) {
+  const float *c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  static const ALIGN16_BEG float ALIGN16_END k_half[4] =
+    {0.5f, 0.5f, 0.5f, 0.5f};
+  const __m128 mm_half = _mm_load_ps(k_half);
+
+  // Vectorized code (four at once).
+  //    Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const __m128 c_j1 = _mm_loadu_ps(&c[     j1]);         //  1,  2,  3,  4,
+    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);         // 28, 29, 30, 31,
+    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);         // 28, 29, 30, 31,
+    const __m128 wkr_ =
+      _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
+    const __m128 wki_ = c_j1;                              //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    const __m128 a_j2_0 = _mm_loadu_ps(&a[0   + j2]);  //   2,   3,   4,   5,
+    const __m128 a_j2_4 = _mm_loadu_ps(&a[4   + j2]);  //   6,   7,   8,   9,
+    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
+    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
+    const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
+                            _MM_SHUFFLE(2, 0, 2 ,0));  //   2,   4,   6,   8,
+    const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
+                            _MM_SHUFFLE(3, 1, 3 ,1));  //   3,   5,   7,   9,
+    const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
+                            _MM_SHUFFLE(0, 2, 0 ,2));  // 126, 124, 122, 120,
+    const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
+                            _MM_SHUFFLE(1, 3, 1 ,3));  // 127, 125, 123, 121,
+    // Calculate 'x'.
+    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
+                                               // 2-126, 4-124, 6-122, 8-120,
+    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
+                                               // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr - wki * xi;
+    //    yi = wkr * xi + wki * xr;
+    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
+    const __m128 b_ = _mm_mul_ps(wki_, xi_);
+    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
+    const __m128 d_ = _mm_mul_ps(wki_, xr_);
+    const __m128 yr_ = _mm_sub_ps(a_, b_);     // 2-126, 4-124, 6-122, 8-120,
+    const __m128 yi_ = _mm_add_ps(c_, d_);     // 3-127, 5-125, 7-123, 9-121,
+    // Update 'a'.
+    //    a[j2 + 0] -= yr;
+    //    a[j2 + 1] -= yi;
+    //    a[k2 + 0] += yr;
+    //    a[k2 + 1] -= yi;
+    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
+    const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
+    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
+    const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
+    // Shuffle in right order and store.
+    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
+                                                       //   2,   3,   4,   5,
+    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
+                                                       //   6,   7,   8,   9,
+    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
+                                                       // 122, 123, 120, 121,
+    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
+                                                       // 126, 127, 124, 125,
+    const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
+                            _MM_SHUFFLE(1, 0, 3 ,2));  // 120, 121, 122, 123,
+    const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
+                            _MM_SHUFFLE(1, 0, 3 ,2));  // 124, 125, 126, 127,
+    _mm_storeu_ps(&a[0   + j2], a_j2_0n);
+    _mm_storeu_ps(&a[4   + j2], a_j2_4n);
+    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
+    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
+  }
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 =  32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr - wki * xi;
+    yi = wkr * xi + wki * xr;
+    a[j2 + 0] -= yr;
+    a[j2 + 1] -= yi;
+    a[k2 + 0] += yr;
+    a[k2 + 1] -= yi;
+  }
+}
+
+static void rftbsub_128_SSE2(float *a) {
+  const float *c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  static const ALIGN16_BEG float ALIGN16_END k_half[4] =
+    {0.5f, 0.5f, 0.5f, 0.5f};
+  const __m128 mm_half = _mm_load_ps(k_half);
+
+  a[1] = -a[1];
+  // Vectorized code (four at once).
+  //    Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const __m128 c_j1 = _mm_loadu_ps(&c[     j1]);         //  1,  2,  3,  4,
+    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);         // 28, 29, 30, 31,
+    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);         // 28, 29, 30, 31,
+    const __m128 wkr_ =
+      _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
+    const __m128 wki_ = c_j1;                              //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    const __m128 a_j2_0 = _mm_loadu_ps(&a[0   + j2]);  //   2,   3,   4,   5,
+    const __m128 a_j2_4 = _mm_loadu_ps(&a[4   + j2]);  //   6,   7,   8,   9,
+    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
+    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
+    const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
+                            _MM_SHUFFLE(2, 0, 2 ,0));  //   2,   4,   6,   8,
+    const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
+                            _MM_SHUFFLE(3, 1, 3 ,1));  //   3,   5,   7,   9,
+    const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
+                            _MM_SHUFFLE(0, 2, 0 ,2));  // 126, 124, 122, 120,
+    const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
+                            _MM_SHUFFLE(1, 3, 1 ,3));  // 127, 125, 123, 121,
+    // Calculate 'x'.
+    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
+                                               // 2-126, 4-124, 6-122, 8-120,
+    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
+                                               // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr + wki * xi;
+    //    yi = wkr * xi - wki * xr;
+    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
+    const __m128 b_ = _mm_mul_ps(wki_, xi_);
+    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
+    const __m128 d_ = _mm_mul_ps(wki_, xr_);
+    const __m128 yr_ = _mm_add_ps(a_, b_);     // 2-126, 4-124, 6-122, 8-120,
+    const __m128 yi_ = _mm_sub_ps(c_, d_);     // 3-127, 5-125, 7-123, 9-121,
+    // Update 'a'.
+    //    a[j2 + 0] = a[j2 + 0] - yr;
+    //    a[j2 + 1] = yi - a[j2 + 1];
+    //    a[k2 + 0] = yr + a[k2 + 0];
+    //    a[k2 + 1] = yi - a[k2 + 1];
+    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
+    const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
+    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
+    const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
+    // Shuffle in right order and store.
+    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
+                                                       //   2,   3,   4,   5,
+    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
+                                                       //   6,   7,   8,   9,
+    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
+                                                       // 122, 123, 120, 121,
+    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
+                                                       // 126, 127, 124, 125,
+    const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
+                            _MM_SHUFFLE(1, 0, 3 ,2));  // 120, 121, 122, 123,
+    const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
+                            _MM_SHUFFLE(1, 0, 3 ,2));  // 124, 125, 126, 127,
+    _mm_storeu_ps(&a[0   + j2], a_j2_0n);
+    _mm_storeu_ps(&a[4   + j2], a_j2_4n);
+    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
+    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
+  }
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 =  32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr + wki * xi;
+    yi = wkr * xi - wki * xr;
+    a[j2 + 0] = a[j2 + 0] - yr;
+    a[j2 + 1] = yi - a[j2 + 1];
+    a[k2 + 0] = yr + a[k2 + 0];
+    a[k2 + 1] = yi - a[k2 + 1];
+  }
+  a[65] = -a[65];
+}
+
+void aec_rdft_init_sse2(void) {
+  cft1st_128 = cft1st_128_SSE2;
+  cftmdl_128 = cftmdl_128_SSE2;
+  rftfsub_128 = rftfsub_128_SSE2;
+  rftbsub_128 = rftbsub_128_SSE2;
+}
+
+#endif  // WEBRTC_USE_SS2
--- a/webrtc/modules/audio_processing/aec/echo_cancellation.c
+++ b/webrtc/modules/audio_processing/aec/echo_cancellation.c
@@ -0,0 +1,901 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Contains the API functions for the AEC.
+ */
+#include "echo_cancellation.h"
+
+#include <math.h>
+#ifdef AEC_DEBUG
+#include <stdio.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+#include "aec_core.h"
+#include "resampler.h"
+#include "ring_buffer.h"
+
+#define BUF_SIZE_FRAMES 50 // buffer size (frames)
+// Maximum length of resampled signal. Must be an integer multiple of frames
+// (ceil(1/(1 + MIN_SKEW)*2) + 1)*FRAME_LEN
+// The factor of 2 handles wb, and the + 1 is as a safety margin
+#define MAX_RESAMP_LEN (5 * FRAME_LEN)
+
+static const int bufSizeSamp = BUF_SIZE_FRAMES * FRAME_LEN; // buffer size (samples)
+static const int sampMsNb = 8; // samples per ms in nb
+// Target suppression levels for nlp modes
+// log{0.001, 0.00001, 0.00000001}
+static const float targetSupp[3] = {-6.9f, -11.5f, -18.4f};
+static const float minOverDrive[3] = {1.0f, 2.0f, 5.0f};
+static const int initCheck = 42;
+
+typedef struct {
+    int delayCtr;
+    int sampFreq;
+    int splitSampFreq;
+    int scSampFreq;
+    float sampFactor; // scSampRate / sampFreq
+    short nlpMode;
+    short autoOnOff;
+    short activity;
+    short skewMode;
+    short bufSizeStart;
+    //short bufResetCtr;  // counts number of noncausal frames
+    int knownDelay;
+
+    // Stores the last frame added to the farend buffer
+    short farendOld[2][FRAME_LEN];
+    short initFlag; // indicates if AEC has been initialized
+
+    // Variables used for averaging far end buffer size
+    short counter;
+    short sum;
+    short firstVal;
+    short checkBufSizeCtr;
+
+    // Variables used for delay shifts
+    short msInSndCardBuf;
+    short filtDelay;
+    int timeForDelayChange;
+    int ECstartup;
+    int checkBuffSize;
+    int delayChange;
+    short lastDelayDiff;
+
+#ifdef AEC_DEBUG
+    FILE *bufFile;
+    FILE *delayFile;
+    FILE *skewFile;
+    FILE *preCompFile;
+    FILE *postCompFile;
+#endif // AEC_DEBUG
+
+    // Structures
+    void *farendBuf;
+    void *resampler;
+
+    int skewFrCtr;
+    int resample; // if the skew is small enough we don't resample
+    int highSkewCtr;
+    float skew;
+
+    int lastError;
+
+    aec_t *aec;
+} aecpc_t;
+
+// Estimates delay to set the position of the farend buffer read pointer
+// (controlled by knownDelay)
+static int EstBufDelay(aecpc_t *aecInst, short msInSndCardBuf);
+
+// Stuffs the farend buffer if the estimated delay is too large
+static int DelayComp(aecpc_t *aecInst);
+
+WebRtc_Word32 WebRtcAec_Create(void **aecInst)
+{
+    aecpc_t *aecpc;
+    if (aecInst == NULL) {
+        return -1;
+    }
+
+    aecpc = malloc(sizeof(aecpc_t));
+    *aecInst = aecpc;
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+    if (WebRtcAec_CreateAec(&aecpc->aec) == -1) {
+        WebRtcAec_Free(aecpc);
+        aecpc = NULL;
+        return -1;
+    }
+
+    if (WebRtcApm_CreateBuffer(&aecpc->farendBuf, bufSizeSamp) == -1) {
+        WebRtcAec_Free(aecpc);
+        aecpc = NULL;
+        return -1;
+    }
+
+    if (WebRtcAec_CreateResampler(&aecpc->resampler) == -1) {
+        WebRtcAec_Free(aecpc);
+        aecpc = NULL;
+        return -1;
+    }
+
+    aecpc->initFlag = 0;
+    aecpc->lastError = 0;
+
+#ifdef AEC_DEBUG
+    aecpc->aec->farFile = fopen("aecFar.pcm","wb");
+    aecpc->aec->nearFile = fopen("aecNear.pcm","wb");
+    aecpc->aec->outFile = fopen("aecOut.pcm","wb");
+    aecpc->aec->outLpFile = fopen("aecOutLp.pcm","wb");
+
+    aecpc->bufFile = fopen("aecBuf.dat", "wb");
+    aecpc->skewFile = fopen("aecSkew.dat", "wb");
+    aecpc->delayFile = fopen("aecDelay.dat", "wb");
+    aecpc->preCompFile = fopen("preComp.pcm", "wb");
+    aecpc->postCompFile = fopen("postComp.pcm", "wb");
+#endif // AEC_DEBUG
+
+    return 0;
+}
+
+WebRtc_Word32 WebRtcAec_Free(void *aecInst)
+{
+    aecpc_t *aecpc = aecInst;
+
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+#ifdef AEC_DEBUG
+    fclose(aecpc->aec->farFile);
+    fclose(aecpc->aec->nearFile);
+    fclose(aecpc->aec->outFile);
+    fclose(aecpc->aec->outLpFile);
+
+    fclose(aecpc->bufFile);
+    fclose(aecpc->skewFile);
+    fclose(aecpc->delayFile);
+    fclose(aecpc->preCompFile);
+    fclose(aecpc->postCompFile);
+#endif // AEC_DEBUG
+
+    WebRtcAec_FreeAec(aecpc->aec);
+    WebRtcApm_FreeBuffer(aecpc->farendBuf);
+    WebRtcAec_FreeResampler(aecpc->resampler);
+    free(aecpc);
+
+    return 0;
+}
+
+WebRtc_Word32 WebRtcAec_Init(void *aecInst, WebRtc_Word32 sampFreq, WebRtc_Word32 scSampFreq)
+{
+    aecpc_t *aecpc = aecInst;
+    AecConfig aecConfig;
+
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+    if (sampFreq != 8000 && sampFreq != 16000  && sampFreq != 32000) {
+        aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+        return -1;
+    }
+    aecpc->sampFreq = sampFreq;
+
+    if (scSampFreq < 1 || scSampFreq > 96000) {
+        aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+        return -1;
+    }
+    aecpc->scSampFreq = scSampFreq;
+
+    // Initialize echo canceller core
+    if (WebRtcAec_InitAec(aecpc->aec, aecpc->sampFreq) == -1) {
+        aecpc->lastError = AEC_UNSPECIFIED_ERROR;
+        return -1;
+    }
+
+    // Initialize farend buffer
+    if (WebRtcApm_InitBuffer(aecpc->farendBuf) == -1) {
+        aecpc->lastError = AEC_UNSPECIFIED_ERROR;
+        return -1;
+    }
+
+    if (WebRtcAec_InitResampler(aecpc->resampler, aecpc->scSampFreq) == -1) {
+        aecpc->lastError = AEC_UNSPECIFIED_ERROR;
+        return -1;
+    }
+
+    aecpc->initFlag = initCheck;  // indicates that initialization has been done
+
+    if (aecpc->sampFreq == 32000) {
+        aecpc->splitSampFreq = 16000;
+    }
+    else {
+        aecpc->splitSampFreq = sampFreq;
+    }
+
+    aecpc->skewFrCtr = 0;
+    aecpc->activity = 0;
+
+    aecpc->delayChange = 1;
+    aecpc->delayCtr = 0;
+
+    aecpc->sum = 0;
+    aecpc->counter = 0;
+    aecpc->checkBuffSize = 1;
+    aecpc->firstVal = 0;
+
+    aecpc->ECstartup = 1;
+    aecpc->bufSizeStart = 0;
+    aecpc->checkBufSizeCtr = 0;
+    aecpc->filtDelay = 0;
+    aecpc->timeForDelayChange =0;
+    aecpc->knownDelay = 0;
+    aecpc->lastDelayDiff = 0;
+
+    aecpc->skew = 0;
+    aecpc->resample = kAecFalse;
+    aecpc->highSkewCtr = 0;
+    aecpc->sampFactor = (aecpc->scSampFreq * 1.0f) / aecpc->splitSampFreq;
+
+    memset(&aecpc->farendOld[0][0], 0, 160);
+
+    // Default settings.
+    aecConfig.nlpMode = kAecNlpModerate;
+    aecConfig.skewMode = kAecFalse;
+    aecConfig.metricsMode = kAecFalse;
+    aecConfig.delay_logging = kAecFalse;
+
+    if (WebRtcAec_set_config(aecpc, aecConfig) == -1) {
+        aecpc->lastError = AEC_UNSPECIFIED_ERROR;
+        return -1;
+    }
+
+    return 0;
+}
+
+// only buffer L band for farend
+WebRtc_Word32 WebRtcAec_BufferFarend(void *aecInst, const WebRtc_Word16 *farend,
+    WebRtc_Word16 nrOfSamples)
+{
+    aecpc_t *aecpc = aecInst;
+    WebRtc_Word32 retVal = 0;
+    short newNrOfSamples;
+    short newFarend[MAX_RESAMP_LEN];
+    float skew;
+
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+    if (farend == NULL) {
+        aecpc->lastError = AEC_NULL_POINTER_ERROR;
+        return -1;
+    }
+
+    if (aecpc->initFlag != initCheck) {
+        aecpc->lastError = AEC_UNINITIALIZED_ERROR;
+        return -1;
+    }
+
+    // number of samples == 160 for SWB input
+    if (nrOfSamples != 80 && nrOfSamples != 160) {
+        aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+        return -1;
+    }
+
+    skew = aecpc->skew;
+
+    // TODO: Is this really a good idea?
+    if (!aecpc->ECstartup) {
+        DelayComp(aecpc);
+    }
+
+    if (aecpc->skewMode == kAecTrue && aecpc->resample == kAecTrue) {
+        // Resample and get a new number of samples
+        newNrOfSamples = WebRtcAec_ResampleLinear(aecpc->resampler,
+                                                  farend,
+                                                  nrOfSamples,
+                                                  skew,
+                                                  newFarend);
+        WebRtcApm_WriteBuffer(aecpc->farendBuf, newFarend, newNrOfSamples);
+
+#ifdef AEC_DEBUG
+        fwrite(farend, 2, nrOfSamples, aecpc->preCompFile);
+        fwrite(newFarend, 2, newNrOfSamples, aecpc->postCompFile);
+#endif
+    }
+    else {
+        WebRtcApm_WriteBuffer(aecpc->farendBuf, farend, nrOfSamples);
+    }
+
+    return retVal;
+}
+
+WebRtc_Word32 WebRtcAec_Process(void *aecInst, const WebRtc_Word16 *nearend,
+    const WebRtc_Word16 *nearendH, WebRtc_Word16 *out, WebRtc_Word16 *outH,
+    WebRtc_Word16 nrOfSamples, WebRtc_Word16 msInSndCardBuf, WebRtc_Word32 skew)
+{
+    aecpc_t *aecpc = aecInst;
+    WebRtc_Word32 retVal = 0;
+    short i;
+    short farend[FRAME_LEN];
+    short nmbrOfFilledBuffers;
+    short nBlocks10ms;
+    short nFrames;
+#ifdef AEC_DEBUG
+    short msInAECBuf;
+#endif
+    // Limit resampling to doubling/halving of signal
+    const float minSkewEst = -0.5f;
+    const float maxSkewEst = 1.0f;
+
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+    if (nearend == NULL) {
+        aecpc->lastError = AEC_NULL_POINTER_ERROR;
+        return -1;
+    }
+
+    if (out == NULL) {
+        aecpc->lastError = AEC_NULL_POINTER_ERROR;
+        return -1;
+    }
+
+    if (aecpc->initFlag != initCheck) {
+        aecpc->lastError = AEC_UNINITIALIZED_ERROR;
+        return -1;
+    }
+
+    // number of samples == 160 for SWB input
+    if (nrOfSamples != 80 && nrOfSamples != 160) {
+        aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+        return -1;
+    }
+
+    // Check for valid pointers based on sampling rate
+    if (aecpc->sampFreq == 32000 && nearendH == NULL) {
+       aecpc->lastError = AEC_NULL_POINTER_ERROR;
+       return -1;
+    }
+
+    if (msInSndCardBuf < 0) {
+        msInSndCardBuf = 0;
+        aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
+        retVal = -1;
+    }
+    else if (msInSndCardBuf > 500) {
+        msInSndCardBuf = 500;
+        aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
+        retVal = -1;
+    }
+    msInSndCardBuf += 10;
+    aecpc->msInSndCardBuf = msInSndCardBuf;
+
+    if (aecpc->skewMode == kAecTrue) {
+        if (aecpc->skewFrCtr < 25) {
+            aecpc->skewFrCtr++;
+        }
+        else {
+            retVal = WebRtcAec_GetSkew(aecpc->resampler, skew, &aecpc->skew);
+            if (retVal == -1) {
+                aecpc->skew = 0;
+                aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
+            }
+
+            aecpc->skew /= aecpc->sampFactor*nrOfSamples;
+
+            if (aecpc->skew < 1.0e-3 && aecpc->skew > -1.0e-3) {
+                aecpc->resample = kAecFalse;
+            }
+            else {
+                aecpc->resample = kAecTrue;
+            }
+
+            if (aecpc->skew < minSkewEst) {
+                aecpc->skew = minSkewEst;
+            }
+            else if (aecpc->skew > maxSkewEst) {
+                aecpc->skew = maxSkewEst;
+            }
+
+#ifdef AEC_DEBUG
+            fwrite(&aecpc->skew, sizeof(aecpc->skew), 1, aecpc->skewFile);
+#endif
+        }
+    }
+
+    nFrames = nrOfSamples / FRAME_LEN;
+    nBlocks10ms = nFrames / aecpc->aec->mult;
+
+    if (aecpc->ECstartup) {
+        if (nearend != out) {
+            // Only needed if they don't already point to the same place.
+            memcpy(out, nearend, sizeof(short) * nrOfSamples);
+        }
+        nmbrOfFilledBuffers = WebRtcApm_get_buffer_size(aecpc->farendBuf) / FRAME_LEN;
+
+        // The AEC is in the start up mode
+        // AEC is disabled until the soundcard buffer and farend buffers are OK
+
+        // Mechanism to ensure that the soundcard buffer is reasonably stable.
+        if (aecpc->checkBuffSize) {
+
+            aecpc->checkBufSizeCtr++;
+            // Before we fill up the far end buffer we require the amount of data on the
+            // sound card to be stable (+/-8 ms) compared to the first value. This
+            // comparison is made during the following 4 consecutive frames. If it seems
+            // to be stable then we start to fill up the far end buffer.
+
+            if (aecpc->counter == 0) {
+                aecpc->firstVal = aecpc->msInSndCardBuf;
+                aecpc->sum = 0;
+            }
+
+            if (abs(aecpc->firstVal - aecpc->msInSndCardBuf) <
+                WEBRTC_SPL_MAX(0.2 * aecpc->msInSndCardBuf, sampMsNb)) {
+                aecpc->sum += aecpc->msInSndCardBuf;
+                aecpc->counter++;
+            }
+            else {
+                aecpc->counter = 0;
+            }
+
+            if (aecpc->counter*nBlocks10ms >= 6) {
+                // The farend buffer size is determined in blocks of 80 samples
+                // Use 75% of the average value of the soundcard buffer
+                aecpc->bufSizeStart = WEBRTC_SPL_MIN((int) (0.75 * (aecpc->sum *
+                    aecpc->aec->mult) / (aecpc->counter * 10)), BUF_SIZE_FRAMES);
+                // buffersize has now been determined
+                aecpc->checkBuffSize = 0;
+            }
+
+            if (aecpc->checkBufSizeCtr * nBlocks10ms > 50) {
+                // for really bad sound cards, don't disable echocanceller for more than 0.5 sec
+                aecpc->bufSizeStart = WEBRTC_SPL_MIN((int) (0.75 * (aecpc->msInSndCardBuf *
+                    aecpc->aec->mult) / 10), BUF_SIZE_FRAMES);
+                aecpc->checkBuffSize = 0;
+            }
+        }
+
+        // if checkBuffSize changed in the if-statement above
+        if (!aecpc->checkBuffSize) {
+            // soundcard buffer is now reasonably stable
+            // When the far end buffer is filled with approximately the same amount of
+            // data as the amount on the sound card we end the start up phase and start
+            // to cancel echoes.
+
+            if (nmbrOfFilledBuffers == aecpc->bufSizeStart) {
+                aecpc->ECstartup = 0;  // Enable the AEC
+            }
+            else if (nmbrOfFilledBuffers > aecpc->bufSizeStart) {
+                WebRtcApm_FlushBuffer(aecpc->farendBuf, WebRtcApm_get_buffer_size(aecpc->farendBuf) -
+                    aecpc->bufSizeStart * FRAME_LEN);
+                aecpc->ECstartup = 0;
+            }
+        }
+
+    }
+    else {
+        // AEC is enabled
+
+        // Note only 1 block supported for nb and 2 blocks for wb
+        for (i = 0; i < nFrames; i++) {
+            nmbrOfFilledBuffers = WebRtcApm_get_buffer_size(aecpc->farendBuf) / FRAME_LEN;
+
+            // Check that there is data in the far end buffer
+            if (nmbrOfFilledBuffers > 0) {
+                // Get the next 80 samples from the farend buffer
+                WebRtcApm_ReadBuffer(aecpc->farendBuf, farend, FRAME_LEN);
+
+                // Always store the last frame for use when we run out of data
+                memcpy(&(aecpc->farendOld[i][0]), farend, FRAME_LEN * sizeof(short));
+            }
+            else {
+                // We have no data so we use the last played frame
+                memcpy(farend, &(aecpc->farendOld[i][0]), FRAME_LEN * sizeof(short));
+            }
+
+            // Call buffer delay estimator when all data is extracted,
+            // i.e. i = 0 for NB and i = 1 for WB or SWB
+            if ((i == 0 && aecpc->splitSampFreq == 8000) ||
+                    (i == 1 && (aecpc->splitSampFreq == 16000))) {
+                EstBufDelay(aecpc, aecpc->msInSndCardBuf);
+            }
+
+            // Call the AEC
+           WebRtcAec_ProcessFrame(aecpc->aec, farend, &nearend[FRAME_LEN * i], &nearendH[FRAME_LEN * i],
+               &out[FRAME_LEN * i], &outH[FRAME_LEN * i], aecpc->knownDelay);
+        }
+    }
+
+#ifdef AEC_DEBUG
+    msInAECBuf = WebRtcApm_get_buffer_size(aecpc->farendBuf) / (sampMsNb*aecpc->aec->mult);
+    fwrite(&msInAECBuf, 2, 1, aecpc->bufFile);
+    fwrite(&(aecpc->knownDelay), sizeof(aecpc->knownDelay), 1, aecpc->delayFile);
+#endif
+
+    return retVal;
+}
+
+WebRtc_Word32 WebRtcAec_set_config(void *aecInst, AecConfig config)
+{
+    aecpc_t *aecpc = aecInst;
+
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+    if (aecpc->initFlag != initCheck) {
+        aecpc->lastError = AEC_UNINITIALIZED_ERROR;
+        return -1;
+    }
+
+    if (config.skewMode != kAecFalse && config.skewMode != kAecTrue) {
+        aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+        return -1;
+    }
+    aecpc->skewMode = config.skewMode;
+
+    if (config.nlpMode != kAecNlpConservative && config.nlpMode !=
+            kAecNlpModerate && config.nlpMode != kAecNlpAggressive) {
+        aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+        return -1;
+    }
+    aecpc->nlpMode = config.nlpMode;
+    aecpc->aec->targetSupp = targetSupp[aecpc->nlpMode];
+    aecpc->aec->minOverDrive = minOverDrive[aecpc->nlpMode];
+
+    if (config.metricsMode != kAecFalse && config.metricsMode != kAecTrue) {
+        aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+        return -1;
+    }
+    aecpc->aec->metricsMode = config.metricsMode;
+    if (aecpc->aec->metricsMode == kAecTrue) {
+        WebRtcAec_InitMetrics(aecpc->aec);
+    }
+
+  if (config.delay_logging != kAecFalse && config.delay_logging != kAecTrue) {
+    aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+    return -1;
+  }
+  aecpc->aec->delay_logging_enabled = config.delay_logging;
+  if (aecpc->aec->delay_logging_enabled == kAecTrue) {
+    memset(aecpc->aec->delay_histogram, 0, sizeof(aecpc->aec->delay_histogram));
+  }
+
+    return 0;
+}
+
+WebRtc_Word32 WebRtcAec_get_config(void *aecInst, AecConfig *config)
+{
+    aecpc_t *aecpc = aecInst;
+
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+    if (config == NULL) {
+        aecpc->lastError = AEC_NULL_POINTER_ERROR;
+        return -1;
+    }
+
+    if (aecpc->initFlag != initCheck) {
+        aecpc->lastError = AEC_UNINITIALIZED_ERROR;
+        return -1;
+    }
+
+    config->nlpMode = aecpc->nlpMode;
+    config->skewMode = aecpc->skewMode;
+    config->metricsMode = aecpc->aec->metricsMode;
+    config->delay_logging = aecpc->aec->delay_logging_enabled;
+
+    return 0;
+}
+
+WebRtc_Word32 WebRtcAec_get_echo_status(void *aecInst, WebRtc_Word16 *status)
+{
+    aecpc_t *aecpc = aecInst;
+
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+    if (status == NULL) {
+        aecpc->lastError = AEC_NULL_POINTER_ERROR;
+        return -1;
+    }
+
+    if (aecpc->initFlag != initCheck) {
+        aecpc->lastError = AEC_UNINITIALIZED_ERROR;
+        return -1;
+    }
+
+    *status = aecpc->aec->echoState;
+
+    return 0;
+}
+
+WebRtc_Word32 WebRtcAec_GetMetrics(void *aecInst, AecMetrics *metrics)
+{
+    const float upweight = 0.7f;
+    float dtmp;
+    short stmp;
+    aecpc_t *aecpc = aecInst;
+
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+    if (metrics == NULL) {
+        aecpc->lastError = AEC_NULL_POINTER_ERROR;
+        return -1;
+    }
+
+    if (aecpc->initFlag != initCheck) {
+        aecpc->lastError = AEC_UNINITIALIZED_ERROR;
+        return -1;
+    }
+
+    // ERL
+    metrics->erl.instant = (short) aecpc->aec->erl.instant;
+
+    if ((aecpc->aec->erl.himean > offsetLevel) && (aecpc->aec->erl.average > offsetLevel)) {
+    // Use a mix between regular average and upper part average
+        dtmp = upweight * aecpc->aec->erl.himean + (1 - upweight) * aecpc->aec->erl.average;
+        metrics->erl.average = (short) dtmp;
+    }
+    else {
+        metrics->erl.average = offsetLevel;
+    }
+
+    metrics->erl.max = (short) aecpc->aec->erl.max;
+
+    if (aecpc->aec->erl.min < (offsetLevel * (-1))) {
+        metrics->erl.min = (short) aecpc->aec->erl.min;
+    }
+    else {
+        metrics->erl.min = offsetLevel;
+    }
+
+    // ERLE
+    metrics->erle.instant = (short) aecpc->aec->erle.instant;
+
+    if ((aecpc->aec->erle.himean > offsetLevel) && (aecpc->aec->erle.average > offsetLevel)) {
+        // Use a mix between regular average and upper part average
+        dtmp =  upweight * aecpc->aec->erle.himean + (1 - upweight) * aecpc->aec->erle.average;
+        metrics->erle.average = (short) dtmp;
+    }
+    else {
+        metrics->erle.average = offsetLevel;
+    }
+
+    metrics->erle.max = (short) aecpc->aec->erle.max;
+
+    if (aecpc->aec->erle.min < (offsetLevel * (-1))) {
+        metrics->erle.min = (short) aecpc->aec->erle.min;
+    } else {
+        metrics->erle.min = offsetLevel;
+    }
+
+    // RERL
+    if ((metrics->erl.average > offsetLevel) && (metrics->erle.average > offsetLevel)) {
+        stmp = metrics->erl.average + metrics->erle.average;
+    }
+    else {
+        stmp = offsetLevel;
+    }
+    metrics->rerl.average = stmp;
+
+    // No other statistics needed, but returned for completeness
+    metrics->rerl.instant = stmp;
+    metrics->rerl.max = stmp;
+    metrics->rerl.min = stmp;
+
+    // A_NLP
+    metrics->aNlp.instant = (short) aecpc->aec->aNlp.instant;
+
+    if ((aecpc->aec->aNlp.himean > offsetLevel) && (aecpc->aec->aNlp.average > offsetLevel)) {
+        // Use a mix between regular average and upper part average
+        dtmp =  upweight * aecpc->aec->aNlp.himean + (1 - upweight) * aecpc->aec->aNlp.average;
+        metrics->aNlp.average = (short) dtmp;
+    }
+    else {
+        metrics->aNlp.average = offsetLevel;
+    }
+
+    metrics->aNlp.max = (short) aecpc->aec->aNlp.max;
+
+    if (aecpc->aec->aNlp.min < (offsetLevel * (-1))) {
+        metrics->aNlp.min = (short) aecpc->aec->aNlp.min;
+    }
+    else {
+        metrics->aNlp.min = offsetLevel;
+    }
+
+    return 0;
+}
+
+int WebRtcAec_GetDelayMetrics(void* handle, int* median, int* std) {
+  aecpc_t* self = handle;
+  int i = 0;
+  int delay_values = 0;
+  int num_delay_values = 0;
+  int my_median = 0;
+  const int kMsPerBlock = (PART_LEN * 1000) / self->splitSampFreq;
+  float l1_norm = 0;
+
+  if (self == NULL) {
+    return -1;
+  }
+  if (median == NULL) {
+    self->lastError = AEC_NULL_POINTER_ERROR;
+    return -1;
+  }
+  if (std == NULL) {
+    self->lastError = AEC_NULL_POINTER_ERROR;
+    return -1;
+  }
+  if (self->initFlag != initCheck) {
+    self->lastError = AEC_UNINITIALIZED_ERROR;
+    return -1;
+  }
+  if (self->aec->delay_logging_enabled == 0) {
+    // Logging disabled
+    self->lastError = AEC_UNSUPPORTED_FUNCTION_ERROR;
+    return -1;
+  }
+
+  // Get number of delay values since last update
+  for (i = 0; i < kMaxDelay; i++) {
+    num_delay_values += self->aec->delay_histogram[i];
+  }
+  if (num_delay_values == 0) {
+    // We have no new delay value data
+    *median = -1;
+    *std = -1;
+    return 0;
+  }
+
+  delay_values = num_delay_values >> 1; // Start value for median count down
+  // Get median of delay values since last update
+  for (i = 0; i < kMaxDelay; i++) {
+    delay_values -= self->aec->delay_histogram[i];
+    if (delay_values < 0) {
+      my_median = i;
+      break;
+    }
+  }
+  *median = my_median * kMsPerBlock;
+
+  // Calculate the L1 norm, with median value as central moment
+  for (i = 0; i < kMaxDelay; i++) {
+    l1_norm += (float) (fabs(i - my_median) * self->aec->delay_histogram[i]);
+  }
+  *std = (int) (l1_norm / (float) num_delay_values + 0.5f) * kMsPerBlock;
+
+  // Reset histogram
+  memset(self->aec->delay_histogram, 0, sizeof(self->aec->delay_histogram));
+
+  return 0;
+}
+
+WebRtc_Word32 WebRtcAec_get_version(WebRtc_Word8 *versionStr, WebRtc_Word16 len)
+{
+    const char version[] = "AEC 2.5.0";
+    const short versionLen = (short)strlen(version) + 1; // +1 for null-termination
+
+    if (versionStr == NULL) {
+        return -1;
+    }
+
+    if (versionLen > len) {
+        return -1;
+    }
+
+    strncpy(versionStr, version, versionLen);
+    return 0;
+}
+
+WebRtc_Word32 WebRtcAec_get_error_code(void *aecInst)
+{
+    aecpc_t *aecpc = aecInst;
+
+    if (aecpc == NULL) {
+        return -1;
+    }
+
+    return aecpc->lastError;
+}
+
+static int EstBufDelay(aecpc_t *aecpc, short msInSndCardBuf)
+{
+    short delayNew, nSampFar, nSampSndCard;
+    short diff;
+
+    nSampFar = WebRtcApm_get_buffer_size(aecpc->farendBuf);
+    nSampSndCard = msInSndCardBuf * sampMsNb * aecpc->aec->mult;
+
+    delayNew = nSampSndCard - nSampFar;
+
+    // Account for resampling frame delay
+    if (aecpc->skewMode == kAecTrue && aecpc->resample == kAecTrue) {
+        delayNew -= kResamplingDelay;
+    }
+
+    if (delayNew < FRAME_LEN) {
+        WebRtcApm_FlushBuffer(aecpc->farendBuf, FRAME_LEN);
+        delayNew += FRAME_LEN;
+    }
+
+    aecpc->filtDelay = WEBRTC_SPL_MAX(0, (short)(0.8*aecpc->filtDelay + 0.2*delayNew));
+
+    diff = aecpc->filtDelay - aecpc->knownDelay;
+    if (diff > 224) {
+        if (aecpc->lastDelayDiff < 96) {
+            aecpc->timeForDelayChange = 0;
+        }
+        else {
+            aecpc->timeForDelayChange++;
+        }
+    }
+    else if (diff < 96 && aecpc->knownDelay > 0) {
+        if (aecpc->lastDelayDiff > 224) {
+            aecpc->timeForDelayChange = 0;
+        }
+        else {
+            aecpc->timeForDelayChange++;
+        }
+    }
+    else {
+        aecpc->timeForDelayChange = 0;
+    }
+    aecpc->lastDelayDiff = diff;
+
+    if (aecpc->timeForDelayChange > 25) {
+        aecpc->knownDelay = WEBRTC_SPL_MAX((int)aecpc->filtDelay - 160, 0);
+    }
+    return 0;
+}
+
+static int DelayComp(aecpc_t *aecpc)
+{
+    int nSampFar, nSampSndCard, delayNew, nSampAdd;
+    const int maxStuffSamp = 10 * FRAME_LEN;
+
+    nSampFar = WebRtcApm_get_buffer_size(aecpc->farendBuf);
+    nSampSndCard = aecpc->msInSndCardBuf * sampMsNb * aecpc->aec->mult;
+    delayNew = nSampSndCard - nSampFar;
+
+    // Account for resampling frame delay
+    if (aecpc->skewMode == kAecTrue && aecpc->resample == kAecTrue) {
+        delayNew -= kResamplingDelay;
+    }
+
+    if (delayNew > FAR_BUF_LEN - FRAME_LEN*aecpc->aec->mult) {
+        // The difference of the buffersizes is larger than the maximum
+        // allowed known delay. Compensate by stuffing the buffer.
+        nSampAdd = (int)(WEBRTC_SPL_MAX((int)(0.5 * nSampSndCard - nSampFar),
+                    FRAME_LEN));
+        nSampAdd = WEBRTC_SPL_MIN(nSampAdd, maxStuffSamp);
+
+        WebRtcApm_StuffBuffer(aecpc->farendBuf, nSampAdd);
+        aecpc->delayChange = 1; // the delay needs to be updated
+    }
+
+    return 0;
+}
--- a/webrtc/modules/audio_processing/aec/interface/echo_cancellation.h
+++ b/webrtc/modules/audio_processing/aec/interface/echo_cancellation.h
@@ -0,0 +1,278 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_
+
+#include "typedefs.h"
+
+// Errors
+#define AEC_UNSPECIFIED_ERROR           12000
+#define AEC_UNSUPPORTED_FUNCTION_ERROR  12001
+#define AEC_UNINITIALIZED_ERROR         12002
+#define AEC_NULL_POINTER_ERROR          12003
+#define AEC_BAD_PARAMETER_ERROR         12004
+
+// Warnings
+#define AEC_BAD_PARAMETER_WARNING       12050
+
+enum {
+    kAecNlpConservative = 0,
+    kAecNlpModerate,
+    kAecNlpAggressive
+};
+
+enum {
+    kAecFalse = 0,
+    kAecTrue
+};
+
+typedef struct {
+    WebRtc_Word16 nlpMode;        // default kAecNlpModerate
+    WebRtc_Word16 skewMode;       // default kAecFalse
+    WebRtc_Word16 metricsMode;    // default kAecFalse
+    int delay_logging;            // default kAecFalse
+    //float realSkew;
+} AecConfig;
+
+typedef struct {
+    WebRtc_Word16 instant;
+    WebRtc_Word16 average;
+    WebRtc_Word16 max;
+    WebRtc_Word16 min;
+} AecLevel;
+
+typedef struct {
+    AecLevel rerl;
+    AecLevel erl;
+    AecLevel erle;
+    AecLevel aNlp;
+} AecMetrics;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Allocates the memory needed by the AEC. The memory needs to be initialized
+ * separately using the WebRtcAec_Init() function.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void **aecInst               Pointer to the AEC instance to be created
+ *                              and initialized
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * WebRtc_Word32 return          0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_Create(void **aecInst);
+
+/*
+ * This function releases the memory allocated by WebRtcAec_Create().
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void         *aecInst        Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * WebRtc_Word32  return         0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_Free(void *aecInst);
+
+/*
+ * Initializes an AEC instance.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void           *aecInst      Pointer to the AEC instance
+ * WebRtc_Word32  sampFreq      Sampling frequency of data
+ * WebRtc_Word32  scSampFreq    Soundcard sampling frequency
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * WebRtc_Word32 return          0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_Init(void *aecInst,
+                             WebRtc_Word32 sampFreq,
+                             WebRtc_Word32 scSampFreq);
+
+/*
+ * Inserts an 80 or 160 sample block of data into the farend buffer.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void           *aecInst      Pointer to the AEC instance
+ * WebRtc_Word16  *farend       In buffer containing one frame of
+ *                              farend signal for L band
+ * WebRtc_Word16  nrOfSamples   Number of samples in farend buffer
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * WebRtc_Word32  return         0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_BufferFarend(void *aecInst,
+                                     const WebRtc_Word16 *farend,
+                                     WebRtc_Word16 nrOfSamples);
+
+/*
+ * Runs the echo canceller on an 80 or 160 sample blocks of data.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void          *aecInst       Pointer to the AEC instance
+ * WebRtc_Word16 *nearend       In buffer containing one frame of
+ *                              nearend+echo signal for L band
+ * WebRtc_Word16 *nearendH      In buffer containing one frame of
+ *                              nearend+echo signal for H band
+ * WebRtc_Word16 nrOfSamples    Number of samples in nearend buffer
+ * WebRtc_Word16 msInSndCardBuf Delay estimate for sound card and
+ *                              system buffers
+ * WebRtc_Word16 skew           Difference between number of samples played
+ *                              and recorded at the soundcard (for clock skew
+ *                              compensation)
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * WebRtc_Word16  *out          Out buffer, one frame of processed nearend
+ *                              for L band
+ * WebRtc_Word16  *outH         Out buffer, one frame of processed nearend
+ *                              for H band
+ * WebRtc_Word32  return         0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_Process(void *aecInst,
+                                const WebRtc_Word16 *nearend,
+                                const WebRtc_Word16 *nearendH,
+                                WebRtc_Word16 *out,
+                                WebRtc_Word16 *outH,
+                                WebRtc_Word16 nrOfSamples,
+                                WebRtc_Word16 msInSndCardBuf,
+                                WebRtc_Word32 skew);
+
+/*
+ * This function enables the user to set certain parameters on-the-fly.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void           *aecInst      Pointer to the AEC instance
+ * AecConfig      config        Config instance that contains all
+ *                              properties to be set
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * WebRtc_Word32  return         0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_set_config(void *aecInst, AecConfig config);
+
+/*
+ * Gets the on-the-fly paramters.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void           *aecInst      Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * AecConfig      *config       Pointer to the config instance that
+ *                              all properties will be written to
+ * WebRtc_Word32  return         0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_get_config(void *aecInst, AecConfig *config);
+
+/*
+ * Gets the current echo status of the nearend signal.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void           *aecInst      Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * WebRtc_Word16  *status       0: Almost certainly nearend single-talk
+ *                              1: Might not be neared single-talk
+ * WebRtc_Word32  return         0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_get_echo_status(void *aecInst, WebRtc_Word16 *status);
+
+/*
+ * Gets the current echo metrics for the session.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void           *aecInst      Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * AecMetrics     *metrics      Struct which will be filled out with the
+ *                              current echo metrics.
+ * WebRtc_Word32  return         0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_GetMetrics(void *aecInst, AecMetrics *metrics);
+
+/*
+ * Gets the current delay metrics for the session.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*      handle            Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int*       median            Delay median value.
+ * int*       std               Delay standard deviation.
+ *
+ * int        return             0: OK
+ *                              -1: error
+ */
+int WebRtcAec_GetDelayMetrics(void* handle, int* median, int* std);
+
+/*
+ * Gets the last error code.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void           *aecInst      Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * WebRtc_Word32  return        11000-11100: error code
+ */
+WebRtc_Word32 WebRtcAec_get_error_code(void *aecInst);
+
+/*
+ * Gets a version string.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * char           *versionStr   Pointer to a string array
+ * WebRtc_Word16  len           The maximum length of the string
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * WebRtc_Word8   *versionStr   Pointer to a string array
+ * WebRtc_Word32  return         0: OK
+ *                              -1: error
+ */
+WebRtc_Word32 WebRtcAec_get_version(WebRtc_Word8 *versionStr, WebRtc_Word16 len);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  /* WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_INTERFACE_ECHO_CANCELLATION_H_ */
--- a/webrtc/modules/audio_processing/aec/resampler.c
+++ b/webrtc/modules/audio_processing/aec/resampler.c
@@ -0,0 +1,233 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for clock
+ * skew by resampling the farend signal.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "resampler.h"
+#include "aec_core.h"
+
+enum { kFrameBufferSize = FRAME_LEN * 4 };
+enum { kEstimateLengthFrames = 400 };
+
+typedef struct {
+    short buffer[kFrameBufferSize];
+    float position;
+
+    int deviceSampleRateHz;
+    int skewData[kEstimateLengthFrames];
+    int skewDataIndex;
+    float skewEstimate;
+} resampler_t;
+
+static int EstimateSkew(const int* rawSkew,
+                        int size,
+                        int absLimit,
+                        float *skewEst);
+
+int WebRtcAec_CreateResampler(void **resampInst)
+{
+    resampler_t *obj = malloc(sizeof(resampler_t));
+    *resampInst = obj;
+    if (obj == NULL) {
+        return -1;
+    }
+
+    return 0;
+}
+
+int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz)
+{
+    resampler_t *obj = (resampler_t*) resampInst;
+    memset(obj->buffer, 0, sizeof(obj->buffer));
+    obj->position = 0.0;
+
+    obj->deviceSampleRateHz = deviceSampleRateHz;
+    memset(obj->skewData, 0, sizeof(obj->skewData));
+    obj->skewDataIndex = 0;
+    obj->skewEstimate = 0.0;
+
+    return 0;
+}
+
+int WebRtcAec_FreeResampler(void *resampInst)
+{
+    resampler_t *obj = (resampler_t*) resampInst;
+    free(obj);
+
+    return 0;
+}
+
+int WebRtcAec_ResampleLinear(void *resampInst,
+                             const short *inspeech,
+                             int size,
+                             float skew,
+                             short *outspeech)
+{
+    resampler_t *obj = (resampler_t*) resampInst;
+
+    short *y;
+    float be, tnew, interp;
+    int tn, outsize, mm;
+
+    if (size < 0 || size > 2 * FRAME_LEN) {
+        return -1;
+    }
+
+    // Add new frame data in lookahead
+    memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
+           inspeech,
+           size * sizeof(short));
+
+    // Sample rate ratio
+    be = 1 + skew;
+
+    // Loop over input frame
+    mm = 0;
+    y = &obj->buffer[FRAME_LEN]; // Point at current frame
+
+    tnew = be * mm + obj->position;
+    tn = (int) tnew;
+
+    while (tn < size) {
+
+        // Interpolation
+        interp = y[tn] + (tnew - tn) * (y[tn+1] - y[tn]);
+
+        if (interp > 32767) {
+            interp = 32767;
+        }
+        else if (interp < -32768) {
+            interp = -32768;
+        }
+
+        outspeech[mm] = (short) interp;
+        mm++;
+
+        tnew = be * mm + obj->position;
+        tn = (int) tnew;
+    }
+
+    outsize = mm;
+    obj->position += outsize * be - size;
+
+    // Shift buffer
+    memmove(obj->buffer,
+            &obj->buffer[size],
+            (kFrameBufferSize - size) * sizeof(short));
+
+    return outsize;
+}
+
+int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst)
+{
+    resampler_t *obj = (resampler_t*)resampInst;
+    int err = 0;
+
+    if (obj->skewDataIndex < kEstimateLengthFrames) {
+        obj->skewData[obj->skewDataIndex] = rawSkew;
+        obj->skewDataIndex++;
+    }
+    else if (obj->skewDataIndex == kEstimateLengthFrames) {
+        err = EstimateSkew(obj->skewData,
+                           kEstimateLengthFrames,
+                           obj->deviceSampleRateHz,
+                           skewEst);
+        obj->skewEstimate = *skewEst;
+        obj->skewDataIndex++;
+    }
+    else {
+        *skewEst = obj->skewEstimate;
+    }
+
+    return err;
+}
+
+int EstimateSkew(const int* rawSkew,
+                 int size,
+                 int deviceSampleRateHz,
+                 float *skewEst)
+{
+    const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
+    const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
+    int i = 0;
+    int n = 0;
+    float rawAvg = 0;
+    float err = 0;
+    float rawAbsDev = 0;
+    int upperLimit = 0;
+    int lowerLimit = 0;
+    float cumSum = 0;
+    float x = 0;
+    float x2 = 0;
+    float y = 0;
+    float xy = 0;
+    float xAvg = 0;
+    float denom = 0;
+    float skew = 0;
+
+    *skewEst = 0; // Set in case of error below.
+    for (i = 0; i < size; i++) {
+      if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
+        n++;
+        rawAvg += rawSkew[i];
+      }
+    }
+
+    if (n == 0) {
+      return -1;
+    }
+    assert(n > 0);
+    rawAvg /= n;
+
+    for (i = 0; i < size; i++) {
+      if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
+        err = rawSkew[i] - rawAvg;
+        rawAbsDev += err >= 0 ? err : -err;
+      }
+    }
+    assert(n > 0);
+    rawAbsDev /= n;
+    upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1); // +1 for ceiling.
+    lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1); // -1 for floor.
+
+    n = 0;
+    for (i = 0; i < size; i++) {
+        if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
+            (rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
+            n++;
+            cumSum += rawSkew[i];
+            x += n;
+            x2 += n*n;
+            y += cumSum;
+            xy += n * cumSum;
+        }
+    }
+
+    if (n == 0) {
+      return -1;
+    }
+    assert(n > 0);
+    xAvg = x / n;
+    denom = x2 - xAvg*x;
+
+    if (denom != 0) {
+        skew = (xy - xAvg*y) / denom;
+    }
+
+    *skewEst = skew;
+    return 0;
+}
--- a/webrtc/modules/audio_processing/aec/resampler.h
+++ b/webrtc/modules/audio_processing/aec/resampler.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_
+
+enum { kResamplingDelay = 1 };
+
+// Unless otherwise specified, functions return 0 on success and -1 on error
+int WebRtcAec_CreateResampler(void **resampInst);
+int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz);
+int WebRtcAec_FreeResampler(void *resampInst);
+
+// Estimates skew from raw measurement.
+int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst);
+
+// Resamples input using linear interpolation.
+// Returns size of resampled array.
+int WebRtcAec_ResampleLinear(void *resampInst,
+                             const short *inspeech,
+                             int size,
+                             float skew,
+                             short *outspeech);
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_RESAMPLER_H_