Update to current webrtc library

This is from the upstream library commit id
3326535126e435f1ba647885ce43a8f0f3d317eb, corresponding to Chromium
88.0.4290.1.
This commit is contained in:
Arun Raghavan
2020-10-12 18:08:02 -04:00
parent b1b02581d3
commit bcec8b0b21
859 changed files with 76187 additions and 49580 deletions

View File

@ -0,0 +1,58 @@
# Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the ../../../LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
import("../../../webrtc.gni")
rtc_library("fft_size_128") {
sources = [
"fft_size_128/ooura_fft.cc",
"fft_size_128/ooura_fft.h",
"fft_size_128/ooura_fft_tables_common.h",
]
deps = [
"../../../rtc_base/system:arch",
"../../../system_wrappers",
]
cflags = []
if (current_cpu == "x86" || current_cpu == "x64") {
sources += [
"fft_size_128/ooura_fft_sse2.cc",
"fft_size_128/ooura_fft_tables_neon_sse2.h",
]
if (is_posix || is_fuchsia) {
cflags += [ "-msse2" ]
}
}
if (rtc_build_with_neon) {
sources += [
"fft_size_128/ooura_fft_neon.cc",
"fft_size_128/ooura_fft_tables_neon_sse2.h",
]
deps += [ "../../../common_audio" ]
if (current_cpu != "arm64") {
# Enable compilation for the NEON instruction set.
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
cflags += [ "-mfpu=neon" ]
}
}
if (current_cpu == "mipsel" && mips_float_abi == "hard") {
sources += [ "fft_size_128/ooura_fft_mips.cc" ]
}
}
rtc_library("fft_size_256") {
sources = [
"fft_size_256/fft4g.cc",
"fft_size_256/fft4g.h",
]
}

View File

@ -0,0 +1,8 @@
/*
* http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
* Copyright Takuya OOURA, 1996-2001
*
* You may use, copy, modify and distribute this code for any purpose (include
* commercial use) and without fee. Please refer to this package when you modify
* this code.
*/

View File

@ -0,0 +1,548 @@
/*
* http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
* Copyright Takuya OOURA, 1996-2001
*
* You may use, copy, modify and distribute this code for any purpose (include
* commercial use) and without fee. Please refer to this package when you modify
* this code.
*
* Changes by the WebRTC authors:
* - Trivial type modifications.
* - Minimal code subset to do rdft of length 128.
* - Optimizations because of known length.
* - Removed the global variables by moving the code in to a class in order
* to make it thread safe.
*
* All changes are covered by the WebRTC license and IP grant:
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
#include "rtc_base/system/arch.h"
#include "system_wrappers/include/cpu_features_wrapper.h"
namespace webrtc {
namespace {
#if !(defined(MIPS_FPU_LE) || defined(WEBRTC_HAS_NEON))
static void cft1st_128_C(float* a) {
const int n = 128;
int j, k1, k2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
// The processing of the first set of elements was simplified in C to avoid
// some operations (multiplication by zero or one, addition of two elements
// multiplied by the same weight, ...).
x0r = a[0] + a[2];
x0i = a[1] + a[3];
x1r = a[0] - a[2];
x1i = a[1] - a[3];
x2r = a[4] + a[6];
x2i = a[5] + a[7];
x3r = a[4] - a[6];
x3i = a[5] - a[7];
a[0] = x0r + x2r;
a[1] = x0i + x2i;
a[4] = x0r - x2r;
a[5] = x0i - x2i;
a[2] = x1r - x3i;
a[3] = x1i + x3r;
a[6] = x1r + x3i;
a[7] = x1i - x3r;
wk1r = rdft_w[2];
x0r = a[8] + a[10];
x0i = a[9] + a[11];
x1r = a[8] - a[10];
x1i = a[9] - a[11];
x2r = a[12] + a[14];
x2i = a[13] + a[15];
x3r = a[12] - a[14];
x3i = a[13] - a[15];
a[8] = x0r + x2r;
a[9] = x0i + x2i;
a[12] = x2i - x0i;
a[13] = x0r - x2r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[10] = wk1r * (x0r - x0i);
a[11] = wk1r * (x0r + x0i);
x0r = x3i + x1r;
x0i = x3r - x1i;
a[14] = wk1r * (x0i - x0r);
a[15] = wk1r * (x0i + x0r);
k1 = 0;
for (j = 16; j < n; j += 16) {
k1 += 2;
k2 = 2 * k1;
wk2r = rdft_w[k1 + 0];
wk2i = rdft_w[k1 + 1];
wk1r = rdft_w[k2 + 0];
wk1i = rdft_w[k2 + 1];
wk3r = rdft_wk3ri_first[k1 + 0];
wk3i = rdft_wk3ri_first[k1 + 1];
x0r = a[j + 0] + a[j + 2];
x0i = a[j + 1] + a[j + 3];
x1r = a[j + 0] - a[j + 2];
x1i = a[j + 1] - a[j + 3];
x2r = a[j + 4] + a[j + 6];
x2i = a[j + 5] + a[j + 7];
x3r = a[j + 4] - a[j + 6];
x3i = a[j + 5] - a[j + 7];
a[j + 0] = x0r + x2r;
a[j + 1] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j + 4] = wk2r * x0r - wk2i * x0i;
a[j + 5] = wk2r * x0i + wk2i * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j + 2] = wk1r * x0r - wk1i * x0i;
a[j + 3] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j + 6] = wk3r * x0r - wk3i * x0i;
a[j + 7] = wk3r * x0i + wk3i * x0r;
wk1r = rdft_w[k2 + 2];
wk1i = rdft_w[k2 + 3];
wk3r = rdft_wk3ri_second[k1 + 0];
wk3i = rdft_wk3ri_second[k1 + 1];
x0r = a[j + 8] + a[j + 10];
x0i = a[j + 9] + a[j + 11];
x1r = a[j + 8] - a[j + 10];
x1i = a[j + 9] - a[j + 11];
x2r = a[j + 12] + a[j + 14];
x2i = a[j + 13] + a[j + 15];
x3r = a[j + 12] - a[j + 14];
x3i = a[j + 13] - a[j + 15];
a[j + 8] = x0r + x2r;
a[j + 9] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j + 12] = -wk2i * x0r - wk2r * x0i;
a[j + 13] = -wk2i * x0i + wk2r * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j + 10] = wk1r * x0r - wk1i * x0i;
a[j + 11] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j + 14] = wk3r * x0r - wk3i * x0i;
a[j + 15] = wk3r * x0i + wk3i * x0r;
}
}
static void cftmdl_128_C(float* a) {
const int l = 8;
const int n = 128;
const int m = 32;
int j0, j1, j2, j3, k, k1, k2, m2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
for (j0 = 0; j0 < l; j0 += 2) {
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
x0i = a[j0 + 1] + a[j1 + 1];
x1r = a[j0 + 0] - a[j1 + 0];
x1i = a[j0 + 1] - a[j1 + 1];
x2r = a[j2 + 0] + a[j3 + 0];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2 + 0] - a[j3 + 0];
x3i = a[j2 + 1] - a[j3 + 1];
a[j0 + 0] = x0r + x2r;
a[j0 + 1] = x0i + x2i;
a[j2 + 0] = x0r - x2r;
a[j2 + 1] = x0i - x2i;
a[j1 + 0] = x1r - x3i;
a[j1 + 1] = x1i + x3r;
a[j3 + 0] = x1r + x3i;
a[j3 + 1] = x1i - x3r;
}
wk1r = rdft_w[2];
for (j0 = m; j0 < l + m; j0 += 2) {
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
x0i = a[j0 + 1] + a[j1 + 1];
x1r = a[j0 + 0] - a[j1 + 0];
x1i = a[j0 + 1] - a[j1 + 1];
x2r = a[j2 + 0] + a[j3 + 0];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2 + 0] - a[j3 + 0];
x3i = a[j2 + 1] - a[j3 + 1];
a[j0 + 0] = x0r + x2r;
a[j0 + 1] = x0i + x2i;
a[j2 + 0] = x2i - x0i;
a[j2 + 1] = x0r - x2r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j1 + 0] = wk1r * (x0r - x0i);
a[j1 + 1] = wk1r * (x0r + x0i);
x0r = x3i + x1r;
x0i = x3r - x1i;
a[j3 + 0] = wk1r * (x0i - x0r);
a[j3 + 1] = wk1r * (x0i + x0r);
}
k1 = 0;
m2 = 2 * m;
for (k = m2; k < n; k += m2) {
k1 += 2;
k2 = 2 * k1;
wk2r = rdft_w[k1 + 0];
wk2i = rdft_w[k1 + 1];
wk1r = rdft_w[k2 + 0];
wk1i = rdft_w[k2 + 1];
wk3r = rdft_wk3ri_first[k1 + 0];
wk3i = rdft_wk3ri_first[k1 + 1];
for (j0 = k; j0 < l + k; j0 += 2) {
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
x0i = a[j0 + 1] + a[j1 + 1];
x1r = a[j0 + 0] - a[j1 + 0];
x1i = a[j0 + 1] - a[j1 + 1];
x2r = a[j2 + 0] + a[j3 + 0];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2 + 0] - a[j3 + 0];
x3i = a[j2 + 1] - a[j3 + 1];
a[j0 + 0] = x0r + x2r;
a[j0 + 1] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j2 + 0] = wk2r * x0r - wk2i * x0i;
a[j2 + 1] = wk2r * x0i + wk2i * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j1 + 0] = wk1r * x0r - wk1i * x0i;
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j3 + 0] = wk3r * x0r - wk3i * x0i;
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
}
wk1r = rdft_w[k2 + 2];
wk1i = rdft_w[k2 + 3];
wk3r = rdft_wk3ri_second[k1 + 0];
wk3i = rdft_wk3ri_second[k1 + 1];
for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
x0i = a[j0 + 1] + a[j1 + 1];
x1r = a[j0 + 0] - a[j1 + 0];
x1i = a[j0 + 1] - a[j1 + 1];
x2r = a[j2 + 0] + a[j3 + 0];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2 + 0] - a[j3 + 0];
x3i = a[j2 + 1] - a[j3 + 1];
a[j0 + 0] = x0r + x2r;
a[j0 + 1] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j1 + 0] = wk1r * x0r - wk1i * x0i;
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j3 + 0] = wk3r * x0r - wk3i * x0i;
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
}
}
}
static void rftfsub_128_C(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
xi = a[j2 + 1] + a[k2 + 1];
yr = wkr * xr - wki * xi;
yi = wkr * xi + wki * xr;
a[j2 + 0] -= yr;
a[j2 + 1] -= yi;
a[k2 + 0] += yr;
a[k2 + 1] -= yi;
}
}
static void rftbsub_128_C(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
a[1] = -a[1];
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
xi = a[j2 + 1] + a[k2 + 1];
yr = wkr * xr + wki * xi;
yi = wkr * xi - wki * xr;
a[j2 + 0] = a[j2 + 0] - yr;
a[j2 + 1] = yi - a[j2 + 1];
a[k2 + 0] = yr + a[k2 + 0];
a[k2 + 1] = yi - a[k2 + 1];
}
a[65] = -a[65];
}
#endif
} // namespace
OouraFft::OouraFft(bool sse2_available) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
use_sse2_ = sse2_available;
#else
use_sse2_ = false;
#endif
}
OouraFft::OouraFft() {
#if defined(WEBRTC_ARCH_X86_FAMILY)
use_sse2_ = (GetCPUInfo(kSSE2) != 0);
#else
use_sse2_ = false;
#endif
}
OouraFft::~OouraFft() = default;
void OouraFft::Fft(float* a) const {
float xi;
bitrv2_128(a);
cftfsub_128(a);
rftfsub_128(a);
xi = a[0] - a[1];
a[0] += a[1];
a[1] = xi;
}
void OouraFft::InverseFft(float* a) const {
a[1] = 0.5f * (a[0] - a[1]);
a[0] -= a[1];
rftbsub_128(a);
bitrv2_128(a);
cftbsub_128(a);
}
void OouraFft::cft1st_128(float* a) const {
#if defined(MIPS_FPU_LE)
cft1st_128_mips(a);
#elif defined(WEBRTC_HAS_NEON)
cft1st_128_neon(a);
#elif defined(WEBRTC_ARCH_X86_FAMILY)
if (use_sse2_) {
cft1st_128_SSE2(a);
} else {
cft1st_128_C(a);
}
#else
cft1st_128_C(a);
#endif
}
void OouraFft::cftmdl_128(float* a) const {
#if defined(MIPS_FPU_LE)
cftmdl_128_mips(a);
#elif defined(WEBRTC_HAS_NEON)
cftmdl_128_neon(a);
#elif defined(WEBRTC_ARCH_X86_FAMILY)
if (use_sse2_) {
cftmdl_128_SSE2(a);
} else {
cftmdl_128_C(a);
}
#else
cftmdl_128_C(a);
#endif
}
void OouraFft::rftfsub_128(float* a) const {
#if defined(MIPS_FPU_LE)
rftfsub_128_mips(a);
#elif defined(WEBRTC_HAS_NEON)
rftfsub_128_neon(a);
#elif defined(WEBRTC_ARCH_X86_FAMILY)
if (use_sse2_) {
rftfsub_128_SSE2(a);
} else {
rftfsub_128_C(a);
}
#else
rftfsub_128_C(a);
#endif
}
void OouraFft::rftbsub_128(float* a) const {
#if defined(MIPS_FPU_LE)
rftbsub_128_mips(a);
#elif defined(WEBRTC_HAS_NEON)
rftbsub_128_neon(a);
#elif defined(WEBRTC_ARCH_X86_FAMILY)
if (use_sse2_) {
rftbsub_128_SSE2(a);
} else {
rftbsub_128_C(a);
}
#else
rftbsub_128_C(a);
#endif
}
void OouraFft::cftbsub_128(float* a) const {
int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
cft1st_128(a);
cftmdl_128(a);
l = 32;
for (j = 0; j < l; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = -a[j + 1] - a[j1 + 1];
x1r = a[j] - a[j1];
x1i = -a[j + 1] + a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i - x2i;
a[j2] = x0r - x2r;
a[j2 + 1] = x0i + x2i;
a[j1] = x1r - x3i;
a[j1 + 1] = x1i - x3r;
a[j3] = x1r + x3i;
a[j3 + 1] = x1i + x3r;
}
}
void OouraFft::cftfsub_128(float* a) const {
int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
cft1st_128(a);
cftmdl_128(a);
l = 32;
for (j = 0; j < l; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = a[j + 1] + a[j1 + 1];
x1r = a[j] - a[j1];
x1i = a[j + 1] - a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i + x2i;
a[j2] = x0r - x2r;
a[j2 + 1] = x0i - x2i;
a[j1] = x1r - x3i;
a[j1 + 1] = x1i + x3r;
a[j3] = x1r + x3i;
a[j3 + 1] = x1i - x3r;
}
}
void OouraFft::bitrv2_128(float* a) const {
/*
Following things have been attempted but are no faster:
(a) Storing the swap indexes in a LUT (index calculations are done
for 'free' while waiting on memory/L1).
(b) Consolidate the load/store of two consecutive floats by a 64 bit
integer (execution is memory/L1 bound).
(c) Do a mix of floats and 64 bit integer to maximize register
utilization (execution is memory/L1 bound).
(d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
(e) Hard-coding of the offsets to completely eliminates index
calculations.
*/
unsigned int j, j1, k, k1;
float xr, xi, yr, yi;
const int ip[4] = {0, 64, 32, 96};
for (k = 0; k < 4; k++) {
for (j = 0; j < k; j++) {
j1 = 2 * j + ip[k];
k1 = 2 * k + ip[j];
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
j1 += 8;
k1 += 16;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
j1 += 8;
k1 -= 8;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
j1 += 8;
k1 += 16;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
}
j1 = 2 * k + 8 + ip[k];
k1 = j1 + 8;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
}
}
} // namespace webrtc

View File

@ -0,0 +1,64 @@
/*
* Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_
#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_
#include "rtc_base/system/arch.h"
namespace webrtc {
#if defined(WEBRTC_ARCH_X86_FAMILY)
void cft1st_128_SSE2(float* a);
void cftmdl_128_SSE2(float* a);
void rftfsub_128_SSE2(float* a);
void rftbsub_128_SSE2(float* a);
#endif
#if defined(MIPS_FPU_LE)
void cft1st_128_mips(float* a);
void cftmdl_128_mips(float* a);
void rftfsub_128_mips(float* a);
void rftbsub_128_mips(float* a);
#endif
#if defined(WEBRTC_HAS_NEON)
void cft1st_128_neon(float* a);
void cftmdl_128_neon(float* a);
void rftfsub_128_neon(float* a);
void rftbsub_128_neon(float* a);
#endif
class OouraFft {
public:
// Ctor allowing the availability of SSE2 support to be specified.
explicit OouraFft(bool sse2_available);
// Deprecated: This Ctor will soon be removed.
OouraFft();
~OouraFft();
void Fft(float* a) const;
void InverseFft(float* a) const;
private:
void cft1st_128(float* a) const;
void cftmdl_128(float* a) const;
void rftfsub_128(float* a) const;
void rftbsub_128(float* a) const;
void cftfsub_128(float* a) const;
void cftbsub_128(float* a) const;
void bitrv2_128(float* a) const;
bool use_sse2_;
};
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,351 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* The rdft AEC algorithm, neon version of speed-critical functions.
*
* Based on the sse2 version.
*/
#include <arm_neon.h>
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h"
namespace webrtc {
#if defined(WEBRTC_HAS_NEON)
void cft1st_128_neon(float* a) {
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
int j, k2;
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
float32x4_t a00v = vld1q_f32(&a[j + 0]);
float32x4_t a04v = vld1q_f32(&a[j + 4]);
float32x4_t a08v = vld1q_f32(&a[j + 8]);
float32x4_t a12v = vld1q_f32(&a[j + 12]);
float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
float32x4_t x0v = vaddq_f32(a01v, a23v);
const float32x4_t x1v = vsubq_f32(a01v, a23v);
const float32x4_t x2v = vaddq_f32(a45v, a67v);
const float32x4_t x3v = vsubq_f32(a45v, a67v);
const float32x4_t x3w = vrev64q_f32(x3v);
float32x4_t x0w;
a01v = vaddq_f32(x0v, x2v);
x0v = vsubq_f32(x0v, x2v);
x0w = vrev64q_f32(x0v);
a45v = vmulq_f32(wk2rv, x0v);
a45v = vmlaq_f32(a45v, wk2iv, x0w);
x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
x0w = vrev64q_f32(x0v);
a23v = vmulq_f32(wk1rv, x0v);
a23v = vmlaq_f32(a23v, wk1iv, x0w);
x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
x0w = vrev64q_f32(x0v);
a67v = vmulq_f32(wk3rv, x0v);
a67v = vmlaq_f32(a67v, wk3iv, x0w);
a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
vst1q_f32(&a[j + 0], a00v);
vst1q_f32(&a[j + 4], a04v);
vst1q_f32(&a[j + 8], a08v);
vst1q_f32(&a[j + 12], a12v);
}
}
void cftmdl_128_neon(float* a) {
int j;
const int l = 8;
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
for (j = 0; j < l; j += 2) {
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
const float32x4_t x1_x3_add =
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
const float32x4_t x1_x3_sub =
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
const float32x4_t xx1_rev = vrev64q_f32(xx1);
const float32x4_t yy4_rev = vrev64q_f32(yy4);
vst1_f32(&a[j + 0], vget_low_f32(xx0));
vst1_f32(&a[j + 32], vget_high_f32(xx0));
vst1_f32(&a[j + 16], vget_low_f32(xx1));
vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
a[j + 48] = -a[j + 48];
vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
vst1_f32(&a[j + 40], vget_low_f32(yy4));
vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
}
{
const int k = 64;
const int k1 = 2;
const int k2 = 2 * k1;
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
for (j = k; j < l + k; j += 2) {
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
const float32x4_t x1_x3_add =
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
const float32x4_t x1_x3_sub =
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
vst1_f32(&a[j + 0], vget_low_f32(xx));
vst1_f32(&a[j + 32], vget_high_f32(xx));
vst1_f32(&a[j + 16], vget_low_f32(xx4));
vst1_f32(&a[j + 48], vget_high_f32(xx4));
vst1_f32(&a[j + 8], vget_low_f32(xx12));
vst1_f32(&a[j + 40], vget_high_f32(xx12));
vst1_f32(&a[j + 24], vget_low_f32(xx22));
vst1_f32(&a[j + 56], vget_high_f32(xx22));
}
}
}
__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
// A B C D -> C D A B
const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
// C D A B -> D C B A
return vrev64q_f32(rev);
}
void rftfsub_128_neon(float* a) {
const float* c = rdft_w + 32;
int j1, j2;
const float32x4_t mm_half = vdupq_n_f32(0.5f);
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
// 2, 4, 6, 8, 3, 5, 7, 9
float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
// 120, 122, 124, 126, 121, 123, 125, 127,
const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
// 126, 124, 122, 120
const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
// 127, 125, 123, 121
const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
// Calculate 'x'.
const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr;
const float32x4_t a_ = vmulq_f32(wkr_, xr_);
const float32x4_t b_ = vmulq_f32(wki_, xi_);
const float32x4_t c_ = vmulq_f32(wkr_, xi_);
const float32x4_t d_ = vmulq_f32(wki_, xr_);
const float32x4_t yr_ = vsubq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const float32x4_t yi_ = vaddq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
// a[k2 + 1] -= yi;
// 126, 124, 122, 120,
const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
// 127, 125, 123, 121,
const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
// Shuffle in right order and store.
const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
// 124, 125, 126, 127, 120, 121, 122, 123
const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
// 2, 4, 6, 8,
a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
// 3, 5, 7, 9,
a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
// 2, 3, 4, 5, 6, 7, 8, 9,
vst2q_f32(&a[0 + j2], a_j2_p);
vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
const int k2 = 128 - j2;
const int k1 = 32 - j1;
const float wkr = 0.5f - c[k1];
const float wki = c[j1];
const float xr = a[j2 + 0] - a[k2 + 0];
const float xi = a[j2 + 1] + a[k2 + 1];
const float yr = wkr * xr - wki * xi;
const float yi = wkr * xi + wki * xr;
a[j2 + 0] -= yr;
a[j2 + 1] -= yi;
a[k2 + 0] += yr;
a[k2 + 1] -= yi;
}
}
void rftbsub_128_neon(float* a) {
const float* c = rdft_w + 32;
int j1, j2;
const float32x4_t mm_half = vdupq_n_f32(0.5f);
a[1] = -a[1];
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
// 2, 4, 6, 8, 3, 5, 7, 9
float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
// 120, 122, 124, 126, 121, 123, 125, 127,
const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
// 126, 124, 122, 120
const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
// 127, 125, 123, 121
const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
// Calculate 'x'.
const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr;
const float32x4_t a_ = vmulq_f32(wkr_, xr_);
const float32x4_t b_ = vmulq_f32(wki_, xi_);
const float32x4_t c_ = vmulq_f32(wkr_, xi_);
const float32x4_t d_ = vmulq_f32(wki_, xr_);
const float32x4_t yr_ = vaddq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const float32x4_t yi_ = vsubq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
// a[k2 + 1] -= yi;
// 126, 124, 122, 120,
const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
// 127, 125, 123, 121,
const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
// Shuffle in right order and store.
// 2, 3, 4, 5, 6, 7, 8, 9,
const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
// 124, 125, 126, 127, 120, 121, 122, 123
const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
// 2, 4, 6, 8,
a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
// 3, 5, 7, 9,
a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
// 2, 3, 4, 5, 6, 7, 8, 9,
vst2q_f32(&a[0 + j2], a_j2_p);
vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
const int k2 = 128 - j2;
const int k1 = 32 - j1;
const float wkr = 0.5f - c[k1];
const float wki = c[j1];
const float xr = a[j2 + 0] - a[k2 + 0];
const float xi = a[j2 + 1] + a[k2 + 1];
const float yr = wkr * xr + wki * xi;
const float yi = wkr * xi - wki * xr;
a[j2 + 0] = a[j2 + 0] - yr;
a[j2 + 1] = yi - a[j2 + 1];
a[k2 + 0] = yr + a[k2 + 0];
a[k2 + 1] = yi - a[k2 + 1];
}
a[65] = -a[65];
}
#endif
} // namespace webrtc

View File

@ -0,0 +1,439 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h>
#include <xmmintrin.h>
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h"
#include "rtc_base/system/arch.h"
namespace webrtc {
#if defined(WEBRTC_ARCH_X86_FAMILY)
namespace {
// These intrinsics were unavailable before VS 2008.
// TODO(andrew): move to a common file.
#if defined(_MSC_VER) && _MSC_VER < 1500
static __inline __m128 _mm_castsi128_ps(__m128i a) {
return *(__m128*)&a;
}
static __inline __m128i _mm_castps_si128(__m128 a) {
return *(__m128i*)&a;
}
#endif
} // namespace
void cft1st_128_SSE2(float* a) {
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
int j, k2;
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
__m128 a00v = _mm_loadu_ps(&a[j + 0]);
__m128 a04v = _mm_loadu_ps(&a[j + 4]);
__m128 a08v = _mm_loadu_ps(&a[j + 8]);
__m128 a12v = _mm_loadu_ps(&a[j + 12]);
__m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
__m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
__m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
__m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
__m128 x0v = _mm_add_ps(a01v, a23v);
const __m128 x1v = _mm_sub_ps(a01v, a23v);
const __m128 x2v = _mm_add_ps(a45v, a67v);
const __m128 x3v = _mm_sub_ps(a45v, a67v);
__m128 x0w;
a01v = _mm_add_ps(x0v, x2v);
x0v = _mm_sub_ps(x0v, x2v);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
{
const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
a45v = _mm_add_ps(a45_0v, a45_1v);
}
{
__m128 a23_0v, a23_1v;
const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
x0v = _mm_add_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
a23_0v = _mm_mul_ps(wk1rv, x0v);
a23_1v = _mm_mul_ps(wk1iv, x0w);
a23v = _mm_add_ps(a23_0v, a23_1v);
x0v = _mm_sub_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
}
{
const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
a67v = _mm_add_ps(a67_0v, a67_1v);
}
a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
_mm_storeu_ps(&a[j + 0], a00v);
_mm_storeu_ps(&a[j + 4], a04v);
_mm_storeu_ps(&a[j + 8], a08v);
_mm_storeu_ps(&a[j + 12], a12v);
}
}
void cftmdl_128_SSE2(float* a) {
const int l = 8;
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
int j0;
__m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
for (j0 = 0; j0 < l; j0 += 2) {
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
const __m128 a_00_32 =
_mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_08_40 =
_mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
_MM_SHUFFLE(1, 0, 1, 0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
const __m128 a_16_48 =
_mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_24_56 =
_mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 yy0 =
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 yy1 =
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
const __m128 yy3 = _mm_add_ps(yy0, yy2);
const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
_mm_storel_epi64(
(__m128i*)&a[j0 + 32],
_mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
_mm_storel_epi64(
(__m128i*)&a[j0 + 48],
_mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
a[j0 + 48] = -a[j0 + 48];
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
_mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
_mm_storel_epi64(
(__m128i*)&a[j0 + 56],
_mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
}
{
int k = 64;
int k1 = 2;
int k2 = 2 * k1;
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
for (j0 = k; j0 < l + k; j0 += 2) {
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
const __m128 a_00_32 =
_mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_08_40 =
_mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
_MM_SHUFFLE(1, 0, 1, 0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
const __m128 a_16_48 =
_mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_24_56 =
_mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
const __m128 xx3 = _mm_mul_ps(
wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
_MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx4 = _mm_add_ps(xx2, xx3);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
const __m128 xx11 = _mm_mul_ps(
wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
_MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx12 = _mm_add_ps(xx10, xx11);
const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
const __m128 xx21 = _mm_mul_ps(
wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
_MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx22 = _mm_add_ps(xx20, xx21);
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
_mm_storel_epi64(
(__m128i*)&a[j0 + 32],
_mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
_mm_storel_epi64(
(__m128i*)&a[j0 + 48],
_mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
_mm_storel_epi64(
(__m128i*)&a[j0 + 40],
_mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
_mm_storel_epi64(
(__m128i*)&a[j0 + 56],
_mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
}
}
}
void rftfsub_128_SSE2(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
0.5f};
const __m128 mm_half = _mm_load_ps(k_half);
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 wkr_ =
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
const __m128 a_j2_p0 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
// Calculate 'x'.
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr;
const __m128 a_ = _mm_mul_ps(wkr_, xr_);
const __m128 b_ = _mm_mul_ps(wki_, xi_);
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
const __m128 d_ = _mm_mul_ps(wki_, xr_);
const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
// a[k2 + 1] -= yi;
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9,
const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120,
const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121,
// Shuffle in right order and store.
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
// 2, 3, 4, 5,
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
// 6, 7, 8, 9,
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
// 122, 123, 120, 121,
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
// 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
xi = a[j2 + 1] + a[k2 + 1];
yr = wkr * xr - wki * xi;
yi = wkr * xi + wki * xr;
a[j2 + 0] -= yr;
a[j2 + 1] -= yi;
a[k2 + 0] += yr;
a[k2 + 1] -= yi;
}
}
void rftbsub_128_SSE2(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
0.5f};
const __m128 mm_half = _mm_load_ps(k_half);
a[1] = -a[1];
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 wkr_ =
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
const __m128 a_j2_p0 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
// Calculate 'x'.
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr + wki * xi;
// yi = wkr * xi - wki * xr;
const __m128 a_ = _mm_mul_ps(wkr_, xr_);
const __m128 b_ = _mm_mul_ps(wki_, xi_);
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
const __m128 d_ = _mm_mul_ps(wki_, xr_);
const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] = a[j2 + 0] - yr;
// a[j2 + 1] = yi - a[j2 + 1];
// a[k2 + 0] = yr + a[k2 + 0];
// a[k2 + 1] = yi - a[k2 + 1];
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9,
const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120,
const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121,
// Shuffle in right order and store.
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
// 2, 3, 4, 5,
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
// 6, 7, 8, 9,
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
// 122, 123, 120, 121,
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
// 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
xi = a[j2 + 1] + a[k2 + 1];
yr = wkr * xr + wki * xi;
yi = wkr * xi - wki * xr;
a[j2 + 0] = a[j2 + 0] - yr;
a[j2 + 1] = yi - a[j2 + 1];
a[k2 + 0] = yr + a[k2 + 0];
a[k2 + 1] = yi - a[k2 + 1];
}
a[65] = -a[65];
}
#endif
} // namespace webrtc

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_
#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
namespace webrtc {
// This tables used to be computed at run-time. For example, refer to:
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
// to see the initialization code.
// Constants shared by all paths (C, SSE2, NEON).
const float rdft_w[64] = {
1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f, 0.9238795638f,
0.3826834559f, 0.3826834559f, 0.9238795638f, 0.9807852507f, 0.1950903237f,
0.5555702448f, 0.8314695954f, 0.8314695954f, 0.5555702448f, 0.1950903237f,
0.9807852507f, 0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f, 0.9569403529f,
0.2902846634f, 0.4713967443f, 0.8819212914f, 0.7730104327f, 0.6343933344f,
0.0980171412f, 0.9951847196f, 0.7071067691f, 0.4993977249f, 0.4975923598f,
0.4945882559f, 0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f, 0.4157347977f,
0.4016037583f, 0.3865052164f, 0.3704755902f, 0.3535533845f, 0.3357794881f,
0.3171966672f, 0.2978496552f, 0.2777851224f, 0.2570513785f, 0.2356983721f,
0.2137775421f, 0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
};
// Constants used by the C and MIPS paths.
const float rdft_wk3ri_first[16] = {
1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
};
const float rdft_wk3ri_second[16] = {
-0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
-0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
-0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
-0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
};
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_

View File

@ -0,0 +1,98 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
#include "rtc_base/system/arch.h"
#ifdef _MSC_VER /* visual c++ */
#define ALIGN16_BEG __declspec(align(16))
#define ALIGN16_END
#else /* gcc or icc */
#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#endif
namespace webrtc {
// These tables used to be computed at run-time. For example, refer to:
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
// to see the initialization code.
#if defined(WEBRTC_ARCH_X86_FAMILY) || defined(WEBRTC_HAS_NEON)
// Constants used by SSE2 and NEON but initialized in the C path.
const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, 0.923879564f,
0.923879564f, 0.382683456f, 0.382683456f, 0.980785251f, 0.980785251f,
0.555570245f, 0.555570245f, 0.831469595f, 0.831469595f, 0.195090324f,
0.195090324f, 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, 0.956940353f,
0.956940353f, 0.471396744f, 0.471396744f, 0.773010433f, 0.773010433f,
0.098017141f, 0.098017141f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f, 0.707106769f,
0.707106769f, -0.707106769f, -0.707106769f, 0.923879564f, 0.923879564f,
-0.382683456f, -0.382683456f, 0.382683456f, 0.382683456f, -0.923879564f,
-0.923879564f, 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f, 0.831469595f,
0.831469595f, -0.555570245f, -0.555570245f, 0.195090324f, 0.195090324f,
-0.980785251f, -0.980785251f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f, 0.382683456f,
0.382683456f, -0.923879564f, -0.923879564f, 0.831469536f, 0.831469536f,
-0.980785251f, -0.980785251f, -0.195090353f, -0.195090353f, -0.555570245f,
-0.555570245f, 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f, 0.634393334f,
0.634393334f, -0.995184720f, -0.995184720f, -0.471396863f, -0.471396863f,
-0.290284693f, -0.290284693f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.382683456f,
0.382683456f, -0.923879564f, 0.923879564f, -0.195090324f, 0.195090324f,
-0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.980785251f,
0.980785251f, -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
-0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f, -0.290284663f,
0.290284663f, -0.881921291f, 0.881921291f, -0.634393334f, 0.634393334f,
-0.995184720f, 0.995184720f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
-0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f, -0.707106769f,
0.707106769f, -0.707106769f, 0.707106769f, -0.382683456f, 0.382683456f,
-0.923879564f, 0.923879564f, -0.923879564f, 0.923879564f, -0.382683456f,
0.382683456f, -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
-0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.555570245f,
0.555570245f, -0.831469595f, 0.831469595f, -0.980785251f, 0.980785251f,
-0.195090324f, 0.195090324f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.923879564f,
0.923879564f, 0.382683456f, -0.382683456f, -0.555570245f, 0.555570245f,
-0.195090353f, 0.195090353f, -0.980785251f, 0.980785251f, 0.831469536f,
-0.831469536f, -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
-0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f, -0.773010492f,
0.773010492f, 0.098017156f, -0.098017156f, -0.881921172f, 0.881921172f,
0.956940353f, -0.956940353f,
};
ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
0.707106769f,
0.707106769f,
0.707106769f,
-0.707106769f,
};
#endif
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_

View File

@ -0,0 +1,866 @@
/*
* http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
* Copyright Takuya OOURA, 1996-2001
*
* You may use, copy, modify and distribute this code for any purpose (include
* commercial use) and without fee. Please refer to this package when you modify
* this code.
*
* Changes:
* Trivial type modifications by the WebRTC authors.
*/
/*
Fast Fourier/Cosine/Sine Transform
dimension :one
data length :power of 2
decimation :frequency
radix :4, 2
data :inplace
table :use
functions
cdft: Complex Discrete Fourier Transform
rdft: Real Discrete Fourier Transform
ddct: Discrete Cosine Transform
ddst: Discrete Sine Transform
dfct: Cosine Transform of RDFT (Real Symmetric DFT)
dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
function prototypes
void cdft(int, int, float *, int *, float *);
void rdft(size_t, int, float *, size_t *, float *);
void ddct(int, int, float *, int *, float *);
void ddst(int, int, float *, int *, float *);
void dfct(int, float *, float *, int *, float *);
void dfst(int, float *, float *, int *, float *);
-------- Complex DFT (Discrete Fourier Transform) --------
[definition]
<case1>
X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
<case2>
X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
(notes: sum_j=0^n-1 is a summation from j=0 to n-1)
[usage]
<case1>
ip[0] = 0; // first time only
cdft(2*n, 1, a, ip, w);
<case2>
ip[0] = 0; // first time only
cdft(2*n, -1, a, ip, w);
[parameters]
2*n :data length (int)
n >= 1, n = power of 2
a[0...2*n-1] :input/output data (float *)
input data
a[2*j] = Re(x[j]),
a[2*j+1] = Im(x[j]), 0<=j<n
output data
a[2*k] = Re(X[k]),
a[2*k+1] = Im(X[k]), 0<=k<n
ip[0...*] :work area for bit reversal (int *)
length of ip >= 2+sqrt(n)
strictly,
length of ip >=
2+(1<<(int)(log(n+0.5)/log(2))/2).
ip[0],ip[1] are pointers of the cos/sin table.
w[0...n/2-1] :cos/sin table (float *)
w[],ip[] are initialized if ip[0] == 0.
[remark]
Inverse of
cdft(2*n, -1, a, ip, w);
is
cdft(2*n, 1, a, ip, w);
for (j = 0; j <= 2 * n - 1; j++) {
a[j] *= 1.0 / n;
}
.
-------- Real DFT / Inverse of Real DFT --------
[definition]
<case1> RDFT
R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
<case2> IRDFT (excluding scale)
a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +
sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +
sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
[usage]
<case1>
ip[0] = 0; // first time only
rdft(n, 1, a, ip, w);
<case2>
ip[0] = 0; // first time only
rdft(n, -1, a, ip, w);
[parameters]
n :data length (size_t)
n >= 2, n = power of 2
a[0...n-1] :input/output data (float *)
<case1>
output data
a[2*k] = R[k], 0<=k<n/2
a[2*k+1] = I[k], 0<k<n/2
a[1] = R[n/2]
<case2>
input data
a[2*j] = R[j], 0<=j<n/2
a[2*j+1] = I[j], 0<j<n/2
a[1] = R[n/2]
ip[0...*] :work area for bit reversal (size_t *)
length of ip >= 2+sqrt(n/2)
strictly,
length of ip >=
2+(1<<(int)(log(n/2+0.5)/log(2))/2).
ip[0],ip[1] are pointers of the cos/sin table.
w[0...n/2-1] :cos/sin table (float *)
w[],ip[] are initialized if ip[0] == 0.
[remark]
Inverse of
rdft(n, 1, a, ip, w);
is
rdft(n, -1, a, ip, w);
for (j = 0; j <= n - 1; j++) {
a[j] *= 2.0 / n;
}
.
-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
[definition]
<case1> IDCT (excluding scale)
C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
<case2> DCT
C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
[usage]
<case1>
ip[0] = 0; // first time only
ddct(n, 1, a, ip, w);
<case2>
ip[0] = 0; // first time only
ddct(n, -1, a, ip, w);
[parameters]
n :data length (int)
n >= 2, n = power of 2
a[0...n-1] :input/output data (float *)
output data
a[k] = C[k], 0<=k<n
ip[0...*] :work area for bit reversal (int *)
length of ip >= 2+sqrt(n/2)
strictly,
length of ip >=
2+(1<<(int)(log(n/2+0.5)/log(2))/2).
ip[0],ip[1] are pointers of the cos/sin table.
w[0...n*5/4-1] :cos/sin table (float *)
w[],ip[] are initialized if ip[0] == 0.
[remark]
Inverse of
ddct(n, -1, a, ip, w);
is
a[0] *= 0.5;
ddct(n, 1, a, ip, w);
for (j = 0; j <= n - 1; j++) {
a[j] *= 2.0 / n;
}
.
-------- DST (Discrete Sine Transform) / Inverse of DST --------
[definition]
<case1> IDST (excluding scale)
S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
<case2> DST
S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
[usage]
<case1>
ip[0] = 0; // first time only
ddst(n, 1, a, ip, w);
<case2>
ip[0] = 0; // first time only
ddst(n, -1, a, ip, w);
[parameters]
n :data length (int)
n >= 2, n = power of 2
a[0...n-1] :input/output data (float *)
<case1>
input data
a[j] = A[j], 0<j<n
a[0] = A[n]
output data
a[k] = S[k], 0<=k<n
<case2>
output data
a[k] = S[k], 0<k<n
a[0] = S[n]
ip[0...*] :work area for bit reversal (int *)
length of ip >= 2+sqrt(n/2)
strictly,
length of ip >=
2+(1<<(int)(log(n/2+0.5)/log(2))/2).
ip[0],ip[1] are pointers of the cos/sin table.
w[0...n*5/4-1] :cos/sin table (float *)
w[],ip[] are initialized if ip[0] == 0.
[remark]
Inverse of
ddst(n, -1, a, ip, w);
is
a[0] *= 0.5;
ddst(n, 1, a, ip, w);
for (j = 0; j <= n - 1; j++) {
a[j] *= 2.0 / n;
}
.
-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
[definition]
C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
[usage]
ip[0] = 0; // first time only
dfct(n, a, t, ip, w);
[parameters]
n :data length - 1 (int)
n >= 2, n = power of 2
a[0...n] :input/output data (float *)
output data
a[k] = C[k], 0<=k<=n
t[0...n/2] :work area (float *)
ip[0...*] :work area for bit reversal (int *)
length of ip >= 2+sqrt(n/4)
strictly,
length of ip >=
2+(1<<(int)(log(n/4+0.5)/log(2))/2).
ip[0],ip[1] are pointers of the cos/sin table.
w[0...n*5/8-1] :cos/sin table (float *)
w[],ip[] are initialized if ip[0] == 0.
[remark]
Inverse of
a[0] *= 0.5;
a[n] *= 0.5;
dfct(n, a, t, ip, w);
is
a[0] *= 0.5;
a[n] *= 0.5;
dfct(n, a, t, ip, w);
for (j = 0; j <= n; j++) {
a[j] *= 2.0 / n;
}
.
-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
[definition]
S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
[usage]
ip[0] = 0; // first time only
dfst(n, a, t, ip, w);
[parameters]
n :data length + 1 (int)
n >= 2, n = power of 2
a[0...n-1] :input/output data (float *)
output data
a[k] = S[k], 0<k<n
(a[0] is used for work area)
t[0...n/2-1] :work area (float *)
ip[0...*] :work area for bit reversal (int *)
length of ip >= 2+sqrt(n/4)
strictly,
length of ip >=
2+(1<<(int)(log(n/4+0.5)/log(2))/2).
ip[0],ip[1] are pointers of the cos/sin table.
w[0...n*5/8-1] :cos/sin table (float *)
w[],ip[] are initialized if ip[0] == 0.
[remark]
Inverse of
dfst(n, a, t, ip, w);
is
dfst(n, a, t, ip, w);
for (j = 1; j <= n - 1; j++) {
a[j] *= 2.0 / n;
}
.
Appendix :
The cos/sin table is recalculated when the larger table required.
w[] and ip[] are compatible with all routines.
*/
#include <math.h>
#include <stddef.h>
#include "common_audio/third_party/ooura/fft_size_256/fft4g.h"
namespace webrtc {
namespace {
void makewt(size_t nw, size_t* ip, float* w);
void makect(size_t nc, size_t* ip, float* c);
void bitrv2(size_t n, size_t* ip, float* a);
void cftfsub(size_t n, float* a, float* w);
void cftbsub(size_t n, float* a, float* w);
void cft1st(size_t n, float* a, float* w);
void cftmdl(size_t n, size_t l, float* a, float* w);
void rftfsub(size_t n, float* a, size_t nc, float* c);
void rftbsub(size_t n, float* a, size_t nc, float* c);
/* -------- initializing routines -------- */
void makewt(size_t nw, size_t* ip, float* w) {
size_t j, nwh;
float delta, x, y;
ip[0] = nw;
ip[1] = 1;
if (nw > 2) {
nwh = nw >> 1;
delta = atanf(1.0f) / nwh;
w[0] = 1;
w[1] = 0;
w[nwh] = (float)cos(delta * nwh);
w[nwh + 1] = w[nwh];
if (nwh > 2) {
for (j = 2; j < nwh; j += 2) {
x = (float)cos(delta * j);
y = (float)sin(delta * j);
w[j] = x;
w[j + 1] = y;
w[nw - j] = y;
w[nw - j + 1] = x;
}
bitrv2(nw, ip + 2, w);
}
}
}
void makect(size_t nc, size_t* ip, float* c) {
size_t j, nch;
float delta;
ip[1] = nc;
if (nc > 1) {
nch = nc >> 1;
delta = atanf(1.0f) / nch;
c[0] = (float)cos(delta * nch);
c[nch] = 0.5f * c[0];
for (j = 1; j < nch; j++) {
c[j] = 0.5f * (float)cos(delta * j);
c[nc - j] = 0.5f * (float)sin(delta * j);
}
}
}
/* -------- child routines -------- */
void bitrv2(size_t n, size_t* ip, float* a) {
size_t j, j1, k, k1, l, m, m2;
float xr, xi, yr, yi;
ip[0] = 0;
l = n;
m = 1;
while ((m << 3) < l) {
l >>= 1;
for (j = 0; j < m; j++) {
ip[m + j] = ip[j] + l;
}
m <<= 1;
}
m2 = 2 * m;
if ((m << 3) == l) {
for (k = 0; k < m; k++) {
for (j = 0; j < k; j++) {
j1 = 2 * j + ip[k];
k1 = 2 * k + ip[j];
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 += 2 * m2;
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 -= m2;
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 += 2 * m2;
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
}
j1 = 2 * k + m2 + ip[k];
k1 = j1 + m2;
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
}
} else {
for (k = 1; k < m; k++) {
for (j = 0; j < k; j++) {
j1 = 2 * j + ip[k];
k1 = 2 * k + ip[j];
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 += m2;
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
}
}
}
}
void cftfsub(size_t n, float* a, float* w) {
size_t j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
l = 2;
if (n > 8) {
cft1st(n, a, w);
l = 8;
while ((l << 2) < n) {
cftmdl(n, l, a, w);
l <<= 2;
}
}
if ((l << 2) == n) {
for (j = 0; j < l; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = a[j + 1] + a[j1 + 1];
x1r = a[j] - a[j1];
x1i = a[j + 1] - a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i + x2i;
a[j2] = x0r - x2r;
a[j2 + 1] = x0i - x2i;
a[j1] = x1r - x3i;
a[j1 + 1] = x1i + x3r;
a[j3] = x1r + x3i;
a[j3 + 1] = x1i - x3r;
}
} else {
for (j = 0; j < l; j += 2) {
j1 = j + l;
x0r = a[j] - a[j1];
x0i = a[j + 1] - a[j1 + 1];
a[j] += a[j1];
a[j + 1] += a[j1 + 1];
a[j1] = x0r;
a[j1 + 1] = x0i;
}
}
}
void cftbsub(size_t n, float* a, float* w) {
size_t j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
l = 2;
if (n > 8) {
cft1st(n, a, w);
l = 8;
while ((l << 2) < n) {
cftmdl(n, l, a, w);
l <<= 2;
}
}
if ((l << 2) == n) {
for (j = 0; j < l; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = -a[j + 1] - a[j1 + 1];
x1r = a[j] - a[j1];
x1i = -a[j + 1] + a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i - x2i;
a[j2] = x0r - x2r;
a[j2 + 1] = x0i + x2i;
a[j1] = x1r - x3i;
a[j1 + 1] = x1i - x3r;
a[j3] = x1r + x3i;
a[j3 + 1] = x1i + x3r;
}
} else {
for (j = 0; j < l; j += 2) {
j1 = j + l;
x0r = a[j] - a[j1];
x0i = -a[j + 1] + a[j1 + 1];
a[j] += a[j1];
a[j + 1] = -a[j + 1] - a[j1 + 1];
a[j1] = x0r;
a[j1 + 1] = x0i;
}
}
}
void cft1st(size_t n, float* a, float* w) {
size_t j, k1, k2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
x0r = a[0] + a[2];
x0i = a[1] + a[3];
x1r = a[0] - a[2];
x1i = a[1] - a[3];
x2r = a[4] + a[6];
x2i = a[5] + a[7];
x3r = a[4] - a[6];
x3i = a[5] - a[7];
a[0] = x0r + x2r;
a[1] = x0i + x2i;
a[4] = x0r - x2r;
a[5] = x0i - x2i;
a[2] = x1r - x3i;
a[3] = x1i + x3r;
a[6] = x1r + x3i;
a[7] = x1i - x3r;
wk1r = w[2];
x0r = a[8] + a[10];
x0i = a[9] + a[11];
x1r = a[8] - a[10];
x1i = a[9] - a[11];
x2r = a[12] + a[14];
x2i = a[13] + a[15];
x3r = a[12] - a[14];
x3i = a[13] - a[15];
a[8] = x0r + x2r;
a[9] = x0i + x2i;
a[12] = x2i - x0i;
a[13] = x0r - x2r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[10] = wk1r * (x0r - x0i);
a[11] = wk1r * (x0r + x0i);
x0r = x3i + x1r;
x0i = x3r - x1i;
a[14] = wk1r * (x0i - x0r);
a[15] = wk1r * (x0i + x0r);
k1 = 0;
for (j = 16; j < n; j += 16) {
k1 += 2;
k2 = 2 * k1;
wk2r = w[k1];
wk2i = w[k1 + 1];
wk1r = w[k2];
wk1i = w[k2 + 1];
wk3r = wk1r - 2 * wk2i * wk1i;
wk3i = 2 * wk2i * wk1r - wk1i;
x0r = a[j] + a[j + 2];
x0i = a[j + 1] + a[j + 3];
x1r = a[j] - a[j + 2];
x1i = a[j + 1] - a[j + 3];
x2r = a[j + 4] + a[j + 6];
x2i = a[j + 5] + a[j + 7];
x3r = a[j + 4] - a[j + 6];
x3i = a[j + 5] - a[j + 7];
a[j] = x0r + x2r;
a[j + 1] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j + 4] = wk2r * x0r - wk2i * x0i;
a[j + 5] = wk2r * x0i + wk2i * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j + 2] = wk1r * x0r - wk1i * x0i;
a[j + 3] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j + 6] = wk3r * x0r - wk3i * x0i;
a[j + 7] = wk3r * x0i + wk3i * x0r;
wk1r = w[k2 + 2];
wk1i = w[k2 + 3];
wk3r = wk1r - 2 * wk2r * wk1i;
wk3i = 2 * wk2r * wk1r - wk1i;
x0r = a[j + 8] + a[j + 10];
x0i = a[j + 9] + a[j + 11];
x1r = a[j + 8] - a[j + 10];
x1i = a[j + 9] - a[j + 11];
x2r = a[j + 12] + a[j + 14];
x2i = a[j + 13] + a[j + 15];
x3r = a[j + 12] - a[j + 14];
x3i = a[j + 13] - a[j + 15];
a[j + 8] = x0r + x2r;
a[j + 9] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j + 12] = -wk2i * x0r - wk2r * x0i;
a[j + 13] = -wk2i * x0i + wk2r * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j + 10] = wk1r * x0r - wk1i * x0i;
a[j + 11] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j + 14] = wk3r * x0r - wk3i * x0i;
a[j + 15] = wk3r * x0i + wk3i * x0r;
}
}
void cftmdl(size_t n, size_t l, float* a, float* w) {
size_t j, j1, j2, j3, k, k1, k2, m, m2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
m = l << 2;
for (j = 0; j < l; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = a[j + 1] + a[j1 + 1];
x1r = a[j] - a[j1];
x1i = a[j + 1] - a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i + x2i;
a[j2] = x0r - x2r;
a[j2 + 1] = x0i - x2i;
a[j1] = x1r - x3i;
a[j1 + 1] = x1i + x3r;
a[j3] = x1r + x3i;
a[j3 + 1] = x1i - x3r;
}
wk1r = w[2];
for (j = m; j < l + m; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = a[j + 1] + a[j1 + 1];
x1r = a[j] - a[j1];
x1i = a[j + 1] - a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i + x2i;
a[j2] = x2i - x0i;
a[j2 + 1] = x0r - x2r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j1] = wk1r * (x0r - x0i);
a[j1 + 1] = wk1r * (x0r + x0i);
x0r = x3i + x1r;
x0i = x3r - x1i;
a[j3] = wk1r * (x0i - x0r);
a[j3 + 1] = wk1r * (x0i + x0r);
}
k1 = 0;
m2 = 2 * m;
for (k = m2; k < n; k += m2) {
k1 += 2;
k2 = 2 * k1;
wk2r = w[k1];
wk2i = w[k1 + 1];
wk1r = w[k2];
wk1i = w[k2 + 1];
wk3r = wk1r - 2 * wk2i * wk1i;
wk3i = 2 * wk2i * wk1r - wk1i;
for (j = k; j < l + k; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = a[j + 1] + a[j1 + 1];
x1r = a[j] - a[j1];
x1i = a[j + 1] - a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j2] = wk2r * x0r - wk2i * x0i;
a[j2 + 1] = wk2r * x0i + wk2i * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j1] = wk1r * x0r - wk1i * x0i;
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j3] = wk3r * x0r - wk3i * x0i;
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
}
wk1r = w[k2 + 2];
wk1i = w[k2 + 3];
wk3r = wk1r - 2 * wk2r * wk1i;
wk3i = 2 * wk2r * wk1r - wk1i;
for (j = k + m; j < l + (k + m); j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = a[j + 1] + a[j1 + 1];
x1r = a[j] - a[j1];
x1i = a[j + 1] - a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j2] = -wk2i * x0r - wk2r * x0i;
a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j1] = wk1r * x0r - wk1i * x0i;
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j3] = wk3r * x0r - wk3i * x0i;
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
}
}
}
void rftfsub(size_t n, float* a, size_t nc, float* c) {
size_t j, k, kk, ks, m;
float wkr, wki, xr, xi, yr, yi;
m = n >> 1;
ks = 2 * nc / m;
kk = 0;
for (j = 2; j < m; j += 2) {
k = n - j;
kk += ks;
wkr = 0.5f - c[nc - kk];
wki = c[kk];
xr = a[j] - a[k];
xi = a[j + 1] + a[k + 1];
yr = wkr * xr - wki * xi;
yi = wkr * xi + wki * xr;
a[j] -= yr;
a[j + 1] -= yi;
a[k] += yr;
a[k + 1] -= yi;
}
}
void rftbsub(size_t n, float* a, size_t nc, float* c) {
size_t j, k, kk, ks, m;
float wkr, wki, xr, xi, yr, yi;
a[1] = -a[1];
m = n >> 1;
ks = 2 * nc / m;
kk = 0;
for (j = 2; j < m; j += 2) {
k = n - j;
kk += ks;
wkr = 0.5f - c[nc - kk];
wki = c[kk];
xr = a[j] - a[k];
xi = a[j + 1] + a[k + 1];
yr = wkr * xr + wki * xi;
yi = wkr * xi - wki * xr;
a[j] -= yr;
a[j + 1] = yi - a[j + 1];
a[k] += yr;
a[k + 1] = yi - a[k + 1];
}
a[m + 1] = -a[m + 1];
}
} // namespace
void WebRtc_rdft(size_t n, int isgn, float* a, size_t* ip, float* w) {
size_t nw, nc;
float xi;
nw = ip[0];
if (n > (nw << 2)) {
nw = n >> 2;
makewt(nw, ip, w);
}
nc = ip[1];
if (n > (nc << 2)) {
nc = n >> 2;
makect(nc, ip, w + nw);
}
if (isgn >= 0) {
if (n > 4) {
bitrv2(n, ip + 2, a);
cftfsub(n, a, w);
rftfsub(n, a, nc, w + nw);
} else if (n == 4) {
cftfsub(n, a, w);
}
xi = a[0] - a[1];
a[0] += a[1];
a[1] = xi;
} else {
a[1] = 0.5f * (a[0] - a[1]);
a[0] -= a[1];
if (n > 4) {
rftbsub(n, a, nc, w + nw);
bitrv2(n, ip + 2, a);
cftbsub(n, a, w);
} else if (n == 4) {
cftfsub(n, a, w);
}
}
}
} // namespace webrtc

View File

@ -0,0 +1,21 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the ../../../LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef COMMON_AUDIO_THIRD_PARTY_OOURA_FFT_SIZE_256_FFT4G_H_
#define COMMON_AUDIO_THIRD_PARTY_OOURA_FFT_SIZE_256_FFT4G_H_
namespace webrtc {
// Refer to fft4g.c for documentation.
void WebRtc_rdft(size_t n, int isgn, float* a, size_t* ip, float* w);
} // namespace webrtc
#endif // COMMON_AUDIO_THIRD_PARTY_OOURA_FFT_SIZE_256_FFT4G_H_

View File

@ -0,0 +1,24 @@
# Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the ../../../LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
import("../../../webrtc.gni")
rtc_library("spl_sqrt_floor") {
visibility = [ "../..:common_audio_c" ]
sources = [ "spl_sqrt_floor.h" ]
deps = []
if (current_cpu == "arm") {
sources += [ "spl_sqrt_floor_arm.S" ]
deps += [ "../../../rtc_base/system:asm_defines" ]
} else if (current_cpu == "mipsel") {
sources += [ "spl_sqrt_floor_mips.c" ]
} else {
sources += [ "spl_sqrt_floor.c" ]
}
}

View File

@ -0,0 +1,27 @@
/*
* Written by Wilco Dijkstra, 1996. The following email exchange establishes the
* license.
*
* From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
* Date: Fri, Jun 24, 2011 at 3:20 AM
* Subject: Re: sqrt routine
* To: Kevin Ma <kma@google.com>
* Hi Kevin,
* Thanks for asking. Those routines are public domain (originally posted to
* comp.sys.arm a long time ago), so you can use them freely for any purpose.
* Cheers,
* Wilco
*
* ----- Original Message -----
* From: "Kevin Ma" <kma@google.com>
* To: <Wilco.Dijkstra@ntlworld.com>
* Sent: Thursday, June 23, 2011 11:44 PM
* Subject: Fwd: sqrt routine
* Hi Wilco,
* I saw your sqrt routine from several web sites, including
* http://www.finesse.demon.co.uk/steven/sqrt.html.
* Just wonder if there's any copyright information with your Successive
* approximation routines, or if I can freely use it for any purpose.
* Thanks.
* Kevin
*/

View File

@ -0,0 +1,77 @@
/*
* Written by Wilco Dijkstra, 1996. The following email exchange establishes the
* license.
*
* From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
* Date: Fri, Jun 24, 2011 at 3:20 AM
* Subject: Re: sqrt routine
* To: Kevin Ma <kma@google.com>
* Hi Kevin,
* Thanks for asking. Those routines are public domain (originally posted to
* comp.sys.arm a long time ago), so you can use them freely for any purpose.
* Cheers,
* Wilco
*
* ----- Original Message -----
* From: "Kevin Ma" <kma@google.com>
* To: <Wilco.Dijkstra@ntlworld.com>
* Sent: Thursday, June 23, 2011 11:44 PM
* Subject: Fwd: sqrt routine
* Hi Wilco,
* I saw your sqrt routine from several web sites, including
* http://www.finesse.demon.co.uk/steven/sqrt.html.
* Just wonder if there's any copyright information with your Successive
* approximation routines, or if I can freely use it for any purpose.
* Thanks.
* Kevin
*/
// Minor modifications in code style for WebRTC, 2012.
#include "common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h"
/*
* Algorithm:
* Successive approximation of the equation (root + delta) ^ 2 = N
* until delta < 1. If delta < 1 we have the integer part of SQRT (N).
* Use delta = 2^i for i = 15 .. 0.
*
* Output precision is 16 bits. Note for large input values (close to
* 0x7FFFFFFF), bit 15 (the highest bit of the low 16-bit half word)
* contains the MSB information (a non-sign value). Do with caution
* if you need to cast the output to int16_t type.
*
* If the input value is negative, it returns 0.
*/
#define WEBRTC_SPL_SQRT_ITER(N) \
try1 = root + (1 << (N)); \
if (value >= try1 << (N)) \
{ \
value -= try1 << (N); \
root |= 2 << (N); \
}
int32_t WebRtcSpl_SqrtFloor(int32_t value)
{
int32_t root = 0, try1;
WEBRTC_SPL_SQRT_ITER (15);
WEBRTC_SPL_SQRT_ITER (14);
WEBRTC_SPL_SQRT_ITER (13);
WEBRTC_SPL_SQRT_ITER (12);
WEBRTC_SPL_SQRT_ITER (11);
WEBRTC_SPL_SQRT_ITER (10);
WEBRTC_SPL_SQRT_ITER ( 9);
WEBRTC_SPL_SQRT_ITER ( 8);
WEBRTC_SPL_SQRT_ITER ( 7);
WEBRTC_SPL_SQRT_ITER ( 6);
WEBRTC_SPL_SQRT_ITER ( 5);
WEBRTC_SPL_SQRT_ITER ( 4);
WEBRTC_SPL_SQRT_ITER ( 3);
WEBRTC_SPL_SQRT_ITER ( 2);
WEBRTC_SPL_SQRT_ITER ( 1);
WEBRTC_SPL_SQRT_ITER ( 0);
return root >> 1;
}

View File

@ -0,0 +1,29 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdint.h>
//
// WebRtcSpl_SqrtFloor(...)
//
// Returns the square root of the input value |value|. The precision of this
// function is rounding down integer precision, i.e., sqrt(8) gives 2 as answer.
// If |value| is a negative number then 0 is returned.
//
// Algorithm:
//
// An iterative 4 cylce/bit routine
//
// Input:
// - value : Value to calculate sqrt of
//
// Return value : Result of the sqrt calculation
//
int32_t WebRtcSpl_SqrtFloor(int32_t value);

View File

@ -0,0 +1,110 @@
@
@ Written by Wilco Dijkstra, 1996. The following email exchange establishes the
@ license.
@
@ From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
@ Date: Fri, Jun 24, 2011 at 3:20 AM
@ Subject: Re: sqrt routine
@ To: Kevin Ma <kma@google.com>
@ Hi Kevin,
@ Thanks for asking. Those routines are public domain (originally posted to
@ comp.sys.arm a long time ago), so you can use them freely for any purpose.
@ Cheers,
@ Wilco
@
@ ----- Original Message -----
@ From: "Kevin Ma" <kma@google.com>
@ To: <Wilco.Dijkstra@ntlworld.com>
@ Sent: Thursday, June 23, 2011 11:44 PM
@ Subject: Fwd: sqrt routine
@ Hi Wilco,
@ I saw your sqrt routine from several web sites, including
@ http://www.finesse.demon.co.uk/steven/sqrt.html.
@ Just wonder if there's any copyright information with your Successive
@ approximation routines, or if I can freely use it for any purpose.
@ Thanks.
@ Kevin
@ Minor modifications in code style for WebRTC, 2012.
@ Output is bit-exact with the reference C code in spl_sqrt_floor.c.
@ Input : r0 32 bit unsigned integer
@ Output: r0 = INT (SQRT (r0)), precision is 16 bits
@ Registers touched: r1, r2
#include "rtc_base/system/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_SqrtFloor
.align 2
DEFINE_FUNCTION WebRtcSpl_SqrtFloor
mov r1, #3 << 30
mov r2, #1 << 30
@ unroll for i = 0 .. 15
cmp r0, r2, ror #2 * 0
subhs r0, r0, r2, ror #2 * 0
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 1
subhs r0, r0, r2, ror #2 * 1
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 2
subhs r0, r0, r2, ror #2 * 2
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 3
subhs r0, r0, r2, ror #2 * 3
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 4
subhs r0, r0, r2, ror #2 * 4
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 5
subhs r0, r0, r2, ror #2 * 5
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 6
subhs r0, r0, r2, ror #2 * 6
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 7
subhs r0, r0, r2, ror #2 * 7
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 8
subhs r0, r0, r2, ror #2 * 8
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 9
subhs r0, r0, r2, ror #2 * 9
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 10
subhs r0, r0, r2, ror #2 * 10
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 11
subhs r0, r0, r2, ror #2 * 11
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 12
subhs r0, r0, r2, ror #2 * 12
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 13
subhs r0, r0, r2, ror #2 * 13
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 14
subhs r0, r0, r2, ror #2 * 14
adc r2, r1, r2, lsl #1
cmp r0, r2, ror #2 * 15
subhs r0, r0, r2, ror #2 * 15
adc r2, r1, r2, lsl #1
bic r0, r2, #3 << 30 @ for rounding add: cmp r0, r2 adc r2, #1
bx lr

View File

@ -0,0 +1,207 @@
/*
* Written by Wilco Dijkstra, 1996. The following email exchange establishes the
* license.
*
* From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
* Date: Fri, Jun 24, 2011 at 3:20 AM
* Subject: Re: sqrt routine
* To: Kevin Ma <kma@google.com>
* Hi Kevin,
* Thanks for asking. Those routines are public domain (originally posted to
* comp.sys.arm a long time ago), so you can use them freely for any purpose.
* Cheers,
* Wilco
*
* ----- Original Message -----
* From: "Kevin Ma" <kma@google.com>
* To: <Wilco.Dijkstra@ntlworld.com>
* Sent: Thursday, June 23, 2011 11:44 PM
* Subject: Fwd: sqrt routine
* Hi Wilco,
* I saw your sqrt routine from several web sites, including
* http://www.finesse.demon.co.uk/steven/sqrt.html.
* Just wonder if there's any copyright information with your Successive
* approximation routines, or if I can freely use it for any purpose.
* Thanks.
* Kevin
*/
// Minor modifications in code style for WebRTC, 2012.
// Code optimizations for MIPS, 2013.
#include "common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h"
/*
* Algorithm:
* Successive approximation of the equation (root + delta) ^ 2 = N
* until delta < 1. If delta < 1 we have the integer part of SQRT (N).
* Use delta = 2^i for i = 15 .. 0.
*
* Output precision is 16 bits. Note for large input values (close to
* 0x7FFFFFFF), bit 15 (the highest bit of the low 16-bit half word)
* contains the MSB information (a non-sign value). Do with caution
* if you need to cast the output to int16_t type.
*
* If the input value is negative, it returns 0.
*/
int32_t WebRtcSpl_SqrtFloor(int32_t value)
{
int32_t root = 0, tmp1, tmp2, tmp3, tmp4;
__asm __volatile(
".set push \n\t"
".set noreorder \n\t"
"lui %[tmp1], 0x4000 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"sub %[tmp3], %[value], %[tmp1] \n\t"
"lui %[tmp1], 0x1 \n\t"
"or %[tmp4], %[root], %[tmp1] \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x4000 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 14 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x8000 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x2000 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 13 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x4000 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x1000 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 12 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x2000 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x800 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 11 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x1000 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x400 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 10 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x800 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x200 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 9 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x400 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x100 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 8 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x200 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x80 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 7 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x100 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x40 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 6 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x80 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x20 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 5 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x40 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x10 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 4 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x20 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x8 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 3 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x10 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x4 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 2 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x8 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x2 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 1 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x4 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x1 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x2 \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
".set pop \n\t"
: [root] "+r" (root), [value] "+r" (value),
[tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
[tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4)
:
);
return root >> 1;
}