Update to current webrtc library
This is from the upstream library commit id 3326535126e435f1ba647885ce43a8f0f3d317eb, corresponding to Chromium 88.0.4290.1.
This commit is contained in:
58
webrtc/common_audio/third_party/ooura/BUILD.gn
vendored
Normal file
58
webrtc/common_audio/third_party/ooura/BUILD.gn
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
# Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
|
||||
#
|
||||
# Use of this source code is governed by a BSD-style license
|
||||
# that can be found in the ../../../LICENSE file in the root of the source
|
||||
# tree. An additional intellectual property rights grant can be found
|
||||
# in the file PATENTS. All contributing project authors may
|
||||
# be found in the AUTHORS file in the root of the source tree.
|
||||
|
||||
import("../../../webrtc.gni")
|
||||
|
||||
# Build target for the fixed-size (128) Ooura FFT. The portable sources are
# always compiled; each conditional below appends one architecture-specific
# implementation file (SSE2, NEON or MIPS) to the same target.
rtc_library("fft_size_128") {
|
||||
sources = [
|
||||
"fft_size_128/ooura_fft.cc",
|
||||
"fft_size_128/ooura_fft.h",
|
||||
"fft_size_128/ooura_fft_tables_common.h",
|
||||
]
|
||||
deps = [
|
||||
"../../../rtc_base/system:arch",
|
||||
"../../../system_wrappers",
|
||||
]
|
||||
# Starts empty so the architecture branches can append flags with "+=".
cflags = []
|
||||
|
||||
# x86 / x64: add the SSE2 implementation and the table header it shares with
# the NEON build.
if (current_cpu == "x86" || current_cpu == "x64") {
|
||||
sources += [
|
||||
"fft_size_128/ooura_fft_sse2.cc",
|
||||
"fft_size_128/ooura_fft_tables_neon_sse2.h",
|
||||
]
|
||||
# The explicit -msse2 flag is only passed on POSIX and Fuchsia toolchains.
if (is_posix || is_fuchsia) {
|
||||
cflags += [ "-msse2" ]
|
||||
}
|
||||
}
|
||||
|
||||
# NEON builds: add the NEON implementation plus an extra dependency on
# common_audio.
if (rtc_build_with_neon) {
|
||||
sources += [
|
||||
"fft_size_128/ooura_fft_neon.cc",
|
||||
"fft_size_128/ooura_fft_tables_neon_sse2.h",
|
||||
]
|
||||
|
||||
deps += [ "../../../common_audio" ]
|
||||
|
||||
# Every NEON target except arm64 swaps the default ARM FPU config for an
# explicit -mfpu=neon.
if (current_cpu != "arm64") {
|
||||
# Enable compilation for the NEON instruction set.
|
||||
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
|
||||
cflags += [ "-mfpu=neon" ]
|
||||
}
|
||||
}
|
||||
|
||||
# Little-endian MIPS with the hard-float ABI: add the MIPS implementation.
if (current_cpu == "mipsel" && mips_float_abi == "hard") {
|
||||
sources += [ "fft_size_128/ooura_fft_mips.cc" ]
|
||||
}
|
||||
}
|
||||
|
||||
# Build target for the fft4g sources kept under fft_size_256/. It lists no
# dependencies and has no architecture-specific branches.
rtc_library("fft_size_256") {
|
||||
sources = [
|
||||
"fft_size_256/fft4g.cc",
|
||||
"fft_size_256/fft4g.h",
|
||||
]
|
||||
}
|
8
webrtc/common_audio/third_party/ooura/LICENSE
vendored
Normal file
8
webrtc/common_audio/third_party/ooura/LICENSE
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
/*
|
||||
* http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
|
||||
* Copyright Takuya OOURA, 1996-2001
|
||||
*
|
||||
* You may use, copy, modify and distribute this code for any purpose (include
|
||||
* commercial use) and without fee. Please refer to this package when you modify
|
||||
* this code.
|
||||
*/
|
548
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
vendored
Normal file
548
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
vendored
Normal file
@ -0,0 +1,548 @@
|
||||
/*
|
||||
* http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
|
||||
* Copyright Takuya OOURA, 1996-2001
|
||||
*
|
||||
* You may use, copy, modify and distribute this code for any purpose (include
|
||||
* commercial use) and without fee. Please refer to this package when you modify
|
||||
* this code.
|
||||
*
|
||||
* Changes by the WebRTC authors:
|
||||
* - Trivial type modifications.
|
||||
* - Minimal code subset to do rdft of length 128.
|
||||
* - Optimizations because of known length.
|
||||
* - Removed the global variables by moving the code in to a class in order
|
||||
* to make it thread safe.
|
||||
*
|
||||
* All changes are covered by the WebRTC license and IP grant:
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
|
||||
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
|
||||
#include "rtc_base/system/arch.h"
|
||||
#include "system_wrappers/include/cpu_features_wrapper.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
namespace {
|
||||
|
||||
#if !(defined(MIPS_FPU_LE) || defined(WEBRTC_HAS_NEON))
|
||||
// Portable C implementation of the first butterfly pass over the 128-float
// buffer `a`, performed in place. The buffer is processed in groups of 16
// floats: the first group (a[0..15]) is handled by hand-simplified code and
// the remaining groups by the loop, using weights read from rdft_w,
// rdft_wk3ri_first and rdft_wk3ri_second.
// Only compiled when neither MIPS_FPU_LE nor WEBRTC_HAS_NEON is defined.
// NOTE(review): the r/i suffixes presumably denote the real and imaginary
// parts of interleaved complex values - confirm against the table header.
static void cft1st_128_C(float* a) {
|
||||
const int n = 128;
|
||||
int j, k1, k2;
|
||||
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
|
||||
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
||||
|
||||
// The processing of the first set of elements was simplified in C to avoid
|
||||
// some operations (multiplication by zero or one, addition of two elements
|
||||
// multiplied by the same weight, ...).
|
||||
// a[0..7]: no weights are needed for this part.
x0r = a[0] + a[2];
|
||||
x0i = a[1] + a[3];
|
||||
x1r = a[0] - a[2];
|
||||
x1i = a[1] - a[3];
|
||||
x2r = a[4] + a[6];
|
||||
x2i = a[5] + a[7];
|
||||
x3r = a[4] - a[6];
|
||||
x3i = a[5] - a[7];
|
||||
a[0] = x0r + x2r;
|
||||
a[1] = x0i + x2i;
|
||||
a[4] = x0r - x2r;
|
||||
a[5] = x0i - x2i;
|
||||
a[2] = x1r - x3i;
|
||||
a[3] = x1i + x3r;
|
||||
a[6] = x1r + x3i;
|
||||
a[7] = x1i - x3r;
|
||||
// a[8..15]: a single weight, rdft_w[2], is enough for this part.
wk1r = rdft_w[2];
|
||||
x0r = a[8] + a[10];
|
||||
x0i = a[9] + a[11];
|
||||
x1r = a[8] - a[10];
|
||||
x1i = a[9] - a[11];
|
||||
x2r = a[12] + a[14];
|
||||
x2i = a[13] + a[15];
|
||||
x3r = a[12] - a[14];
|
||||
x3i = a[13] - a[15];
|
||||
a[8] = x0r + x2r;
|
||||
a[9] = x0i + x2i;
|
||||
a[12] = x2i - x0i;
|
||||
a[13] = x0r - x2r;
|
||||
x0r = x1r - x3i;
|
||||
x0i = x1i + x3r;
|
||||
a[10] = wk1r * (x0r - x0i);
|
||||
a[11] = wk1r * (x0r + x0i);
|
||||
x0r = x3i + x1r;
|
||||
x0i = x3r - x1i;
|
||||
a[14] = wk1r * (x0i - x0r);
|
||||
a[15] = wk1r * (x0i + x0r);
|
||||
// General case: j walks the rest of the buffer 16 floats at a time while k1
// (advanced by 2 per group) and k2 = 2 * k1 index the weight tables.
k1 = 0;
|
||||
for (j = 16; j < n; j += 16) {
|
||||
k1 += 2;
|
||||
k2 = 2 * k1;
|
||||
wk2r = rdft_w[k1 + 0];
|
||||
wk2i = rdft_w[k1 + 1];
|
||||
wk1r = rdft_w[k2 + 0];
|
||||
wk1i = rdft_w[k2 + 1];
|
||||
wk3r = rdft_wk3ri_first[k1 + 0];
|
||||
wk3i = rdft_wk3ri_first[k1 + 1];
|
||||
// First half of the group: a[j + 0 .. j + 7].
x0r = a[j + 0] + a[j + 2];
|
||||
x0i = a[j + 1] + a[j + 3];
|
||||
x1r = a[j + 0] - a[j + 2];
|
||||
x1i = a[j + 1] - a[j + 3];
|
||||
x2r = a[j + 4] + a[j + 6];
|
||||
x2i = a[j + 5] + a[j + 7];
|
||||
x3r = a[j + 4] - a[j + 6];
|
||||
x3i = a[j + 5] - a[j + 7];
|
||||
a[j + 0] = x0r + x2r;
|
||||
a[j + 1] = x0i + x2i;
|
||||
x0r -= x2r;
|
||||
x0i -= x2i;
|
||||
a[j + 4] = wk2r * x0r - wk2i * x0i;
|
||||
a[j + 5] = wk2r * x0i + wk2i * x0r;
|
||||
x0r = x1r - x3i;
|
||||
x0i = x1i + x3r;
|
||||
a[j + 2] = wk1r * x0r - wk1i * x0i;
|
||||
a[j + 3] = wk1r * x0i + wk1i * x0r;
|
||||
x0r = x1r + x3i;
|
||||
x0i = x1i - x3r;
|
||||
a[j + 6] = wk3r * x0r - wk3i * x0i;
|
||||
a[j + 7] = wk3r * x0i + wk3i * x0r;
|
||||
// Second half of the group: a[j + 8 .. j + 15]. wk1 and wk3 are reloaded
// (next rdft_w pair and the rdft_wk3ri_second table); wk2r/wk2i are reused
// below with their roles exchanged and wk2i negated.
wk1r = rdft_w[k2 + 2];
|
||||
wk1i = rdft_w[k2 + 3];
|
||||
wk3r = rdft_wk3ri_second[k1 + 0];
|
||||
wk3i = rdft_wk3ri_second[k1 + 1];
|
||||
x0r = a[j + 8] + a[j + 10];
|
||||
x0i = a[j + 9] + a[j + 11];
|
||||
x1r = a[j + 8] - a[j + 10];
|
||||
x1i = a[j + 9] - a[j + 11];
|
||||
x2r = a[j + 12] + a[j + 14];
|
||||
x2i = a[j + 13] + a[j + 15];
|
||||
x3r = a[j + 12] - a[j + 14];
|
||||
x3i = a[j + 13] - a[j + 15];
|
||||
a[j + 8] = x0r + x2r;
|
||||
a[j + 9] = x0i + x2i;
|
||||
x0r -= x2r;
|
||||
x0i -= x2i;
|
||||
a[j + 12] = -wk2i * x0r - wk2r * x0i;
|
||||
a[j + 13] = -wk2i * x0i + wk2r * x0r;
|
||||
x0r = x1r - x3i;
|
||||
x0i = x1i + x3r;
|
||||
a[j + 10] = wk1r * x0r - wk1i * x0i;
|
||||
a[j + 11] = wk1r * x0i + wk1i * x0r;
|
||||
x0r = x1r + x3i;
|
||||
x0i = x1i - x3r;
|
||||
a[j + 14] = wk3r * x0r - wk3i * x0i;
|
||||
a[j + 15] = wk3r * x0i + wk3i * x0r;
|
||||
}
|
||||
}
|
||||
|
||||
// Portable C implementation of the second butterfly pass over the 128-float
// buffer `a`, performed in place. Each inner loop combines four two-float
// entries that are 8 floats apart (offsets j0, j0 + 8, j0 + 16, j0 + 24) and
// runs four times (j0 advances by 2 over a span of l = 8). The four loops
// start at offsets 0, 32 (m), 64 (k) and 96 (k + m) respectively.
// Only compiled when neither MIPS_FPU_LE nor WEBRTC_HAS_NEON is defined.
static void cftmdl_128_C(float* a) {
|
||||
const int l = 8;
|
||||
const int n = 128;
|
||||
const int m = 32;
|
||||
int j0, j1, j2, j3, k, k1, k2, m2;
|
||||
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
|
||||
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
||||
|
||||
// Block starting at offset 0: no weights are needed.
for (j0 = 0; j0 < l; j0 += 2) {
|
||||
j1 = j0 + 8;
|
||||
j2 = j0 + 16;
|
||||
j3 = j0 + 24;
|
||||
x0r = a[j0 + 0] + a[j1 + 0];
|
||||
x0i = a[j0 + 1] + a[j1 + 1];
|
||||
x1r = a[j0 + 0] - a[j1 + 0];
|
||||
x1i = a[j0 + 1] - a[j1 + 1];
|
||||
x2r = a[j2 + 0] + a[j3 + 0];
|
||||
x2i = a[j2 + 1] + a[j3 + 1];
|
||||
x3r = a[j2 + 0] - a[j3 + 0];
|
||||
x3i = a[j2 + 1] - a[j3 + 1];
|
||||
a[j0 + 0] = x0r + x2r;
|
||||
a[j0 + 1] = x0i + x2i;
|
||||
a[j2 + 0] = x0r - x2r;
|
||||
a[j2 + 1] = x0i - x2i;
|
||||
a[j1 + 0] = x1r - x3i;
|
||||
a[j1 + 1] = x1i + x3r;
|
||||
a[j3 + 0] = x1r + x3i;
|
||||
a[j3 + 1] = x1i - x3r;
|
||||
}
|
||||
// Block starting at offset m (32): a single weight, rdft_w[2], is used.
wk1r = rdft_w[2];
|
||||
for (j0 = m; j0 < l + m; j0 += 2) {
|
||||
j1 = j0 + 8;
|
||||
j2 = j0 + 16;
|
||||
j3 = j0 + 24;
|
||||
x0r = a[j0 + 0] + a[j1 + 0];
|
||||
x0i = a[j0 + 1] + a[j1 + 1];
|
||||
x1r = a[j0 + 0] - a[j1 + 0];
|
||||
x1i = a[j0 + 1] - a[j1 + 1];
|
||||
x2r = a[j2 + 0] + a[j3 + 0];
|
||||
x2i = a[j2 + 1] + a[j3 + 1];
|
||||
x3r = a[j2 + 0] - a[j3 + 0];
|
||||
x3i = a[j2 + 1] - a[j3 + 1];
|
||||
a[j0 + 0] = x0r + x2r;
|
||||
a[j0 + 1] = x0i + x2i;
|
||||
a[j2 + 0] = x2i - x0i;
|
||||
a[j2 + 1] = x0r - x2r;
|
||||
x0r = x1r - x3i;
|
||||
x0i = x1i + x3r;
|
||||
a[j1 + 0] = wk1r * (x0r - x0i);
|
||||
a[j1 + 1] = wk1r * (x0r + x0i);
|
||||
x0r = x3i + x1r;
|
||||
x0i = x3r - x1i;
|
||||
a[j3 + 0] = wk1r * (x0i - x0r);
|
||||
a[j3 + 1] = wk1r * (x0i + x0r);
|
||||
}
|
||||
// General case. With m2 = 64 and n = 128 the outer loop body runs exactly
// once (k = 64); it covers the blocks starting at k and at k + m.
k1 = 0;
|
||||
m2 = 2 * m;
|
||||
for (k = m2; k < n; k += m2) {
|
||||
k1 += 2;
|
||||
k2 = 2 * k1;
|
||||
wk2r = rdft_w[k1 + 0];
|
||||
wk2i = rdft_w[k1 + 1];
|
||||
wk1r = rdft_w[k2 + 0];
|
||||
wk1i = rdft_w[k2 + 1];
|
||||
wk3r = rdft_wk3ri_first[k1 + 0];
|
||||
wk3i = rdft_wk3ri_first[k1 + 1];
|
||||
// Block starting at offset k.
for (j0 = k; j0 < l + k; j0 += 2) {
|
||||
j1 = j0 + 8;
|
||||
j2 = j0 + 16;
|
||||
j3 = j0 + 24;
|
||||
x0r = a[j0 + 0] + a[j1 + 0];
|
||||
x0i = a[j0 + 1] + a[j1 + 1];
|
||||
x1r = a[j0 + 0] - a[j1 + 0];
|
||||
x1i = a[j0 + 1] - a[j1 + 1];
|
||||
x2r = a[j2 + 0] + a[j3 + 0];
|
||||
x2i = a[j2 + 1] + a[j3 + 1];
|
||||
x3r = a[j2 + 0] - a[j3 + 0];
|
||||
x3i = a[j2 + 1] - a[j3 + 1];
|
||||
a[j0 + 0] = x0r + x2r;
|
||||
a[j0 + 1] = x0i + x2i;
|
||||
x0r -= x2r;
|
||||
x0i -= x2i;
|
||||
a[j2 + 0] = wk2r * x0r - wk2i * x0i;
|
||||
a[j2 + 1] = wk2r * x0i + wk2i * x0r;
|
||||
x0r = x1r - x3i;
|
||||
x0i = x1i + x3r;
|
||||
a[j1 + 0] = wk1r * x0r - wk1i * x0i;
|
||||
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
|
||||
x0r = x1r + x3i;
|
||||
x0i = x1i - x3r;
|
||||
a[j3 + 0] = wk3r * x0r - wk3i * x0i;
|
||||
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
|
||||
}
|
||||
// Block starting at offset k + m. wk1 and wk3 are reloaded (next rdft_w pair
// and the rdft_wk3ri_second table); wk2r/wk2i are reused below with their
// roles exchanged and wk2i negated.
wk1r = rdft_w[k2 + 2];
|
||||
wk1i = rdft_w[k2 + 3];
|
||||
wk3r = rdft_wk3ri_second[k1 + 0];
|
||||
wk3i = rdft_wk3ri_second[k1 + 1];
|
||||
for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
|
||||
j1 = j0 + 8;
|
||||
j2 = j0 + 16;
|
||||
j3 = j0 + 24;
|
||||
x0r = a[j0 + 0] + a[j1 + 0];
|
||||
x0i = a[j0 + 1] + a[j1 + 1];
|
||||
x1r = a[j0 + 0] - a[j1 + 0];
|
||||
x1i = a[j0 + 1] - a[j1 + 1];
|
||||
x2r = a[j2 + 0] + a[j3 + 0];
|
||||
x2i = a[j2 + 1] + a[j3 + 1];
|
||||
x3r = a[j2 + 0] - a[j3 + 0];
|
||||
x3i = a[j2 + 1] - a[j3 + 1];
|
||||
a[j0 + 0] = x0r + x2r;
|
||||
a[j0 + 1] = x0i + x2i;
|
||||
x0r -= x2r;
|
||||
x0i -= x2i;
|
||||
a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
|
||||
a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
|
||||
x0r = x1r - x3i;
|
||||
x0i = x1i + x3r;
|
||||
a[j1 + 0] = wk1r * x0r - wk1i * x0i;
|
||||
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
|
||||
x0r = x1r + x3i;
|
||||
x0i = x1i - x3r;
|
||||
a[j3 + 0] = wk3r * x0r - wk3i * x0i;
|
||||
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void rftfsub_128_C(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2, k1, k2;
|
||||
float wkr, wki, xr, xi, yr, yi;
|
||||
|
||||
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
|
||||
k2 = 128 - j2;
|
||||
k1 = 32 - j1;
|
||||
wkr = 0.5f - c[k1];
|
||||
wki = c[j1];
|
||||
xr = a[j2 + 0] - a[k2 + 0];
|
||||
xi = a[j2 + 1] + a[k2 + 1];
|
||||
yr = wkr * xr - wki * xi;
|
||||
yi = wkr * xi + wki * xr;
|
||||
a[j2 + 0] -= yr;
|
||||
a[j2 + 1] -= yi;
|
||||
a[k2 + 0] += yr;
|
||||
a[k2 + 1] -= yi;
|
||||
}
|
||||
}
|
||||
|
||||
static void rftbsub_128_C(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2, k1, k2;
|
||||
float wkr, wki, xr, xi, yr, yi;
|
||||
|
||||
a[1] = -a[1];
|
||||
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
|
||||
k2 = 128 - j2;
|
||||
k1 = 32 - j1;
|
||||
wkr = 0.5f - c[k1];
|
||||
wki = c[j1];
|
||||
xr = a[j2 + 0] - a[k2 + 0];
|
||||
xi = a[j2 + 1] + a[k2 + 1];
|
||||
yr = wkr * xr + wki * xi;
|
||||
yi = wkr * xi - wki * xr;
|
||||
a[j2 + 0] = a[j2 + 0] - yr;
|
||||
a[j2 + 1] = yi - a[j2 + 1];
|
||||
a[k2 + 0] = yr + a[k2 + 0];
|
||||
a[k2 + 1] = yi - a[k2 + 1];
|
||||
}
|
||||
a[65] = -a[65];
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
// Constructs an FFT object whose use of the SSE2 routines is chosen by the
// caller.
//
// sse2_available: honoured only when WEBRTC_ARCH_X86_FAMILY is defined; on
//                 every other architecture the SSE2 path is always disabled.
OouraFft::OouraFft(bool sse2_available) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
  use_sse2_ = sse2_available;
#else
  // The argument has no effect off x86. Consume it explicitly so builds with
  // -Wunused-parameter (and -Werror) stay clean.
  (void)sse2_available;
  use_sse2_ = false;
#endif
}
|
||||
|
||||
// Constructs an FFT object that decides by itself whether to use SSE2: on
// x86-family builds the CPU is queried through GetCPUInfo(kSSE2); everywhere
// else the SSE2 path stays disabled.
OouraFft::OouraFft() {
  use_sse2_ = false;
#if defined(WEBRTC_ARCH_X86_FAMILY)
  if (GetCPUInfo(kSSE2) != 0) {
    use_sse2_ = true;
  }
#endif
}
|
||||
|
||||
// Compiler-generated destructor, defined out of line in this translation
// unit; the constructors above only ever set the use_sse2_ flag.
OouraFft::~OouraFft() = default;
|
||||
|
||||
// Forward transform of the buffer `a`, computed in place.
//
// Runs the reordering pass (bitrv2_128), the forward butterfly passes
// (cftfsub_128) and the forward post-processing pass (rftfsub_128), then
// replaces a[0] and a[1] by their sum and their difference respectively.
// NOTE(review): `a` is expected to hold 128 floats, as the *_128 helpers
// index up to a[127] - confirm against callers.
void OouraFft::Fft(float* a) const {
  bitrv2_128(a);
  cftfsub_128(a);
  rftfsub_128(a);

  const float first = a[0];
  const float second = a[1];
  a[0] = first + second;
  a[1] = first - second;
}
|
||||
// Inverse transform of the buffer `a`, computed in place.
//
// First rewrites a[1] as half the difference of a[0] and a[1] and subtracts
// that value from a[0]; then runs the backward pre-processing pass
// (rftbsub_128), the reordering pass (bitrv2_128) and the backward butterfly
// passes (cftbsub_128).
// NOTE(review): `a` is expected to hold 128 floats, as the *_128 helpers
// index up to a[127] - confirm against callers.
void OouraFft::InverseFft(float* a) const {
  const float half_difference = 0.5f * (a[0] - a[1]);
  a[1] = half_difference;
  a[0] = a[0] - half_difference;

  rftbsub_128(a);
  bitrv2_128(a);
  cftbsub_128(a);
}
|
||||
|
||||
// Runs the first butterfly pass on `a` with the implementation selected for
// this build: MIPS, then NEON, take precedence at compile time; otherwise the
// SSE2 routine is used when this is an x86-family build and use_sse2_ is set,
// and the portable C routine in every remaining case.
void OouraFft::cft1st_128(float* a) const {
#if defined(MIPS_FPU_LE)
  cft1st_128_mips(a);
#elif defined(WEBRTC_HAS_NEON)
  cft1st_128_neon(a);
#else
#if defined(WEBRTC_ARCH_X86_FAMILY)
  if (use_sse2_) {
    cft1st_128_SSE2(a);
    return;
  }
#endif
  cft1st_128_C(a);
#endif
}
|
||||
// Runs the second butterfly pass on `a` with the implementation selected for
// this build: MIPS, then NEON, take precedence at compile time; otherwise the
// SSE2 routine is used when this is an x86-family build and use_sse2_ is set,
// and the portable C routine in every remaining case.
void OouraFft::cftmdl_128(float* a) const {
#if defined(MIPS_FPU_LE)
  cftmdl_128_mips(a);
#elif defined(WEBRTC_HAS_NEON)
  cftmdl_128_neon(a);
#else
#if defined(WEBRTC_ARCH_X86_FAMILY)
  if (use_sse2_) {
    cftmdl_128_SSE2(a);
    return;
  }
#endif
  cftmdl_128_C(a);
#endif
}
|
||||
// Runs the forward post-processing pass on `a` with the implementation
// selected for this build: MIPS, then NEON, take precedence at compile time;
// otherwise the SSE2 routine is used when this is an x86-family build and
// use_sse2_ is set, and the portable C routine in every remaining case.
void OouraFft::rftfsub_128(float* a) const {
#if defined(MIPS_FPU_LE)
  rftfsub_128_mips(a);
#elif defined(WEBRTC_HAS_NEON)
  rftfsub_128_neon(a);
#else
#if defined(WEBRTC_ARCH_X86_FAMILY)
  if (use_sse2_) {
    rftfsub_128_SSE2(a);
    return;
  }
#endif
  rftfsub_128_C(a);
#endif
}
|
||||
|
||||
// Runs the backward pre-processing pass on `a` with the implementation
// selected for this build: MIPS, then NEON, take precedence at compile time;
// otherwise the SSE2 routine is used when this is an x86-family build and
// use_sse2_ is set, and the portable C routine in every remaining case.
void OouraFft::rftbsub_128(float* a) const {
#if defined(MIPS_FPU_LE)
  rftbsub_128_mips(a);
#elif defined(WEBRTC_HAS_NEON)
  rftbsub_128_neon(a);
#else
#if defined(WEBRTC_ARCH_X86_FAMILY)
  if (use_sse2_) {
    rftbsub_128_SSE2(a);
    return;
  }
#endif
  rftbsub_128_C(a);
#endif
}
|
||||
|
||||
// Backward butterfly passes over `a`, in place.
//
// Calls cft1st_128 and cftmdl_128, then performs a final pass that combines,
// for each even offset p0 in [0, 32), the four two-float entries located at
// p0, p0 + 32, p0 + 64 and p0 + 96. Compared with cftfsub_128, the odd-index
// (second) float of the first two entries is negated on input and several
// output signs differ.
void OouraFft::cftbsub_128(float* a) const {
  cft1st_128(a);
  cftmdl_128(a);

  const int stride = 32;
  for (int p0 = 0; p0 < stride; p0 += 2) {
    const int p1 = p0 + stride;
    const int p2 = p1 + stride;
    const int p3 = p2 + stride;

    // All eight inputs are read before any output is written.
    const float u_re = a[p0] + a[p1];
    const float u_im = -a[p0 + 1] - a[p1 + 1];
    const float v_re = a[p0] - a[p1];
    const float v_im = -a[p0 + 1] + a[p1 + 1];
    const float w_re = a[p2] + a[p3];
    const float w_im = a[p2 + 1] + a[p3 + 1];
    const float z_re = a[p2] - a[p3];
    const float z_im = a[p2 + 1] - a[p3 + 1];

    a[p0] = u_re + w_re;
    a[p0 + 1] = u_im - w_im;
    a[p2] = u_re - w_re;
    a[p2 + 1] = u_im + w_im;
    a[p1] = v_re - z_im;
    a[p1 + 1] = v_im - z_re;
    a[p3] = v_re + z_im;
    a[p3 + 1] = v_im + z_re;
  }
}
|
||||
|
||||
// Forward butterfly passes over `a`, in place.
//
// Calls cft1st_128 and cftmdl_128, then performs a final pass that combines,
// for each even offset p0 in [0, 32), the four two-float entries located at
// p0, p0 + 32, p0 + 64 and p0 + 96.
void OouraFft::cftfsub_128(float* a) const {
  cft1st_128(a);
  cftmdl_128(a);

  const int stride = 32;
  for (int p0 = 0; p0 < stride; p0 += 2) {
    const int p1 = p0 + stride;
    const int p2 = p1 + stride;
    const int p3 = p2 + stride;

    // All eight inputs are read before any output is written.
    const float u_re = a[p0] + a[p1];
    const float u_im = a[p0 + 1] + a[p1 + 1];
    const float v_re = a[p0] - a[p1];
    const float v_im = a[p0 + 1] - a[p1 + 1];
    const float w_re = a[p2] + a[p3];
    const float w_im = a[p2 + 1] + a[p3 + 1];
    const float z_re = a[p2] - a[p3];
    const float z_im = a[p2 + 1] - a[p3 + 1];

    a[p0] = u_re + w_re;
    a[p0 + 1] = u_im + w_im;
    a[p2] = u_re - w_re;
    a[p2 + 1] = u_im - w_im;
    a[p1] = v_re - z_im;
    a[p1 + 1] = v_im + z_re;
    a[p3] = v_re + z_im;
    a[p3 + 1] = v_im - z_re;
  }
}
|
||||
|
||||
// Reordering pass: exchanges fixed pairs of two-float entries of `a` in place.
//
// The positions are derived from the four-entry offset table `ip`. For every
// (j, k) with j < k < 4, four exchanges are made; in addition, one exchange
// per k is made between the entries at 2 * k + 8 + ip[k] and eight floats
// further on.
//
// Alternatives that were tried and found to be no faster:
//   (a) Keeping the swap indexes in a lookup table (the index arithmetic is
//       effectively free while waiting on memory/L1).
//   (b) Moving two consecutive floats as one 64-bit integer (execution is
//       memory/L1 bound).
//   (c) Mixing float and 64-bit integer moves to maximize register
//       utilization (execution is memory/L1 bound).
//   (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
//   (e) Hard-coding the offsets to eliminate index calculations completely.
void OouraFft::bitrv2_128(float* a) const {
  const int ip[4] = {0, 64, 32, 96};

  // Exchanges the two-float entries that start at a[p] and a[q].
  const auto exchange = [a](unsigned int p, unsigned int q) {
    const float first_p = a[p + 0];
    const float second_p = a[p + 1];
    const float first_q = a[q + 0];
    const float second_q = a[q + 1];
    a[p + 0] = first_q;
    a[p + 1] = second_q;
    a[q + 0] = first_p;
    a[q + 1] = second_p;
  };

  for (unsigned int k = 0; k < 4; k++) {
    for (unsigned int j = 0; j < k; j++) {
      unsigned int j1 = 2 * j + ip[k];
      unsigned int k1 = 2 * k + ip[j];
      exchange(j1, k1);

      j1 += 8;
      k1 += 16;
      exchange(j1, k1);

      j1 += 8;
      k1 -= 8;
      exchange(j1, k1);

      j1 += 8;
      k1 += 16;
      exchange(j1, k1);
    }

    const unsigned int j1 = 2 * k + 8 + ip[k];
    const unsigned int k1 = j1 + 8;
    exchange(j1, k1);
  }
}
|
||||
|
||||
} // namespace webrtc
|
64
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft.h
vendored
Normal file
64
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft.h
vendored
Normal file
@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_
|
||||
#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_
|
||||
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
// Architecture-specific versions of the four passes that OouraFft selects
// between. Each group is declared only when its platform macro is defined,
// and every function takes the float buffer to be transformed in place.

// SSE2 versions, declared for x86-family builds.
// NOTE(review): presumably implemented in ooura_fft_sse2.cc - confirm.
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
void cft1st_128_SSE2(float* a);
|
||||
void cftmdl_128_SSE2(float* a);
|
||||
void rftfsub_128_SSE2(float* a);
|
||||
void rftbsub_128_SSE2(float* a);
|
||||
#endif
|
||||
|
||||
// MIPS versions, declared when MIPS_FPU_LE is defined.
// NOTE(review): presumably implemented in ooura_fft_mips.cc - confirm.
#if defined(MIPS_FPU_LE)
|
||||
void cft1st_128_mips(float* a);
|
||||
void cftmdl_128_mips(float* a);
|
||||
void rftfsub_128_mips(float* a);
|
||||
void rftbsub_128_mips(float* a);
|
||||
#endif
|
||||
|
||||
// NEON versions, declared when WEBRTC_HAS_NEON is defined.
#if defined(WEBRTC_HAS_NEON)
|
||||
void cft1st_128_neon(float* a);
|
||||
void cftmdl_128_neon(float* a);
|
||||
void rftfsub_128_neon(float* a);
|
||||
void rftbsub_128_neon(float* a);
|
||||
#endif
|
||||
|
||||
// Fixed-size FFT built from the *_128 routines. All transforms operate in
// place on a caller-supplied float buffer, and every method is const: the only
// state is a flag, fixed at construction, saying whether the SSE2 routines may
// be used.
class OouraFft {
 public:
  // Lets the caller state whether SSE2 support is available.
  explicit OouraFft(bool sse2_available);

  // Deprecated: this constructor will soon be removed.
  OouraFft();

  ~OouraFft();

  // Forward and inverse transforms of `a`, computed in place.
  void Fft(float* a) const;
  void InverseFft(float* a) const;

 private:
  // Reordering pass.
  void bitrv2_128(float* a) const;

  // Forward / backward butterfly drivers.
  void cftfsub_128(float* a) const;
  void cftbsub_128(float* a) const;

  // Individual passes; each forwards to a platform-specific implementation.
  void cft1st_128(float* a) const;
  void cftmdl_128(float* a) const;
  void rftfsub_128(float* a) const;
  void rftbsub_128(float* a) const;

  bool use_sse2_;
};
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_
|
1245
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_mips.cc
vendored
Normal file
1245
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_mips.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
351
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
vendored
Normal file
351
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
vendored
Normal file
@ -0,0 +1,351 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The rdft AEC algorithm, neon version of speed-critical functions.
|
||||
*
|
||||
* Based on the sse2 version.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
#if defined(WEBRTC_HAS_NEON)
|
||||
// NEON implementation of the first butterfly pass over the 128-float buffer
// `a`, performed in place. Each iteration loads 16 consecutive floats
// (a[j .. j + 15]), regroups them, applies the weights read from the
// rdft_wk1r/wk1i/wk2r/wk2i/wk3r/wk3i tables at offset k2 (advanced by 4 per
// iteration) and stores the 16 results back to the same positions.
// NOTE(review): the C-style cast on k_swap_sign presumably only strips a
// const qualifier, which vld1q_f32 does not require - confirm and drop it.
void cft1st_128_neon(float* a) {
|
||||
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
|
||||
int j, k2;
|
||||
|
||||
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
|
||||
// Load the 16 floats of this group as four vectors.
float32x4_t a00v = vld1q_f32(&a[j + 0]);
|
||||
float32x4_t a04v = vld1q_f32(&a[j + 4]);
|
||||
float32x4_t a08v = vld1q_f32(&a[j + 8]);
|
||||
float32x4_t a12v = vld1q_f32(&a[j + 12]);
|
||||
// Regroup: each vector below pairs the same 2-float slot taken from the
// first and from the second half of the group, so both halves are processed
// together.
float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
|
||||
float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
|
||||
float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
|
||||
float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
|
||||
const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
|
||||
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
|
||||
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
|
||||
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
|
||||
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
|
||||
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
|
||||
float32x4_t x0v = vaddq_f32(a01v, a23v);
|
||||
const float32x4_t x1v = vsubq_f32(a01v, a23v);
|
||||
const float32x4_t x2v = vaddq_f32(a45v, a67v);
|
||||
const float32x4_t x3v = vsubq_f32(a45v, a67v);
|
||||
// x3w is x3v with the two floats of every pair exchanged.
const float32x4_t x3w = vrev64q_f32(x3v);
|
||||
float32x4_t x0w;
|
||||
a01v = vaddq_f32(x0v, x2v);
|
||||
x0v = vsubq_f32(x0v, x2v);
|
||||
x0w = vrev64q_f32(x0v);
|
||||
// Each weighted result is formed as (wkNr * x) + (wkNi * pair-swapped x).
a45v = vmulq_f32(wk2rv, x0v);
|
||||
a45v = vmlaq_f32(a45v, wk2iv, x0w);
|
||||
x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
|
||||
x0w = vrev64q_f32(x0v);
|
||||
a23v = vmulq_f32(wk1rv, x0v);
|
||||
a23v = vmlaq_f32(a23v, wk1iv, x0w);
|
||||
x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
|
||||
x0w = vrev64q_f32(x0v);
|
||||
a67v = vmulq_f32(wk3rv, x0v);
|
||||
a67v = vmlaq_f32(a67v, wk3iv, x0w);
|
||||
// Undo the regrouping so the results land in their original positions.
a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
|
||||
a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
|
||||
a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
|
||||
a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
|
||||
vst1q_f32(&a[j + 0], a00v);
|
||||
vst1q_f32(&a[j + 4], a04v);
|
||||
vst1q_f32(&a[j + 8], a08v);
|
||||
vst1q_f32(&a[j + 12], a12v);
|
||||
}
|
||||
}
|
||||
|
||||
// NEON implementation of the second butterfly pass over the 128-float buffer
// `a`, performed in place. Both loops run four times (j advances by 2 over a
// span of l = 8) and each iteration handles two blocks at once by packing the
// 2-float entry of one block into the low half of a vector and the
// corresponding entry 32 floats later into the high half. The first loop
// covers offsets 0..63 and the second covers offsets 64..127.
// NOTE(review): the C-style cast on k_swap_sign presumably only strips a
// const qualifier, which vld1q_f32 does not require - confirm and drop it.
void cftmdl_128_neon(float* a) {
|
||||
int j;
|
||||
const int l = 8;
|
||||
const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
|
||||
// Weights for the first loop; overwritten from rdft_wk1r before the second.
float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
|
||||
|
||||
for (j = 0; j < l; j += 2) {
|
||||
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
|
||||
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
|
||||
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
|
||||
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
|
||||
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
|
||||
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
|
||||
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
|
||||
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
|
||||
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
|
||||
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
|
||||
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
|
||||
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
|
||||
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
|
||||
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
|
||||
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
|
||||
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
|
||||
const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
|
||||
const float32x4_t x1_x3_add =
|
||||
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const float32x4_t x1_x3_sub =
|
||||
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
// Broadcast the high-half lanes of the add/sub results so that the weighted
// outputs of the second block (offsets 40 and 56) can be formed with one
// multiply-accumulate and one multiply.
const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
|
||||
const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
|
||||
const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
|
||||
const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
|
||||
const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
|
||||
const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
|
||||
const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
|
||||
const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
|
||||
const float32x4_t xx1_rev = vrev64q_f32(xx1);
|
||||
const float32x4_t yy4_rev = vrev64q_f32(yy4);
|
||||
|
||||
vst1_f32(&a[j + 0], vget_low_f32(xx0));
|
||||
vst1_f32(&a[j + 32], vget_high_f32(xx0));
|
||||
vst1_f32(&a[j + 16], vget_low_f32(xx1));
|
||||
vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
|
||||
|
||||
// The first float of the pair just stored at a[j + 48] needs its sign
// flipped; this is done with a scalar fix-up.
a[j + 48] = -a[j + 48];
|
||||
|
||||
vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
|
||||
vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
|
||||
vst1_f32(&a[j + 40], vget_low_f32(yy4));
|
||||
vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
|
||||
}
|
||||
|
||||
// Second half of the buffer (offsets 64..127): every output except the plain
// sum is weighted, using the table entries at index k2 = 4.
{
|
||||
const int k = 64;
|
||||
const int k1 = 2;
|
||||
const int k2 = 2 * k1;
|
||||
const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
|
||||
const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
|
||||
const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
|
||||
const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
|
||||
const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
|
||||
wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
|
||||
for (j = k; j < l + k; j += 2) {
|
||||
const float32x2_t a_00 = vld1_f32(&a[j + 0]);
|
||||
const float32x2_t a_08 = vld1_f32(&a[j + 8]);
|
||||
const float32x2_t a_32 = vld1_f32(&a[j + 32]);
|
||||
const float32x2_t a_40 = vld1_f32(&a[j + 40]);
|
||||
const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
|
||||
const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
|
||||
const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
|
||||
const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
|
||||
const float32x2_t a_16 = vld1_f32(&a[j + 16]);
|
||||
const float32x2_t a_24 = vld1_f32(&a[j + 24]);
|
||||
const float32x2_t a_48 = vld1_f32(&a[j + 48]);
|
||||
const float32x2_t a_56 = vld1_f32(&a[j + 56]);
|
||||
const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
|
||||
const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
|
||||
const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
|
||||
const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
|
||||
const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
|
||||
const float32x4_t x1_x3_add =
|
||||
vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const float32x4_t x1_x3_sub =
|
||||
vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
// Each weighted result is formed as (wkNr * x) + (wkNi * pair-swapped x).
float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
|
||||
float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
|
||||
float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
|
||||
xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
|
||||
xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
|
||||
xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
|
||||
|
||||
vst1_f32(&a[j + 0], vget_low_f32(xx));
|
||||
vst1_f32(&a[j + 32], vget_high_f32(xx));
|
||||
vst1_f32(&a[j + 16], vget_low_f32(xx4));
|
||||
vst1_f32(&a[j + 48], vget_high_f32(xx4));
|
||||
vst1_f32(&a[j + 8], vget_low_f32(xx12));
|
||||
vst1_f32(&a[j + 40], vget_high_f32(xx12));
|
||||
vst1_f32(&a[j + 24], vget_low_f32(xx22));
|
||||
vst1_f32(&a[j + 56], vget_high_f32(xx22));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns `in` with its four lanes in reverse order: {A, B, C, D} becomes
// {D, C, B, A}.
__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
  // Exchange the two 64-bit halves: {A, B, C, D} -> {C, D, A, B}.
  const float32x2_t upper = vget_high_f32(in);
  const float32x2_t lower = vget_low_f32(in);
  const float32x4_t halves_exchanged = vcombine_f32(upper, lower);
  // Exchange the neighbours inside each half: {C, D, A, B} -> {D, C, B, A}.
  return vrev64q_f32(halves_exchanged);
}
|
||||
|
||||
// NEON implementation of the forward post-processing pass over the 128-float
// buffer `a`, performed in place. It computes the same per-entry update as
// the scalar loop at the bottom of the function, four entries per iteration:
// entries read upwards from a[j2] are combined with the mirror entries read
// downwards from the top of the buffer, using weights from the table that
// starts 32 floats into rdft_w. Entries left over once fewer than four remain
// are finished by the scalar loop.
void rftfsub_128_neon(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2;
|
||||
const float32x4_t mm_half = vdupq_n_f32(0.5f);
|
||||
|
||||
// Vectorized code (four at once).
|
||||
// Note: the numbers in the comments are the indexes used in the first
|
||||
// iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
|
||||
// Load 'wk'.
|
||||
const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4,
|
||||
const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31,
|
||||
const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31,
|
||||
const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
|
||||
const float32x4_t wki_ = c_j1; // 1, 2, 3, 4,
|
||||
// Load and shuffle 'a'.
|
||||
// 2, 4, 6, 8, 3, 5, 7, 9
|
||||
// vld2q_f32 de-interleaves: val[0] gets the even-offset floats and val[1]
// the odd-offset floats.
float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
|
||||
// 120, 122, 124, 126, 121, 123, 125, 127,
|
||||
const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
|
||||
// 126, 124, 122, 120
|
||||
const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
|
||||
// 127, 125, 123, 121
|
||||
const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
|
||||
// Calculate 'x'.
|
||||
const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
|
||||
// 2-126, 4-124, 6-122, 8-120,
|
||||
const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
|
||||
// 3-127, 5-125, 7-123, 9-121,
|
||||
// Calculate product into 'y'.
|
||||
// yr = wkr * xr - wki * xi;
|
||||
// yi = wkr * xi + wki * xr;
|
||||
const float32x4_t a_ = vmulq_f32(wkr_, xr_);
|
||||
const float32x4_t b_ = vmulq_f32(wki_, xi_);
|
||||
const float32x4_t c_ = vmulq_f32(wkr_, xi_);
|
||||
const float32x4_t d_ = vmulq_f32(wki_, xr_);
|
||||
const float32x4_t yr_ = vsubq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120,
|
||||
const float32x4_t yi_ = vaddq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121,
|
||||
// Update 'a'.
|
||||
// a[j2 + 0] -= yr;
|
||||
// a[j2 + 1] -= yi;
|
||||
// a[k2 + 0] += yr;
|
||||
// a[k2 + 1] -= yi;
|
||||
// 126, 124, 122, 120,
|
||||
const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
|
||||
// 127, 125, 123, 121,
|
||||
const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
|
||||
// Shuffle in right order and store.
|
||||
const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
|
||||
const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
|
||||
// 124, 125, 126, 127, 120, 121, 122, 123
|
||||
const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
|
||||
// 2, 4, 6, 8,
|
||||
a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
|
||||
// 3, 5, 7, 9,
|
||||
a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
|
||||
// 2, 3, 4, 5, 6, 7, 8, 9,
|
||||
vst2q_f32(&a[0 + j2], a_j2_p);
|
||||
|
||||
// The two halves of the zipped result are stored in exchanged order so that
// the mirror entries end up in ascending memory order again.
vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
|
||||
vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
|
||||
}
|
||||
|
||||
// Scalar code for the remaining items.
|
||||
// j1 and j2 continue from where the vectorized loop stopped.
for (; j2 < 64; j1 += 1, j2 += 2) {
|
||||
const int k2 = 128 - j2;
|
||||
const int k1 = 32 - j1;
|
||||
const float wkr = 0.5f - c[k1];
|
||||
const float wki = c[j1];
|
||||
const float xr = a[j2 + 0] - a[k2 + 0];
|
||||
const float xi = a[j2 + 1] + a[k2 + 1];
|
||||
const float yr = wkr * xr - wki * xi;
|
||||
const float yi = wkr * xi + wki * xr;
|
||||
a[j2 + 0] -= yr;
|
||||
a[j2 + 1] -= yi;
|
||||
a[k2 + 0] += yr;
|
||||
a[k2 + 1] -= yi;
|
||||
}
|
||||
}
|
||||
|
||||
// rftbsub step of the 128-point Ooura FFT, NEON version; |a| is updated in
// place and must hold at least 128 floats.
//
// a[1] is negated first. Then, for every j2 = 2, 4, ..., 62 with j1 = j2 / 2
// and k2 = 128 - j2:
//   wkr = 0.5 - c[32 - j1],  wki = c[j1]            (c = rdft_w + 32)
//   xr  = a[j2] - a[k2],     xi  = a[j2 + 1] + a[k2 + 1]
//   yr  = wkr * xr + wki * xi
//   yi  = wkr * xi - wki * xr
//   a[j2]     = a[j2] - yr;      a[k2]     = yr + a[k2];
//   a[j2 + 1] = yi - a[j2 + 1];  a[k2 + 1] = yi - a[k2 + 1];
// Finally a[65] is negated.
void rftbsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  const float32x4_t half = vdupq_n_f32(0.5f);
  int j1 = 1;
  int j2 = 2;

  a[1] = -a[1];

  // Vectorized part: four (j2, k2) pairs per iteration.
  // Note: the indexes quoted in the comments are those of the first iteration.
  for (; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t wki_v = vld1q_f32(&c[j1]);       // c[1, 2, 3, 4]
    const float32x4_t c_rev = vld1q_f32(&c[29 - j1]);  // c[28, 29, 30, 31]
    // 0.5 - c[31, 30, 29, 28]
    const float32x4_t wkr_v = reverse_order_f32x4(vsubq_f32(half, c_rev));

    // De-interleaving loads of 'a'.
    // lower.val[0] = a[2, 4, 6, 8], lower.val[1] = a[3, 5, 7, 9]
    float32x4x2_t lower = vld2q_f32(&a[0 + j2]);
    // upper.val[0] = a[120, 122, 124, 126], upper.val[1] = a[121, 123, 125, 127]
    const float32x4x2_t upper = vld2q_f32(&a[122 - j2]);
    // a[126, 124, 122, 120]
    const float32x4_t up_even = reverse_order_f32x4(upper.val[0]);
    // a[127, 125, 123, 121]
    const float32x4_t up_odd = reverse_order_f32x4(upper.val[1]);

    // 'x': a[2]-a[126], a[4]-a[124], a[6]-a[122], a[8]-a[120]
    const float32x4_t xr_v = vsubq_f32(lower.val[0], up_even);
    //      a[3]+a[127], a[5]+a[125], a[7]+a[123], a[9]+a[121]
    const float32x4_t xi_v = vaddq_f32(lower.val[1], up_odd);

    // 'y': yr = wkr * xr + wki * xi;  yi = wkr * xi - wki * xr;
    const float32x4_t wkr_xr = vmulq_f32(wkr_v, xr_v);
    const float32x4_t wki_xi = vmulq_f32(wki_v, xi_v);
    const float32x4_t wkr_xi = vmulq_f32(wkr_v, xi_v);
    const float32x4_t wki_xr = vmulq_f32(wki_v, xr_v);
    const float32x4_t yr_v = vaddq_f32(wkr_xr, wki_xi);
    const float32x4_t yi_v = vsubq_f32(wkr_xi, wki_xr);

    // a[k2 + 0] = yr + a[k2 + 0];  a[k2 + 1] = yi - a[k2 + 1];
    const float32x4_t up_even_new = vaddq_f32(up_even, yr_v);  // 126, 124, ...
    const float32x4_t up_odd_new = vsubq_f32(yi_v, up_odd);    // 127, 125, ...
    // Back to memory order: vrev64q gives {124, 126, 120, 122} and
    // {125, 127, 121, 123}; zipping them yields
    // val[0] = a[124, 125, 126, 127] and val[1] = a[120, 121, 122, 123].
    const float32x4x2_t upper_out =
        vzipq_f32(vrev64q_f32(up_even_new), vrev64q_f32(up_odd_new));

    // a[j2 + 0] = a[j2 + 0] - yr;  a[j2 + 1] = yi - a[j2 + 1];
    lower.val[0] = vsubq_f32(lower.val[0], yr_v);  // a[2, 4, 6, 8]
    lower.val[1] = vsubq_f32(yi_v, lower.val[1]);  // a[3, 5, 7, 9]

    // The interleaving store writes a[2 .. 9].
    vst2q_f32(&a[0 + j2], lower);
    vst1q_f32(&a[122 - j2], upper_out.val[1]);
    vst1q_f32(&a[126 - j2], upper_out.val[0]);
  }

  // Scalar code for the pairs the vector loop did not cover.
  for (; j2 < 64; ++j1, j2 += 2) {
    float* const aj = &a[j2];
    float* const ak = &a[128 - j2];
    const float wkr = 0.5f - c[32 - j1];
    const float wki = c[j1];
    const float xr = aj[0] - ak[0];
    const float xi = aj[1] + ak[1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;
    aj[0] = aj[0] - yr;
    aj[1] = yi - aj[1];
    ak[0] = yr + ak[0];
    ak[1] = yi - ak[1];
  }

  a[65] = -a[65];
}
|
||||
#endif
|
||||
|
||||
} // namespace webrtc
|
439
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc
vendored
Normal file
439
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc
vendored
Normal file
@ -0,0 +1,439 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h"
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
|
||||
namespace {
|
||||
// These intrinsics were unavailable before VS 2008.
|
||||
// TODO(andrew): move to a common file.
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1500
|
||||
// Fallback for compilers lacking the intrinsic: reinterprets the 128 bits of
// |a| as an __m128 without converting any value.
static __inline __m128 _mm_castsi128_ps(__m128i a) {
  const __m128* const as_ps = (const __m128*)&a;
  return *as_ps;
}
|
||||
// Fallback for compilers lacking the intrinsic: reinterprets the 128 bits of
// |a| as an __m128i without converting any value.
static __inline __m128i _mm_castps_si128(__m128 a) {
  const __m128i* const as_si = (const __m128i*)&a;
  return *as_si;
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
void cft1st_128_SSE2(float* a) {
|
||||
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
|
||||
int j, k2;
|
||||
|
||||
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
|
||||
__m128 a00v = _mm_loadu_ps(&a[j + 0]);
|
||||
__m128 a04v = _mm_loadu_ps(&a[j + 4]);
|
||||
__m128 a08v = _mm_loadu_ps(&a[j + 8]);
|
||||
__m128 a12v = _mm_loadu_ps(&a[j + 12]);
|
||||
__m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
__m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
|
||||
__m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
__m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
|
||||
|
||||
const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
|
||||
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
|
||||
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
|
||||
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
|
||||
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
|
||||
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
|
||||
__m128 x0v = _mm_add_ps(a01v, a23v);
|
||||
const __m128 x1v = _mm_sub_ps(a01v, a23v);
|
||||
const __m128 x2v = _mm_add_ps(a45v, a67v);
|
||||
const __m128 x3v = _mm_sub_ps(a45v, a67v);
|
||||
__m128 x0w;
|
||||
a01v = _mm_add_ps(x0v, x2v);
|
||||
x0v = _mm_sub_ps(x0v, x2v);
|
||||
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
{
|
||||
const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
|
||||
const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
|
||||
a45v = _mm_add_ps(a45_0v, a45_1v);
|
||||
}
|
||||
{
|
||||
__m128 a23_0v, a23_1v;
|
||||
const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
|
||||
x0v = _mm_add_ps(x1v, x3s);
|
||||
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
a23_0v = _mm_mul_ps(wk1rv, x0v);
|
||||
a23_1v = _mm_mul_ps(wk1iv, x0w);
|
||||
a23v = _mm_add_ps(a23_0v, a23_1v);
|
||||
|
||||
x0v = _mm_sub_ps(x1v, x3s);
|
||||
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
}
|
||||
{
|
||||
const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
|
||||
const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
|
||||
a67v = _mm_add_ps(a67_0v, a67_1v);
|
||||
}
|
||||
|
||||
a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
|
||||
a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
|
||||
_mm_storeu_ps(&a[j + 0], a00v);
|
||||
_mm_storeu_ps(&a[j + 4], a04v);
|
||||
_mm_storeu_ps(&a[j + 8], a08v);
|
||||
_mm_storeu_ps(&a[j + 12], a12v);
|
||||
}
|
||||
}
|
||||
|
||||
void cftmdl_128_SSE2(float* a) {
|
||||
const int l = 8;
|
||||
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
|
||||
int j0;
|
||||
|
||||
__m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
|
||||
for (j0 = 0; j0 < l; j0 += 2) {
|
||||
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
|
||||
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
|
||||
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
|
||||
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
|
||||
const __m128 a_00_32 =
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 a_08_40 =
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
|
||||
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
|
||||
|
||||
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
|
||||
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
|
||||
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
|
||||
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
|
||||
const __m128 a_16_48 =
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 a_24_56 =
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
|
||||
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
|
||||
|
||||
const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
|
||||
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
|
||||
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
|
||||
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
|
||||
const __m128 yy0 =
|
||||
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
const __m128 yy1 =
|
||||
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
|
||||
const __m128 yy3 = _mm_add_ps(yy0, yy2);
|
||||
const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 32],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 48],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
|
||||
a[j0 + 48] = -a[j0 + 48];
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 56],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
|
||||
}
|
||||
|
||||
{
|
||||
int k = 64;
|
||||
int k1 = 2;
|
||||
int k2 = 2 * k1;
|
||||
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
|
||||
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
|
||||
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
|
||||
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
|
||||
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
|
||||
wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
|
||||
for (j0 = k; j0 < l + k; j0 += 2) {
|
||||
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
|
||||
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
|
||||
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
|
||||
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
|
||||
const __m128 a_00_32 =
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 a_08_40 =
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
|
||||
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
|
||||
|
||||
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
|
||||
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
|
||||
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
|
||||
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
|
||||
const __m128 a_16_48 =
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 a_24_56 =
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
|
||||
_MM_SHUFFLE(1, 0, 1, 0));
|
||||
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
|
||||
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
|
||||
|
||||
const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
|
||||
const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
|
||||
const __m128 xx3 = _mm_mul_ps(
|
||||
wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
|
||||
_MM_SHUFFLE(2, 3, 0, 1))));
|
||||
const __m128 xx4 = _mm_add_ps(xx2, xx3);
|
||||
|
||||
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
|
||||
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
|
||||
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
|
||||
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
|
||||
|
||||
const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
|
||||
const __m128 xx11 = _mm_mul_ps(
|
||||
wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
|
||||
_MM_SHUFFLE(2, 3, 0, 1))));
|
||||
const __m128 xx12 = _mm_add_ps(xx10, xx11);
|
||||
|
||||
const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
|
||||
const __m128 xx21 = _mm_mul_ps(
|
||||
wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
|
||||
_MM_SHUFFLE(2, 3, 0, 1))));
|
||||
const __m128 xx22 = _mm_add_ps(xx20, xx21);
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 32],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 48],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 40],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
|
||||
_mm_storel_epi64(
|
||||
(__m128i*)&a[j0 + 56],
|
||||
_mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void rftfsub_128_SSE2(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2, k1, k2;
|
||||
float wkr, wki, xr, xi, yr, yi;
|
||||
|
||||
static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
|
||||
0.5f};
|
||||
const __m128 mm_half = _mm_load_ps(k_half);
|
||||
|
||||
// Vectorized code (four at once).
|
||||
// Note: commented number are indexes for the first iteration of the loop.
|
||||
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
|
||||
// Load 'wk'.
|
||||
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
|
||||
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
|
||||
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
|
||||
const __m128 wkr_ =
|
||||
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
|
||||
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
|
||||
// Load and shuffle 'a'.
|
||||
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
|
||||
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
|
||||
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
|
||||
const __m128 a_j2_p0 = _mm_shuffle_ps(
|
||||
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1 = _mm_shuffle_ps(
|
||||
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
|
||||
const __m128 a_k2_p0 = _mm_shuffle_ps(
|
||||
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
|
||||
const __m128 a_k2_p1 = _mm_shuffle_ps(
|
||||
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
|
||||
// Calculate 'x'.
|
||||
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
|
||||
// 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
|
||||
// 3-127, 5-125, 7-123, 9-121,
|
||||
// Calculate product into 'y'.
|
||||
// yr = wkr * xr - wki * xi;
|
||||
// yi = wkr * xi + wki * xr;
|
||||
const __m128 a_ = _mm_mul_ps(wkr_, xr_);
|
||||
const __m128 b_ = _mm_mul_ps(wki_, xi_);
|
||||
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
|
||||
const __m128 d_ = _mm_mul_ps(wki_, xr_);
|
||||
const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
|
||||
// Update 'a'.
|
||||
// a[j2 + 0] -= yr;
|
||||
// a[j2 + 1] -= yi;
|
||||
// a[k2 + 0] += yr;
|
||||
// a[k2 + 1] -= yi;
|
||||
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9,
|
||||
const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120,
|
||||
const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121,
|
||||
// Shuffle in right order and store.
|
||||
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
|
||||
// 2, 3, 4, 5,
|
||||
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
|
||||
// 6, 7, 8, 9,
|
||||
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
|
||||
// 122, 123, 120, 121,
|
||||
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
|
||||
// 126, 127, 124, 125,
|
||||
const __m128 a_k2_0n = _mm_shuffle_ps(
|
||||
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4n = _mm_shuffle_ps(
|
||||
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
|
||||
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
|
||||
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
|
||||
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
|
||||
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
|
||||
}
|
||||
// Scalar code for the remaining items.
|
||||
for (; j2 < 64; j1 += 1, j2 += 2) {
|
||||
k2 = 128 - j2;
|
||||
k1 = 32 - j1;
|
||||
wkr = 0.5f - c[k1];
|
||||
wki = c[j1];
|
||||
xr = a[j2 + 0] - a[k2 + 0];
|
||||
xi = a[j2 + 1] + a[k2 + 1];
|
||||
yr = wkr * xr - wki * xi;
|
||||
yi = wkr * xi + wki * xr;
|
||||
a[j2 + 0] -= yr;
|
||||
a[j2 + 1] -= yi;
|
||||
a[k2 + 0] += yr;
|
||||
a[k2 + 1] -= yi;
|
||||
}
|
||||
}
|
||||
|
||||
void rftbsub_128_SSE2(float* a) {
|
||||
const float* c = rdft_w + 32;
|
||||
int j1, j2, k1, k2;
|
||||
float wkr, wki, xr, xi, yr, yi;
|
||||
|
||||
static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
|
||||
0.5f};
|
||||
const __m128 mm_half = _mm_load_ps(k_half);
|
||||
|
||||
a[1] = -a[1];
|
||||
// Vectorized code (four at once).
|
||||
// Note: commented number are indexes for the first iteration of the loop.
|
||||
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
|
||||
// Load 'wk'.
|
||||
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
|
||||
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
|
||||
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
|
||||
const __m128 wkr_ =
|
||||
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
|
||||
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
|
||||
// Load and shuffle 'a'.
|
||||
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
|
||||
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
|
||||
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
|
||||
const __m128 a_j2_p0 = _mm_shuffle_ps(
|
||||
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1 = _mm_shuffle_ps(
|
||||
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
|
||||
const __m128 a_k2_p0 = _mm_shuffle_ps(
|
||||
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
|
||||
const __m128 a_k2_p1 = _mm_shuffle_ps(
|
||||
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
|
||||
// Calculate 'x'.
|
||||
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
|
||||
// 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
|
||||
// 3-127, 5-125, 7-123, 9-121,
|
||||
// Calculate product into 'y'.
|
||||
// yr = wkr * xr + wki * xi;
|
||||
// yi = wkr * xi - wki * xr;
|
||||
const __m128 a_ = _mm_mul_ps(wkr_, xr_);
|
||||
const __m128 b_ = _mm_mul_ps(wki_, xi_);
|
||||
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
|
||||
const __m128 d_ = _mm_mul_ps(wki_, xr_);
|
||||
const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
|
||||
const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
|
||||
// Update 'a'.
|
||||
// a[j2 + 0] = a[j2 + 0] - yr;
|
||||
// a[j2 + 1] = yi - a[j2 + 1];
|
||||
// a[k2 + 0] = yr + a[k2 + 0];
|
||||
// a[k2 + 1] = yi - a[k2 + 1];
|
||||
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
|
||||
const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9,
|
||||
const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120,
|
||||
const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121,
|
||||
// Shuffle in right order and store.
|
||||
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
|
||||
// 2, 3, 4, 5,
|
||||
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
|
||||
// 6, 7, 8, 9,
|
||||
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
|
||||
// 122, 123, 120, 121,
|
||||
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
|
||||
// 126, 127, 124, 125,
|
||||
const __m128 a_k2_0n = _mm_shuffle_ps(
|
||||
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
|
||||
const __m128 a_k2_4n = _mm_shuffle_ps(
|
||||
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
|
||||
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
|
||||
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
|
||||
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
|
||||
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
|
||||
}
|
||||
// Scalar code for the remaining items.
|
||||
for (; j2 < 64; j1 += 1, j2 += 2) {
|
||||
k2 = 128 - j2;
|
||||
k1 = 32 - j1;
|
||||
wkr = 0.5f - c[k1];
|
||||
wki = c[j1];
|
||||
xr = a[j2 + 0] - a[k2 + 0];
|
||||
xi = a[j2 + 1] + a[k2 + 1];
|
||||
yr = wkr * xr + wki * xi;
|
||||
yi = wkr * xi - wki * xr;
|
||||
a[j2 + 0] = a[j2 + 0] - yr;
|
||||
a[j2 + 1] = yi - a[j2 + 1];
|
||||
a[k2 + 0] = yr + a[k2 + 0];
|
||||
a[k2 + 1] = yi - a[k2 + 1];
|
||||
}
|
||||
a[65] = -a[65];
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace webrtc
|
54
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h
vendored
Normal file
54
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_
|
||||
#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_
|
||||
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
// These tables used to be computed at run-time. For example, refer to:
|
||||
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
|
||||
// to see the initialization code.
|
||||
// Constants shared by all paths (C, SSE2, NEON).
// The rftfsub_128_* and rftbsub_128_* kernels use the upper half of this table
// through `c = rdft_w + 32` and read c[1] .. c[31] from it.
|
||||
const float rdft_w[64] = {
|
||||
1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f, 0.9238795638f,
|
||||
0.3826834559f, 0.3826834559f, 0.9238795638f, 0.9807852507f, 0.1950903237f,
|
||||
0.5555702448f, 0.8314695954f, 0.8314695954f, 0.5555702448f, 0.1950903237f,
|
||||
0.9807852507f, 0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
|
||||
0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f, 0.9569403529f,
|
||||
0.2902846634f, 0.4713967443f, 0.8819212914f, 0.7730104327f, 0.6343933344f,
|
||||
0.0980171412f, 0.9951847196f, 0.7071067691f, 0.4993977249f, 0.4975923598f,
|
||||
0.4945882559f, 0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
|
||||
0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f, 0.4157347977f,
|
||||
0.4016037583f, 0.3865052164f, 0.3704755902f, 0.3535533845f, 0.3357794881f,
|
||||
0.3171966672f, 0.2978496552f, 0.2777851224f, 0.2570513785f, 0.2356983721f,
|
||||
0.2137775421f, 0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
|
||||
0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
|
||||
};
|
||||
|
||||
// Constants used by the C and MIPS paths.
// NOTE(review): the values look like interleaved (wk3r, wk3i) pairs -- even
// entries coincide with rdft_wk3r[0, 4, 8, ...] and odd entries with
// rdft_wk3i[1, 5, 9, ...] from ooura_fft_tables_neon_sse2.h. Confirm against
// the C path before relying on this.
|
||||
const float rdft_wk3ri_first[16] = {
|
||||
1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
|
||||
0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
|
||||
0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
|
||||
0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
|
||||
};
|
||||
// NOTE(review): the values look like interleaved (wk3r, wk3i) pairs -- even
// entries coincide with rdft_wk3r[2, 6, 10, ...] and odd entries with
// rdft_wk3i[3, 7, 11, ...] from ooura_fft_tables_neon_sse2.h. Confirm against
// the C path before relying on this.
const float rdft_wk3ri_second[16] = {
|
||||
-0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
|
||||
-0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
|
||||
-0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
|
||||
-0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
|
||||
};
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_
|
98
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
vendored
Normal file
98
webrtc/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
vendored
Normal file
@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
|
||||
#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
|
||||
|
||||
#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
#ifdef _MSC_VER /* visual c++ */
|
||||
#define ALIGN16_BEG __declspec(align(16))
|
||||
#define ALIGN16_END
|
||||
#else /* gcc or icc */
|
||||
#define ALIGN16_BEG
|
||||
#define ALIGN16_END __attribute__((aligned(16)))
|
||||
#endif
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
// These tables used to be computed at run-time. For example, refer to:
|
||||
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
|
||||
// to see the initialization code.
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY) || defined(WEBRTC_HAS_NEON)
|
||||
// Constants used by SSE2 and NEON but initialized in the C path.
|
||||
// Lane-wise sign mask {-1, +1, -1, +1}. cft1st_128_SSE2 and cftmdl_128_SSE2
// multiply by it to negate lanes 0 and 2; it is 16-byte aligned because they
// read it with _mm_load_ps.
const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
|
||||
|
||||
// Every coefficient is stored twice in a row, so one 4-float load yields
// {r0, r0, r1, r1}. cft1st_128_SSE2 loads entries [k2, k2 + 4) for
// k2 = 0, 4, ..., 28; cftmdl_128_SSE2 loads entries [4, 8).
ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
|
||||
1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, 0.923879564f,
|
||||
0.923879564f, 0.382683456f, 0.382683456f, 0.980785251f, 0.980785251f,
|
||||
0.555570245f, 0.555570245f, 0.831469595f, 0.831469595f, 0.195090324f,
|
||||
0.195090324f, 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
|
||||
0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, 0.956940353f,
|
||||
0.956940353f, 0.471396744f, 0.471396744f, 0.773010433f, 0.773010433f,
|
||||
0.098017141f, 0.098017141f,
|
||||
};
|
||||
// Same layout as rdft_wk1r: every coefficient is stored twice in a row.
// cft1st_128_SSE2 loads entries [k2, k2 + 4) for k2 = 0, 4, ..., 28;
// cftmdl_128_SSE2 loads entries [4, 8).
ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
|
||||
1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f, 0.707106769f,
|
||||
0.707106769f, -0.707106769f, -0.707106769f, 0.923879564f, 0.923879564f,
|
||||
-0.382683456f, -0.382683456f, 0.382683456f, 0.382683456f, -0.923879564f,
|
||||
-0.923879564f, 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
|
||||
0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f, 0.831469595f,
|
||||
0.831469595f, -0.555570245f, -0.555570245f, 0.195090324f, 0.195090324f,
|
||||
-0.980785251f, -0.980785251f,
|
||||
};
|
||||
// Same layout as rdft_wk1r: every coefficient is stored twice in a row.
// cft1st_128_SSE2 loads entries [k2, k2 + 4) for k2 = 0, 4, ..., 28;
// cftmdl_128_SSE2 loads entries [4, 8).
ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
|
||||
1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f, 0.382683456f,
|
||||
0.382683456f, -0.923879564f, -0.923879564f, 0.831469536f, 0.831469536f,
|
||||
-0.980785251f, -0.980785251f, -0.195090353f, -0.195090353f, -0.555570245f,
|
||||
-0.555570245f, 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
|
||||
0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f, 0.634393334f,
|
||||
0.634393334f, -0.995184720f, -0.995184720f, -0.471396863f, -0.471396863f,
|
||||
-0.290284693f, -0.290284693f,
|
||||
};
|
||||
// Every coefficient v is stored as the pair (-v, +v). The SSE2 kernels
// multiply this table with a copy of the data whose adjacent lanes are
// exchanged and add the matching rdft_wk1r product, which gives
// (r * x0 - v * x1, r * x1 + v * x0) for each lane pair (x0, x1).
ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
|
||||
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.382683456f,
|
||||
0.382683456f, -0.923879564f, 0.923879564f, -0.195090324f, 0.195090324f,
|
||||
-0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.980785251f,
|
||||
0.980785251f, -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
|
||||
-0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f, -0.290284663f,
|
||||
0.290284663f, -0.881921291f, 0.881921291f, -0.634393334f, 0.634393334f,
|
||||
-0.995184720f, 0.995184720f,
|
||||
};
|
||||
// Same layout as rdft_wk1i: every coefficient v is stored as the pair
// (-v, +v) and is combined with the rdft_wk2r product by the SSE2 kernels.
ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
|
||||
-0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f, -0.707106769f,
|
||||
0.707106769f, -0.707106769f, 0.707106769f, -0.382683456f, 0.382683456f,
|
||||
-0.923879564f, 0.923879564f, -0.923879564f, 0.923879564f, -0.382683456f,
|
||||
0.382683456f, -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
|
||||
-0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.555570245f,
|
||||
0.555570245f, -0.831469595f, 0.831469595f, -0.980785251f, 0.980785251f,
|
||||
-0.195090324f, 0.195090324f,
|
||||
};
|
||||
// Same layout as rdft_wk1i: every coefficient v is stored as the pair
// (-v, +v) and is combined with the rdft_wk3r product by the SSE2 kernels.
// Unlike rdft_wk1i/rdft_wk2i, some v are negative here, so those pairs read
// (+|v|, -|v|).
ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
|
||||
-0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.923879564f,
|
||||
0.923879564f, 0.382683456f, -0.382683456f, -0.555570245f, 0.555570245f,
|
||||
-0.195090353f, 0.195090353f, -0.980785251f, 0.980785251f, 0.831469536f,
|
||||
-0.831469536f, -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
|
||||
-0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f, -0.773010492f,
|
||||
0.773010492f, 0.098017156f, -0.098017156f, -0.881921172f, 0.881921172f,
|
||||
0.956940353f, -0.956940353f,
|
||||
};
|
||||
// {w, w, w, -w} with w = 0.707106769f. cftmdl_128_SSE2 loads it once with
// _mm_load_ps and uses it as the scale factor in its first loop
// (j0 = 0, 2, 4, 6).
ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
|
||||
0.707106769f,
|
||||
0.707106769f,
|
||||
0.707106769f,
|
||||
-0.707106769f,
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
|
866
webrtc/common_audio/third_party/ooura/fft_size_256/fft4g.cc
vendored
Normal file
866
webrtc/common_audio/third_party/ooura/fft_size_256/fft4g.cc
vendored
Normal file
@ -0,0 +1,866 @@
|
||||
/*
|
||||
* http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
|
||||
* Copyright Takuya OOURA, 1996-2001
|
||||
*
|
||||
* You may use, copy, modify and distribute this code for any purpose (include
|
||||
* commercial use) and without fee. Please refer to this package when you modify
|
||||
* this code.
|
||||
*
|
||||
* Changes:
|
||||
* Trivial type modifications by the WebRTC authors.
|
||||
*/
|
||||
|
||||
/*
|
||||
Fast Fourier/Cosine/Sine Transform
|
||||
dimension :one
|
||||
data length :power of 2
|
||||
decimation :frequency
|
||||
radix :4, 2
|
||||
data :inplace
|
||||
table :use
|
||||
functions
|
||||
cdft: Complex Discrete Fourier Transform
|
||||
rdft: Real Discrete Fourier Transform
|
||||
ddct: Discrete Cosine Transform
|
||||
ddst: Discrete Sine Transform
|
||||
dfct: Cosine Transform of RDFT (Real Symmetric DFT)
|
||||
dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
|
||||
function prototypes
|
||||
void cdft(int, int, float *, int *, float *);
|
||||
void rdft(size_t, int, float *, size_t *, float *);
|
||||
void ddct(int, int, float *, int *, float *);
|
||||
void ddst(int, int, float *, int *, float *);
|
||||
void dfct(int, float *, float *, int *, float *);
|
||||
void dfst(int, float *, float *, int *, float *);
|
||||
|
||||
|
||||
-------- Complex DFT (Discrete Fourier Transform) --------
|
||||
[definition]
|
||||
<case1>
|
||||
X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
|
||||
<case2>
|
||||
X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
|
||||
(notes: sum_j=0^n-1 is a summation from j=0 to n-1)
|
||||
[usage]
|
||||
<case1>
|
||||
ip[0] = 0; // first time only
|
||||
cdft(2*n, 1, a, ip, w);
|
||||
<case2>
|
||||
ip[0] = 0; // first time only
|
||||
cdft(2*n, -1, a, ip, w);
|
||||
[parameters]
|
||||
2*n :data length (int)
|
||||
n >= 1, n = power of 2
|
||||
a[0...2*n-1] :input/output data (float *)
|
||||
input data
|
||||
a[2*j] = Re(x[j]),
|
||||
a[2*j+1] = Im(x[j]), 0<=j<n
|
||||
output data
|
||||
a[2*k] = Re(X[k]),
|
||||
a[2*k+1] = Im(X[k]), 0<=k<n
|
||||
ip[0...*] :work area for bit reversal (int *)
|
||||
length of ip >= 2+sqrt(n)
|
||||
strictly,
|
||||
length of ip >=
|
||||
2+(1<<(int)(log(n+0.5)/log(2))/2).
|
||||
ip[0],ip[1] are pointers of the cos/sin table.
|
||||
w[0...n/2-1] :cos/sin table (float *)
|
||||
w[],ip[] are initialized if ip[0] == 0.
|
||||
[remark]
|
||||
Inverse of
|
||||
cdft(2*n, -1, a, ip, w);
|
||||
is
|
||||
cdft(2*n, 1, a, ip, w);
|
||||
for (j = 0; j <= 2 * n - 1; j++) {
|
||||
a[j] *= 1.0 / n;
|
||||
}
|
||||
.
|
||||
|
||||
|
||||
-------- Real DFT / Inverse of Real DFT --------
|
||||
[definition]
|
||||
<case1> RDFT
|
||||
R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
|
||||
I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
|
||||
<case2> IRDFT (excluding scale)
|
||||
a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +
|
||||
sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +
|
||||
sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
|
||||
[usage]
|
||||
<case1>
|
||||
ip[0] = 0; // first time only
|
||||
rdft(n, 1, a, ip, w);
|
||||
<case2>
|
||||
ip[0] = 0; // first time only
|
||||
rdft(n, -1, a, ip, w);
|
||||
[parameters]
|
||||
n :data length (size_t)
|
||||
n >= 2, n = power of 2
|
||||
a[0...n-1] :input/output data (float *)
|
||||
<case1>
|
||||
output data
|
||||
a[2*k] = R[k], 0<=k<n/2
|
||||
a[2*k+1] = I[k], 0<k<n/2
|
||||
a[1] = R[n/2]
|
||||
<case2>
|
||||
input data
|
||||
a[2*j] = R[j], 0<=j<n/2
|
||||
a[2*j+1] = I[j], 0<j<n/2
|
||||
a[1] = R[n/2]
|
||||
ip[0...*] :work area for bit reversal (size_t *)
|
||||
length of ip >= 2+sqrt(n/2)
|
||||
strictly,
|
||||
length of ip >=
|
||||
2+(1<<(int)(log(n/2+0.5)/log(2))/2).
|
||||
ip[0],ip[1] are pointers of the cos/sin table.
|
||||
w[0...n/2-1] :cos/sin table (float *)
|
||||
w[],ip[] are initialized if ip[0] == 0.
|
||||
[remark]
|
||||
Inverse of
|
||||
rdft(n, 1, a, ip, w);
|
||||
is
|
||||
rdft(n, -1, a, ip, w);
|
||||
for (j = 0; j <= n - 1; j++) {
|
||||
a[j] *= 2.0 / n;
|
||||
}
|
||||
.
|
||||
|
||||
|
||||
-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
|
||||
[definition]
|
||||
<case1> IDCT (excluding scale)
|
||||
C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
|
||||
<case2> DCT
|
||||
C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
|
||||
[usage]
|
||||
<case1>
|
||||
ip[0] = 0; // first time only
|
||||
ddct(n, 1, a, ip, w);
|
||||
<case2>
|
||||
ip[0] = 0; // first time only
|
||||
ddct(n, -1, a, ip, w);
|
||||
[parameters]
|
||||
n :data length (int)
|
||||
n >= 2, n = power of 2
|
||||
a[0...n-1] :input/output data (float *)
|
||||
output data
|
||||
a[k] = C[k], 0<=k<n
|
||||
ip[0...*] :work area for bit reversal (int *)
|
||||
length of ip >= 2+sqrt(n/2)
|
||||
strictly,
|
||||
length of ip >=
|
||||
2+(1<<(int)(log(n/2+0.5)/log(2))/2).
|
||||
ip[0],ip[1] are pointers of the cos/sin table.
|
||||
w[0...n*5/4-1] :cos/sin table (float *)
|
||||
w[],ip[] are initialized if ip[0] == 0.
|
||||
[remark]
|
||||
Inverse of
|
||||
ddct(n, -1, a, ip, w);
|
||||
is
|
||||
a[0] *= 0.5;
|
||||
ddct(n, 1, a, ip, w);
|
||||
for (j = 0; j <= n - 1; j++) {
|
||||
a[j] *= 2.0 / n;
|
||||
}
|
||||
.
|
||||
|
||||
|
||||
-------- DST (Discrete Sine Transform) / Inverse of DST --------
|
||||
[definition]
|
||||
<case1> IDST (excluding scale)
|
||||
S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
|
||||
<case2> DST
|
||||
S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
|
||||
[usage]
|
||||
<case1>
|
||||
ip[0] = 0; // first time only
|
||||
ddst(n, 1, a, ip, w);
|
||||
<case2>
|
||||
ip[0] = 0; // first time only
|
||||
ddst(n, -1, a, ip, w);
|
||||
[parameters]
|
||||
n :data length (int)
|
||||
n >= 2, n = power of 2
|
||||
a[0...n-1] :input/output data (float *)
|
||||
<case1>
|
||||
input data
|
||||
a[j] = A[j], 0<j<n
|
||||
a[0] = A[n]
|
||||
output data
|
||||
a[k] = S[k], 0<=k<n
|
||||
<case2>
|
||||
output data
|
||||
a[k] = S[k], 0<k<n
|
||||
a[0] = S[n]
|
||||
ip[0...*] :work area for bit reversal (int *)
|
||||
length of ip >= 2+sqrt(n/2)
|
||||
strictly,
|
||||
length of ip >=
|
||||
2+(1<<(int)(log(n/2+0.5)/log(2))/2).
|
||||
ip[0],ip[1] are pointers of the cos/sin table.
|
||||
w[0...n*5/4-1] :cos/sin table (float *)
|
||||
w[],ip[] are initialized if ip[0] == 0.
|
||||
[remark]
|
||||
Inverse of
|
||||
ddst(n, -1, a, ip, w);
|
||||
is
|
||||
a[0] *= 0.5;
|
||||
ddst(n, 1, a, ip, w);
|
||||
for (j = 0; j <= n - 1; j++) {
|
||||
a[j] *= 2.0 / n;
|
||||
}
|
||||
.
|
||||
|
||||
|
||||
-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
|
||||
[definition]
|
||||
C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
|
||||
[usage]
|
||||
ip[0] = 0; // first time only
|
||||
dfct(n, a, t, ip, w);
|
||||
[parameters]
|
||||
n :data length - 1 (int)
|
||||
n >= 2, n = power of 2
|
||||
a[0...n] :input/output data (float *)
|
||||
output data
|
||||
a[k] = C[k], 0<=k<=n
|
||||
t[0...n/2] :work area (float *)
|
||||
ip[0...*] :work area for bit reversal (int *)
|
||||
length of ip >= 2+sqrt(n/4)
|
||||
strictly,
|
||||
length of ip >=
|
||||
2+(1<<(int)(log(n/4+0.5)/log(2))/2).
|
||||
ip[0],ip[1] are pointers of the cos/sin table.
|
||||
w[0...n*5/8-1] :cos/sin table (float *)
|
||||
w[],ip[] are initialized if ip[0] == 0.
|
||||
[remark]
|
||||
Inverse of
|
||||
a[0] *= 0.5;
|
||||
a[n] *= 0.5;
|
||||
dfct(n, a, t, ip, w);
|
||||
is
|
||||
a[0] *= 0.5;
|
||||
a[n] *= 0.5;
|
||||
dfct(n, a, t, ip, w);
|
||||
for (j = 0; j <= n; j++) {
|
||||
a[j] *= 2.0 / n;
|
||||
}
|
||||
.
|
||||
|
||||
|
||||
-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
|
||||
[definition]
|
||||
S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
|
||||
[usage]
|
||||
ip[0] = 0; // first time only
|
||||
dfst(n, a, t, ip, w);
|
||||
[parameters]
|
||||
n :data length + 1 (int)
|
||||
n >= 2, n = power of 2
|
||||
a[0...n-1] :input/output data (float *)
|
||||
output data
|
||||
a[k] = S[k], 0<k<n
|
||||
(a[0] is used for work area)
|
||||
t[0...n/2-1] :work area (float *)
|
||||
ip[0...*] :work area for bit reversal (int *)
|
||||
length of ip >= 2+sqrt(n/4)
|
||||
strictly,
|
||||
length of ip >=
|
||||
2+(1<<(int)(log(n/4+0.5)/log(2))/2).
|
||||
ip[0],ip[1] are pointers of the cos/sin table.
|
||||
w[0...n*5/8-1] :cos/sin table (float *)
|
||||
w[],ip[] are initialized if ip[0] == 0.
|
||||
[remark]
|
||||
Inverse of
|
||||
dfst(n, a, t, ip, w);
|
||||
is
|
||||
dfst(n, a, t, ip, w);
|
||||
for (j = 1; j <= n - 1; j++) {
|
||||
a[j] *= 2.0 / n;
|
||||
}
|
||||
.
|
||||
|
||||
|
||||
Appendix :
|
||||
The cos/sin table is recalculated when a larger table is required.
|
||||
w[] and ip[] are compatible with all routines.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include "common_audio/third_party/ooura/fft_size_256/fft4g.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
namespace {
|
||||
|
||||
// Forward declarations of the file-local helpers defined below.

// Table construction: fill the cos/sin tables and record their sizes in ip[].
void makewt(size_t nw, size_t* ip, float* w);
void makect(size_t nc, size_t* ip, float* c);

// Complex FFT building blocks operating in place on interleaved (re, im) data.
void bitrv2(size_t n, size_t* ip, float* a);
void cft1st(size_t n, float* a, float* w);
void cftmdl(size_t n, size_t l, float* a, float* w);
void cftfsub(size_t n, float* a, float* w);
void cftbsub(size_t n, float* a, float* w);

// Real-FFT passes applied around the complex transform by WebRtc_rdft().
void rftfsub(size_t n, float* a, size_t nc, float* c);
void rftbsub(size_t n, float* a, size_t nc, float* c);
|
||||
|
||||
/* -------- initializing routines -------- */
|
||||
|
||||
void makewt(size_t nw, size_t* ip, float* w) {
|
||||
size_t j, nwh;
|
||||
float delta, x, y;
|
||||
|
||||
ip[0] = nw;
|
||||
ip[1] = 1;
|
||||
if (nw > 2) {
|
||||
nwh = nw >> 1;
|
||||
delta = atanf(1.0f) / nwh;
|
||||
w[0] = 1;
|
||||
w[1] = 0;
|
||||
w[nwh] = (float)cos(delta * nwh);
|
||||
w[nwh + 1] = w[nwh];
|
||||
if (nwh > 2) {
|
||||
for (j = 2; j < nwh; j += 2) {
|
||||
x = (float)cos(delta * j);
|
||||
y = (float)sin(delta * j);
|
||||
w[j] = x;
|
||||
w[j + 1] = y;
|
||||
w[nw - j] = y;
|
||||
w[nw - j + 1] = x;
|
||||
}
|
||||
bitrv2(nw, ip + 2, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Builds the |nc|-entry cosine table |c| used by rftfsub()/rftbsub().
//
//   nc : number of table entries to generate.
//   ip : work area; only ip[1] is written (set to |nc|).
//   c  : output table, written only when nc > 1.
void makect(size_t nc, size_t* ip, float* c) {
  ip[1] = nc;
  if (nc <= 1) {
    return;
  }

  const size_t half = nc >> 1;
  // atanf(1) is pi/4, so |step| * half spans an eighth of a turn.
  const float step = atanf(1.0f) / half;

  c[0] = (float)cos(step * half);
  c[half] = 0.5f * c[0];
  // Half-scaled cosines grow from the front, half-scaled sines from the back.
  for (size_t idx = 1; idx < half; ++idx) {
    c[idx] = 0.5f * (float)cos(step * idx);
    c[nc - idx] = 0.5f * (float)sin(step * idx);
  }
}
|
||||
|
||||
/* -------- child routines -------- */
|
||||
|
||||
// Exchanges the complex values stored at a[p], a[p + 1] and a[q], a[q + 1].
// Both values are read before either is written.
static void bitrv2_swap(float* a, size_t p, size_t q) {
  const float pr = a[p];
  const float pi = a[p + 1];
  const float qr = a[q];
  const float qi = a[q + 1];
  a[p] = qr;
  a[p + 1] = qi;
  a[q] = pr;
  a[q + 1] = pi;
}

// Permutes, in place, the n / 2 interleaved (re, im) values of |a| into
// bit-reversed order.
//
//   n  : number of floats in |a|.
//   ip : scratch area that receives the table of bit-reversed offsets.
//   a  : data to permute.
void bitrv2(size_t n, size_t* ip, float* a) {
  size_t span = n;
  size_t count = 1;

  // Grow the offset table by doubling until it covers the required range.
  ip[0] = 0;
  while ((count << 3) < span) {
    span >>= 1;
    for (size_t i = 0; i < count; i++) {
      ip[count + i] = ip[i] + span;
    }
    count <<= 1;
  }

  const size_t stride = 2 * count;

  if ((count << 3) != span) {
    // Two exchanges per (j, k) pair.
    for (size_t k = 1; k < count; k++) {
      for (size_t j = 0; j < k; j++) {
        const size_t p = 2 * j + ip[k];
        const size_t q = 2 * k + ip[j];
        bitrv2_swap(a, p, q);
        bitrv2_swap(a, p + stride, q + stride);
      }
    }
    return;
  }

  // Four exchanges per (j, k) pair plus one exchange per k.
  for (size_t k = 0; k < count; k++) {
    for (size_t j = 0; j < k; j++) {
      const size_t p = 2 * j + ip[k];
      const size_t q = 2 * k + ip[j];
      bitrv2_swap(a, p, q);
      bitrv2_swap(a, p + stride, q + 2 * stride);
      bitrv2_swap(a, p + 2 * stride, q + stride);
      bitrv2_swap(a, p + 3 * stride, q + 3 * stride);
    }
    const size_t p = 2 * k + stride + ip[k];
    bitrv2_swap(a, p, p + stride);
  }
}
|
||||
|
||||
void cftfsub(size_t n, float* a, float* w) {
|
||||
size_t j, j1, j2, j3, l;
|
||||
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
||||
|
||||
l = 2;
|
||||
if (n > 8) {
|
||||
cft1st(n, a, w);
|
||||
l = 8;
|
||||
while ((l << 2) < n) {
|
||||
cftmdl(n, l, a, w);
|
||||
l <<= 2;
|
||||
}
|
||||
}
|
||||
if ((l << 2) == n) {
|
||||
for (j = 0; j < l; j += 2) {
|
||||
j1 = j + l;
|
||||
j2 = j1 + l;
|
||||
j3 = j2 + l;
|
||||
x0r = a[j] + a[j1];
|
||||
x0i = a[j + 1] + a[j1 + 1];
|
||||
x1r = a[j] - a[j1];
|
||||
x1i = a[j + 1] - a[j1 + 1];
|
||||
x2r = a[j2] + a[j3];
|
||||
x2i = a[j2 + 1] + a[j3 + 1];
|
||||
x3r = a[j2] - a[j3];
|
||||
x3i = a[j2 + 1] - a[j3 + 1];
|
||||
a[j] = x0r + x2r;
|
||||
a[j + 1] = x0i + x2i;
|
||||
a[j2] = x0r - x2r;
|
||||
a[j2 + 1] = x0i - x2i;
|
||||
a[j1] = x1r - x3i;
|
||||
a[j1 + 1] = x1i + x3r;
|
||||
a[j3] = x1r + x3i;
|
||||
a[j3 + 1] = x1i - x3r;
|
||||
}
|
||||
} else {
|
||||
for (j = 0; j < l; j += 2) {
|
||||
j1 = j + l;
|
||||
x0r = a[j] - a[j1];
|
||||
x0i = a[j + 1] - a[j1 + 1];
|
||||
a[j] += a[j1];
|
||||
a[j + 1] += a[j1 + 1];
|
||||
a[j1] = x0r;
|
||||
a[j1 + 1] = x0i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cftbsub(size_t n, float* a, float* w) {
|
||||
size_t j, j1, j2, j3, l;
|
||||
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
||||
|
||||
l = 2;
|
||||
if (n > 8) {
|
||||
cft1st(n, a, w);
|
||||
l = 8;
|
||||
while ((l << 2) < n) {
|
||||
cftmdl(n, l, a, w);
|
||||
l <<= 2;
|
||||
}
|
||||
}
|
||||
if ((l << 2) == n) {
|
||||
for (j = 0; j < l; j += 2) {
|
||||
j1 = j + l;
|
||||
j2 = j1 + l;
|
||||
j3 = j2 + l;
|
||||
x0r = a[j] + a[j1];
|
||||
x0i = -a[j + 1] - a[j1 + 1];
|
||||
x1r = a[j] - a[j1];
|
||||
x1i = -a[j + 1] + a[j1 + 1];
|
||||
x2r = a[j2] + a[j3];
|
||||
x2i = a[j2 + 1] + a[j3 + 1];
|
||||
x3r = a[j2] - a[j3];
|
||||
x3i = a[j2 + 1] - a[j3 + 1];
|
||||
a[j] = x0r + x2r;
|
||||
a[j + 1] = x0i - x2i;
|
||||
a[j2] = x0r - x2r;
|
||||
a[j2 + 1] = x0i + x2i;
|
||||
a[j1] = x1r - x3i;
|
||||
a[j1 + 1] = x1i - x3r;
|
||||
a[j3] = x1r + x3i;
|
||||
a[j3 + 1] = x1i + x3r;
|
||||
}
|
||||
} else {
|
||||
for (j = 0; j < l; j += 2) {
|
||||
j1 = j + l;
|
||||
x0r = a[j] - a[j1];
|
||||
x0i = -a[j + 1] + a[j1 + 1];
|
||||
a[j] += a[j1];
|
||||
a[j + 1] = -a[j + 1] - a[j1 + 1];
|
||||
a[j1] = x0r;
|
||||
a[j1 + 1] = x0i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// First butterfly pass of the complex FFT: processes |a| in groups of 16
// floats (two radix-4 butterflies of four complex values each).
//
//   n : number of floats in |a|; a[0..15] is always accessed.
//   a : data, transformed in place.
//   w : cos/sin table; w[2] is used for a[8..15], and the entries at k1,
//       k1 + 1 and 2 * k1 .. 2 * k1 + 3 for every later group (k1 = 2, 4, ...).
void cft1st(size_t n, float* a, float* w) {
  float s0r, s0i, d0r, d0i, s1r, s1i, d1r, d1i;
  float tr, ti;
  float tw1r, tw1i, tw2r, tw2i, tw3r, tw3i;

  // a[0..7]: no twiddle multiplication needed.
  s0r = a[0] + a[2];
  s0i = a[1] + a[3];
  d0r = a[0] - a[2];
  d0i = a[1] - a[3];
  s1r = a[4] + a[6];
  s1i = a[5] + a[7];
  d1r = a[4] - a[6];
  d1i = a[5] - a[7];
  a[0] = s0r + s1r;
  a[1] = s0i + s1i;
  a[4] = s0r - s1r;
  a[5] = s0i - s1i;
  a[2] = d0r - d1i;
  a[3] = d0i + d1r;
  a[6] = d0r + d1i;
  a[7] = d0i - d1r;

  // a[8..15]: a single real factor w[2] is enough.
  tw1r = w[2];
  s0r = a[8] + a[10];
  s0i = a[9] + a[11];
  d0r = a[8] - a[10];
  d0i = a[9] - a[11];
  s1r = a[12] + a[14];
  s1i = a[13] + a[15];
  d1r = a[12] - a[14];
  d1i = a[13] - a[15];
  a[8] = s0r + s1r;
  a[9] = s0i + s1i;
  a[12] = s1i - s0i;
  a[13] = s0r - s1r;
  tr = d0r - d1i;
  ti = d0i + d1r;
  a[10] = tw1r * (tr - ti);
  a[11] = tw1r * (tr + ti);
  tr = d1i + d0r;
  ti = d1r - d0i;
  a[14] = tw1r * (ti - tr);
  a[15] = tw1r * (ti + tr);

  // Remaining groups: full complex twiddles, two butterflies per group.
  size_t k1 = 2;
  for (size_t j = 16; j < n; j += 16, k1 += 2) {
    const size_t k2 = 2 * k1;
    float* g = a + j;

    tw2r = w[k1];
    tw2i = w[k1 + 1];
    tw1r = w[k2];
    tw1i = w[k2 + 1];
    tw3r = tw1r - 2 * tw2i * tw1i;
    tw3i = 2 * tw2i * tw1r - tw1i;

    // First butterfly of the group: g[0..7].
    s0r = g[0] + g[2];
    s0i = g[1] + g[3];
    d0r = g[0] - g[2];
    d0i = g[1] - g[3];
    s1r = g[4] + g[6];
    s1i = g[5] + g[7];
    d1r = g[4] - g[6];
    d1i = g[5] - g[7];
    g[0] = s0r + s1r;
    g[1] = s0i + s1i;
    tr = s0r - s1r;
    ti = s0i - s1i;
    g[4] = tw2r * tr - tw2i * ti;
    g[5] = tw2r * ti + tw2i * tr;
    tr = d0r - d1i;
    ti = d0i + d1r;
    g[2] = tw1r * tr - tw1i * ti;
    g[3] = tw1r * ti + tw1i * tr;
    tr = d0r + d1i;
    ti = d0i - d1r;
    g[6] = tw3r * tr - tw3i * ti;
    g[7] = tw3r * ti + tw3i * tr;

    // Second butterfly of the group: g[8..15], with the next table pair.
    tw1r = w[k2 + 2];
    tw1i = w[k2 + 3];
    tw3r = tw1r - 2 * tw2r * tw1i;
    tw3i = 2 * tw2r * tw1r - tw1i;
    s0r = g[8] + g[10];
    s0i = g[9] + g[11];
    d0r = g[8] - g[10];
    d0i = g[9] - g[11];
    s1r = g[12] + g[14];
    s1i = g[13] + g[15];
    d1r = g[12] - g[14];
    d1i = g[13] - g[15];
    g[8] = s0r + s1r;
    g[9] = s0i + s1i;
    tr = s0r - s1r;
    ti = s0i - s1i;
    g[12] = -tw2i * tr - tw2r * ti;
    g[13] = -tw2i * ti + tw2r * tr;
    tr = d0r - d1i;
    ti = d0i + d1r;
    g[10] = tw1r * tr - tw1i * ti;
    g[11] = tw1r * ti + tw1i * tr;
    tr = d0r + d1i;
    ti = d0i - d1r;
    g[14] = tw3r * tr - tw3i * ti;
    g[15] = tw3r * ti + tw3i * tr;
  }
}
|
||||
|
||||
// Middle butterfly pass of the complex FFT. Each radix-4 butterfly combines
// four complex values spaced |l| floats apart; |a| is handled in blocks of
// m = 4 * l floats.
//
//   n : number of floats in |a|.
//   l : spacing (in floats) between the four inputs of a butterfly.
//   a : data, transformed in place.
//   w : cos/sin table; w[2] serves the block at offset m, and the entries at
//       k1, k1 + 1 and 2 * k1 .. 2 * k1 + 3 serve each later pair of blocks.
void cftmdl(size_t n, size_t l, float* a, float* w) {
  const size_t m = l << 2;
  const size_t m2 = 2 * m;
  float s0r, s0i, d0r, d0i, s1r, s1i, d1r, d1i;
  float tr, ti;
  float tw1r, tw1i, tw2r, tw2i, tw3r, tw3i;

  // Block at offset 0: no twiddle multiplication needed.
  for (size_t off = 0; off < l; off += 2) {
    float* p0 = a + off;
    float* p1 = p0 + l;
    float* p2 = p1 + l;
    float* p3 = p2 + l;
    s0r = p0[0] + p1[0];
    s0i = p0[1] + p1[1];
    d0r = p0[0] - p1[0];
    d0i = p0[1] - p1[1];
    s1r = p2[0] + p3[0];
    s1i = p2[1] + p3[1];
    d1r = p2[0] - p3[0];
    d1i = p2[1] - p3[1];
    p0[0] = s0r + s1r;
    p0[1] = s0i + s1i;
    p2[0] = s0r - s1r;
    p2[1] = s0i - s1i;
    p1[0] = d0r - d1i;
    p1[1] = d0i + d1r;
    p3[0] = d0r + d1i;
    p3[1] = d0i - d1r;
  }

  // Block at offset m: a single real factor w[2] is enough.
  tw1r = w[2];
  for (size_t off = 0; off < l; off += 2) {
    float* p0 = a + m + off;
    float* p1 = p0 + l;
    float* p2 = p1 + l;
    float* p3 = p2 + l;
    s0r = p0[0] + p1[0];
    s0i = p0[1] + p1[1];
    d0r = p0[0] - p1[0];
    d0i = p0[1] - p1[1];
    s1r = p2[0] + p3[0];
    s1i = p2[1] + p3[1];
    d1r = p2[0] - p3[0];
    d1i = p2[1] - p3[1];
    p0[0] = s0r + s1r;
    p0[1] = s0i + s1i;
    p2[0] = s1i - s0i;
    p2[1] = s0r - s1r;
    tr = d0r - d1i;
    ti = d0i + d1r;
    p1[0] = tw1r * (tr - ti);
    p1[1] = tw1r * (tr + ti);
    tr = d1i + d0r;
    ti = d1r - d0i;
    p3[0] = tw1r * (ti - tr);
    p3[1] = tw1r * (ti + tr);
  }

  // Remaining blocks, two at a time (offsets k and k + m), with full complex
  // twiddles.
  size_t k1 = 2;
  for (size_t k = m2; k < n; k += m2, k1 += 2) {
    const size_t k2 = 2 * k1;

    tw2r = w[k1];
    tw2i = w[k1 + 1];
    tw1r = w[k2];
    tw1i = w[k2 + 1];
    tw3r = tw1r - 2 * tw2i * tw1i;
    tw3i = 2 * tw2i * tw1r - tw1i;
    for (size_t off = 0; off < l; off += 2) {
      float* p0 = a + k + off;
      float* p1 = p0 + l;
      float* p2 = p1 + l;
      float* p3 = p2 + l;
      s0r = p0[0] + p1[0];
      s0i = p0[1] + p1[1];
      d0r = p0[0] - p1[0];
      d0i = p0[1] - p1[1];
      s1r = p2[0] + p3[0];
      s1i = p2[1] + p3[1];
      d1r = p2[0] - p3[0];
      d1i = p2[1] - p3[1];
      p0[0] = s0r + s1r;
      p0[1] = s0i + s1i;
      tr = s0r - s1r;
      ti = s0i - s1i;
      p2[0] = tw2r * tr - tw2i * ti;
      p2[1] = tw2r * ti + tw2i * tr;
      tr = d0r - d1i;
      ti = d0i + d1r;
      p1[0] = tw1r * tr - tw1i * ti;
      p1[1] = tw1r * ti + tw1i * tr;
      tr = d0r + d1i;
      ti = d0i - d1r;
      p3[0] = tw3r * tr - tw3i * ti;
      p3[1] = tw3r * ti + tw3i * tr;
    }

    tw1r = w[k2 + 2];
    tw1i = w[k2 + 3];
    tw3r = tw1r - 2 * tw2r * tw1i;
    tw3i = 2 * tw2r * tw1r - tw1i;
    for (size_t off = 0; off < l; off += 2) {
      float* p0 = a + (k + m) + off;
      float* p1 = p0 + l;
      float* p2 = p1 + l;
      float* p3 = p2 + l;
      s0r = p0[0] + p1[0];
      s0i = p0[1] + p1[1];
      d0r = p0[0] - p1[0];
      d0i = p0[1] - p1[1];
      s1r = p2[0] + p3[0];
      s1i = p2[1] + p3[1];
      d1r = p2[0] - p3[0];
      d1i = p2[1] - p3[1];
      p0[0] = s0r + s1r;
      p0[1] = s0i + s1i;
      tr = s0r - s1r;
      ti = s0i - s1i;
      p2[0] = -tw2i * tr - tw2r * ti;
      p2[1] = -tw2i * ti + tw2r * tr;
      tr = d0r - d1i;
      ti = d0i + d1r;
      p1[0] = tw1r * tr - tw1i * ti;
      p1[1] = tw1r * ti + tw1i * tr;
      tr = d0r + d1i;
      ti = d0i - d1r;
      p3[0] = tw3r * tr - tw3i * ti;
      p3[1] = tw3r * ti + tw3i * tr;
    }
  }
}
|
||||
|
||||
// Real-FFT pass that WebRtc_rdft() runs after cftfsub() on the forward path.
// It combines each complex pair at a[j], a[j + 1] with its mirror at a[n - j],
// a[n - j + 1] using weights taken from the table |c|.
//
//   n  : number of floats in |a|.
//   a  : data, updated in place.
//   nc : size of the table |c|.
//   c  : table built by makect(); entries c[kk] and c[nc - kk] are read.
void rftfsub(size_t n, float* a, size_t nc, float* c) {
  const size_t half = n >> 1;
  const size_t table_step = 2 * nc / half;
  size_t kk = 0;

  for (size_t lo = 2; lo < half; lo += 2) {
    const size_t hi = n - lo;
    kk += table_step;

    const float wkr = 0.5f - c[nc - kk];
    const float wki = c[kk];
    const float xr = a[lo] - a[hi];
    const float xi = a[lo + 1] + a[hi + 1];
    const float yr = wkr * xr - wki * xi;
    const float yi = wkr * xi + wki * xr;

    a[lo] -= yr;
    a[lo + 1] -= yi;
    a[hi] += yr;
    a[hi + 1] -= yi;
  }
}
|
||||
|
||||
// Real-FFT pass that WebRtc_rdft() runs before bitrv2()/cftbsub() on the
// inverse path. It negates a[1] and a[n / 2 + 1] and combines each complex
// pair at a[j], a[j + 1] with its mirror at a[n - j], a[n - j + 1] using
// weights taken from the table |c|.
//
//   n  : number of floats in |a|.
//   a  : data, updated in place.
//   nc : size of the table |c|.
//   c  : table built by makect(); entries c[kk] and c[nc - kk] are read.
void rftbsub(size_t n, float* a, size_t nc, float* c) {
  a[1] = -a[1];

  const size_t half = n >> 1;
  const size_t table_step = 2 * nc / half;
  size_t kk = 0;

  for (size_t lo = 2; lo < half; lo += 2) {
    const size_t hi = n - lo;
    kk += table_step;

    const float wkr = 0.5f - c[nc - kk];
    const float wki = c[kk];
    const float xr = a[lo] - a[hi];
    const float xi = a[lo + 1] + a[hi + 1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;

    a[lo] -= yr;
    a[lo + 1] = yi - a[lo + 1];
    a[hi] += yr;
    a[hi + 1] = yi - a[hi + 1];
  }

  a[half + 1] = -a[half + 1];
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void WebRtc_rdft(size_t n, int isgn, float* a, size_t* ip, float* w) {
|
||||
size_t nw, nc;
|
||||
float xi;
|
||||
|
||||
nw = ip[0];
|
||||
if (n > (nw << 2)) {
|
||||
nw = n >> 2;
|
||||
makewt(nw, ip, w);
|
||||
}
|
||||
nc = ip[1];
|
||||
if (n > (nc << 2)) {
|
||||
nc = n >> 2;
|
||||
makect(nc, ip, w + nw);
|
||||
}
|
||||
if (isgn >= 0) {
|
||||
if (n > 4) {
|
||||
bitrv2(n, ip + 2, a);
|
||||
cftfsub(n, a, w);
|
||||
rftfsub(n, a, nc, w + nw);
|
||||
} else if (n == 4) {
|
||||
cftfsub(n, a, w);
|
||||
}
|
||||
xi = a[0] - a[1];
|
||||
a[0] += a[1];
|
||||
a[1] = xi;
|
||||
} else {
|
||||
a[1] = 0.5f * (a[0] - a[1]);
|
||||
a[0] -= a[1];
|
||||
if (n > 4) {
|
||||
rftbsub(n, a, nc, w + nw);
|
||||
bitrv2(n, ip + 2, a);
|
||||
cftbsub(n, a, w);
|
||||
} else if (n == 4) {
|
||||
cftfsub(n, a, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
21
webrtc/common_audio/third_party/ooura/fft_size_256/fft4g.h
vendored
Normal file
21
webrtc/common_audio/third_party/ooura/fft_size_256/fft4g.h
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
/*
|
||||
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the ../../../LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef COMMON_AUDIO_THIRD_PARTY_OOURA_FFT_SIZE_256_FFT4G_H_
#define COMMON_AUDIO_THIRD_PARTY_OOURA_FFT_SIZE_256_FFT4G_H_

// size_t is part of the signature below; include its definition here so the
// header does not depend on what the including file happened to pull in first.
#include <stddef.h>

namespace webrtc {

// In-place real discrete Fourier transform (isgn >= 0) or its unscaled
// inverse (isgn < 0). Refer to fft4g.cc for the full documentation.
//
//   n    : data length, a power of 2 with n >= 2.
//   isgn : transform direction, see above.
//   a    : a[0...n-1], input and output data.
//   ip   : work area of at least 2 + sqrt(n / 2) entries; set ip[0] = 0
//          before the first call so that the tables get initialized.
//   w    : w[0...n/2-1], cos/sin table storage.
void WebRtc_rdft(size_t n, int isgn, float* a, size_t* ip, float* w);

}  // namespace webrtc

#endif  // COMMON_AUDIO_THIRD_PARTY_OOURA_FFT_SIZE_256_FFT4G_H_
|
24
webrtc/common_audio/third_party/spl_sqrt_floor/BUILD.gn
vendored
Normal file
24
webrtc/common_audio/third_party/spl_sqrt_floor/BUILD.gn
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
# Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
||||
#
|
||||
# Use of this source code is governed by a BSD-style license
|
||||
# that can be found in the ../../../LICENSE file in the root of the source
|
||||
# tree. An additional intellectual property rights grant can be found
|
||||
# in the file PATENTS. All contributing project authors may
|
||||
# be found in the AUTHORS file in the root of the source tree.
|
||||
|
||||
import("../../../webrtc.gni")
|
||||
|
||||
# Integer square-root helper; only visible to the common_audio_c target.
rtc_library("spl_sqrt_floor") {
  visibility = [ "../..:common_audio_c" ]
  sources = [ "spl_sqrt_floor.h" ]
  deps = []

  # Exactly one implementation is compiled, selected by the target CPU.
  if (current_cpu == "mipsel") {
    sources += [ "spl_sqrt_floor_mips.c" ]
  } else if (current_cpu == "arm") {
    sources += [ "spl_sqrt_floor_arm.S" ]

    # The assembly file includes rtc_base/system/asm_defines.h.
    deps += [ "../../../rtc_base/system:asm_defines" ]
  } else {
    sources += [ "spl_sqrt_floor.c" ]
  }
}
|
27
webrtc/common_audio/third_party/spl_sqrt_floor/LICENSE
vendored
Normal file
27
webrtc/common_audio/third_party/spl_sqrt_floor/LICENSE
vendored
Normal file
@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Written by Wilco Dijkstra, 1996. The following email exchange establishes the
|
||||
* license.
|
||||
*
|
||||
* From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
|
||||
* Date: Fri, Jun 24, 2011 at 3:20 AM
|
||||
* Subject: Re: sqrt routine
|
||||
* To: Kevin Ma <kma@google.com>
|
||||
* Hi Kevin,
|
||||
* Thanks for asking. Those routines are public domain (originally posted to
|
||||
* comp.sys.arm a long time ago), so you can use them freely for any purpose.
|
||||
* Cheers,
|
||||
* Wilco
|
||||
*
|
||||
* ----- Original Message -----
|
||||
* From: "Kevin Ma" <kma@google.com>
|
||||
* To: <Wilco.Dijkstra@ntlworld.com>
|
||||
* Sent: Thursday, June 23, 2011 11:44 PM
|
||||
* Subject: Fwd: sqrt routine
|
||||
* Hi Wilco,
|
||||
* I saw your sqrt routine from several web sites, including
|
||||
* http://www.finesse.demon.co.uk/steven/sqrt.html.
|
||||
* Just wonder if there's any copyright information with your Successive
|
||||
* approximation routines, or if I can freely use it for any purpose.
|
||||
* Thanks.
|
||||
* Kevin
|
||||
*/
|
77
webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c
vendored
Normal file
77
webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c
vendored
Normal file
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Written by Wilco Dijkstra, 1996. The following email exchange establishes the
|
||||
* license.
|
||||
*
|
||||
* From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
|
||||
* Date: Fri, Jun 24, 2011 at 3:20 AM
|
||||
* Subject: Re: sqrt routine
|
||||
* To: Kevin Ma <kma@google.com>
|
||||
* Hi Kevin,
|
||||
* Thanks for asking. Those routines are public domain (originally posted to
|
||||
* comp.sys.arm a long time ago), so you can use them freely for any purpose.
|
||||
* Cheers,
|
||||
* Wilco
|
||||
*
|
||||
* ----- Original Message -----
|
||||
* From: "Kevin Ma" <kma@google.com>
|
||||
* To: <Wilco.Dijkstra@ntlworld.com>
|
||||
* Sent: Thursday, June 23, 2011 11:44 PM
|
||||
* Subject: Fwd: sqrt routine
|
||||
* Hi Wilco,
|
||||
* I saw your sqrt routine from several web sites, including
|
||||
* http://www.finesse.demon.co.uk/steven/sqrt.html.
|
||||
* Just wonder if there's any copyright information with your Successive
|
||||
* approximation routines, or if I can freely use it for any purpose.
|
||||
* Thanks.
|
||||
* Kevin
|
||||
*/
|
||||
|
||||
// Minor modifications in code style for WebRTC, 2012.
|
||||
|
||||
#include "common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h"
|
||||
|
||||
/*
|
||||
* Algorithm:
|
||||
* Successive approximation of the equation (root + delta) ^ 2 = N
|
||||
* until delta < 1. If delta < 1 we have the integer part of SQRT (N).
|
||||
* Use delta = 2^i for i = 15 .. 0.
|
||||
*
|
||||
* Output precision is 16 bits. Note for large input values (close to
|
||||
* 0x7FFFFFFF), bit 15 (the highest bit of the low 16-bit half word)
|
||||
* contains the MSB information (a non-sign value). Do with caution
|
||||
* if you need to cast the output to int16_t type.
|
||||
*
|
||||
* If the input value is negative, it returns 0.
|
||||
*/
|
||||
|
||||
/*
 * Returns the integer part of the square root of |value|, or 0 when |value|
 * is negative.
 *
 * The result is built one bit at a time, from bit 15 down to bit 0. |root|
 * holds twice the bits accepted so far, so that (root + 2^bit) << bit is the
 * amount by which the square grows if the current bit is accepted. |value|
 * holds what is left of the input after subtracting the accepted square.
 */
int32_t WebRtcSpl_SqrtFloor(int32_t value)
{
  int32_t root = 0;

  for (int bit = 15; bit >= 0; --bit) {
    const int32_t increment = (root + (1 << bit)) << bit;
    if (value >= increment) {
      value -= increment;
      root |= 2 << bit;
    }
  }

  return root >> 1;
}
|
29
webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h
vendored
Normal file
29
webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h
vendored
Normal file
@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
//
|
||||
// WebRtcSpl_SqrtFloor(...)
|
||||
//
|
||||
// Returns the square root of the input value |value|. The precision of this
|
||||
// function is rounding down integer precision, i.e., sqrt(8) gives 2 as answer.
|
||||
// If |value| is a negative number then 0 is returned.
|
||||
//
|
||||
// Algorithm:
|
||||
//
|
||||
// An iterative 4 cycle/bit routine
|
||||
//
|
||||
// Input:
|
||||
// - value : Value to calculate sqrt of
|
||||
//
|
||||
// Return value : Result of the sqrt calculation
|
||||
//
|
||||
int32_t WebRtcSpl_SqrtFloor(int32_t value);
|
110
webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_arm.S
vendored
Normal file
110
webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_arm.S
vendored
Normal file
@ -0,0 +1,110 @@
|
||||
@
|
||||
@ Written by Wilco Dijkstra, 1996. The following email exchange establishes the
|
||||
@ license.
|
||||
@
|
||||
@ From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
|
||||
@ Date: Fri, Jun 24, 2011 at 3:20 AM
|
||||
@ Subject: Re: sqrt routine
|
||||
@ To: Kevin Ma <kma@google.com>
|
||||
@ Hi Kevin,
|
||||
@ Thanks for asking. Those routines are public domain (originally posted to
|
||||
@ comp.sys.arm a long time ago), so you can use them freely for any purpose.
|
||||
@ Cheers,
|
||||
@ Wilco
|
||||
@
|
||||
@ ----- Original Message -----
|
||||
@ From: "Kevin Ma" <kma@google.com>
|
||||
@ To: <Wilco.Dijkstra@ntlworld.com>
|
||||
@ Sent: Thursday, June 23, 2011 11:44 PM
|
||||
@ Subject: Fwd: sqrt routine
|
||||
@ Hi Wilco,
|
||||
@ I saw your sqrt routine from several web sites, including
|
||||
@ http://www.finesse.demon.co.uk/steven/sqrt.html.
|
||||
@ Just wonder if there's any copyright information with your Successive
|
||||
@ approximation routines, or if I can freely use it for any purpose.
|
||||
@ Thanks.
|
||||
@ Kevin
|
||||
|
||||
@ Minor modifications in code style for WebRTC, 2012.
|
||||
@ Output is bit-exact with the reference C code in spl_sqrt_floor.c.
|
||||
|
||||
@ Input : r0 32 bit unsigned integer
|
||||
@ Output: r0 = INT (SQRT (r0)), precision is 16 bits
|
||||
@ Registers touched: r1, r2
|
||||
|
||||
#include "rtc_base/system/asm_defines.h"
|
||||
|
||||
@ WebRtcSpl_SqrtFloor: integer square root by successive approximation,
@ fully unrolled over 16 steps (one per result bit).
@   r0: input value on entry, reduced by the accepted trial values during
@       the steps, and overwritten with the result on exit.
@   r1: constant 3 << 30, added in every step's adc.
@   r2: starts at 1 << 30; each step's adc shifts it left by one and folds
@       in the carry produced by that step's cmp.
@ NOTE(review): GLOBAL_FUNCTION / DEFINE_FUNCTION come from
@ rtc_base/system/asm_defines.h (not visible here); presumably they export
@ the symbol and emit its label -- confirm.
@ NOTE(review): every step uses the unsigned condition "hs", so an input
@ with bit 31 set is handled as a large unsigned value rather than as a
@ negative number -- confirm against the "negative returns 0" contract
@ documented for this function in spl_sqrt_floor.h.
GLOBAL_FUNCTION WebRtcSpl_SqrtFloor
|
||||
.align 2
|
||||
DEFINE_FUNCTION WebRtcSpl_SqrtFloor
|
||||
mov r1, #3 << 30
|
||||
mov r2, #1 << 30
|
||||
|
||||
@ unroll for i = 0 .. 15
@ Each step compares r0 with r2 rotated right by 2 * i, subtracts that
@ value when r0 is higher-or-same (unsigned), then shifts the carry left
@ by cmp into r2. subhs has no "s" suffix, so it does not update the
@ flags and adc still sees the carry from cmp.
|
||||
|
||||
cmp r0, r2, ror #2 * 0
|
||||
subhs r0, r0, r2, ror #2 * 0
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 1
|
||||
subhs r0, r0, r2, ror #2 * 1
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 2
|
||||
subhs r0, r0, r2, ror #2 * 2
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 3
|
||||
subhs r0, r0, r2, ror #2 * 3
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 4
|
||||
subhs r0, r0, r2, ror #2 * 4
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 5
|
||||
subhs r0, r0, r2, ror #2 * 5
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 6
|
||||
subhs r0, r0, r2, ror #2 * 6
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 7
|
||||
subhs r0, r0, r2, ror #2 * 7
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 8
|
||||
subhs r0, r0, r2, ror #2 * 8
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 9
|
||||
subhs r0, r0, r2, ror #2 * 9
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 10
|
||||
subhs r0, r0, r2, ror #2 * 10
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 11
|
||||
subhs r0, r0, r2, ror #2 * 11
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 12
|
||||
subhs r0, r0, r2, ror #2 * 12
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 13
|
||||
subhs r0, r0, r2, ror #2 * 13
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 14
|
||||
subhs r0, r0, r2, ror #2 * 14
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
cmp r0, r2, ror #2 * 15
|
||||
subhs r0, r0, r2, ror #2 * 15
|
||||
adc r2, r1, r2, lsl #1
|
||||
|
||||
@ Clear the top two bits of r2 and hand the remaining bits back in r0.
bic r0, r2, #3 << 30 @ for rounding add: cmp r0, r2 adc r2, #1
|
||||
bx lr
|
207
webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_mips.c
vendored
Normal file
207
webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor_mips.c
vendored
Normal file
@ -0,0 +1,207 @@
|
||||
/*
|
||||
* Written by Wilco Dijkstra, 1996. The following email exchange establishes the
|
||||
* license.
|
||||
*
|
||||
* From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
|
||||
* Date: Fri, Jun 24, 2011 at 3:20 AM
|
||||
* Subject: Re: sqrt routine
|
||||
* To: Kevin Ma <kma@google.com>
|
||||
* Hi Kevin,
|
||||
* Thanks for asking. Those routines are public domain (originally posted to
|
||||
* comp.sys.arm a long time ago), so you can use them freely for any purpose.
|
||||
* Cheers,
|
||||
* Wilco
|
||||
*
|
||||
* ----- Original Message -----
|
||||
* From: "Kevin Ma" <kma@google.com>
|
||||
* To: <Wilco.Dijkstra@ntlworld.com>
|
||||
* Sent: Thursday, June 23, 2011 11:44 PM
|
||||
* Subject: Fwd: sqrt routine
|
||||
* Hi Wilco,
|
||||
* I saw your sqrt routine from several web sites, including
|
||||
* http://www.finesse.demon.co.uk/steven/sqrt.html.
|
||||
* Just wonder if there's any copyright information with your Successive
|
||||
* approximation routines, or if I can freely use it for any purpose.
|
||||
* Thanks.
|
||||
* Kevin
|
||||
*/
|
||||
|
||||
// Minor modifications in code style for WebRTC, 2012.
|
||||
// Code optimizations for MIPS, 2013.
|
||||
|
||||
#include "common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h"
|
||||
|
||||
/*
|
||||
* Algorithm:
|
||||
* Successive approximation of the equation (root + delta) ^ 2 = N
|
||||
* until delta < 1. If delta < 1 we have the integer part of SQRT (N).
|
||||
* Use delta = 2^i for i = 15 .. 0.
|
||||
*
|
||||
* Output precision is 16 bits. Note for large input values (close to
|
||||
* 0x7FFFFFFF), bit 15 (the highest bit of the low 16-bit half word)
|
||||
* contains the MSB information (a non-sign value). Use caution
|
||||
* if you need to cast the output to int16_t type.
|
||||
*
|
||||
* If the input value is negative, it returns 0.
|
||||
*/
|
||||
|
||||
|
||||
int32_t WebRtcSpl_SqrtFloor(int32_t value)
|
||||
{
|
||||
int32_t root = 0, tmp1, tmp2, tmp3, tmp4;
|
||||
|
||||
__asm __volatile(
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
|
||||
"lui %[tmp1], 0x4000 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"sub %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"lui %[tmp1], 0x1 \n\t"
|
||||
"or %[tmp4], %[root], %[tmp1] \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x4000 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 14 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x8000 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x2000 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 13 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x4000 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x1000 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 12 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x2000 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x800 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 11 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x1000 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x400 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 10 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x800 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x200 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 9 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x400 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x100 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 8 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x200 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x80 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 7 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x100 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x40 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 6 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x80 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x20 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 5 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x40 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x10 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 4 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x20 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x8 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 3 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x10 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x4 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 2 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x8 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x2 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"sll %[tmp1], 1 \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"subu %[tmp3], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x4 \n\t"
|
||||
"movz %[value], %[tmp3], %[tmp2] \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
"addiu %[tmp1], $0, 0x1 \n\t"
|
||||
"addu %[tmp1], %[tmp1], %[root] \n\t"
|
||||
"slt %[tmp2], %[value], %[tmp1] \n\t"
|
||||
"ori %[tmp4], %[root], 0x2 \n\t"
|
||||
"movz %[root], %[tmp4], %[tmp2] \n\t"
|
||||
|
||||
".set pop \n\t"
|
||||
|
||||
: [root] "+r" (root), [value] "+r" (value),
|
||||
[tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
|
||||
[tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4)
|
||||
:
|
||||
);
|
||||
|
||||
return root >> 1;
|
||||
}
|
||||
|
Reference in New Issue
Block a user