Allow disabling inline SSE
Should make building on i686 without SSE feasible. Fixes: https://gitlab.freedesktop.org/pulseaudio/webrtc-audio-processing/-/issues/5
This commit is contained in:
parent
c144c53039
commit
fed81a77c9
14
meson.build
14
meson.build
@ -110,6 +110,7 @@ have_neon = false
|
|||||||
have_mips = false
|
have_mips = false
|
||||||
have_mips64 = false
|
have_mips64 = false
|
||||||
have_x86 = false
|
have_x86 = false
|
||||||
|
have_inline_sse = false
|
||||||
have_avx2 = false
|
have_avx2 = false
|
||||||
if host_machine.cpu_family() == 'arm'
|
if host_machine.cpu_family() == 'arm'
|
||||||
if cc.compiles('''#ifndef __ARM_ARCH_ISA_ARM
|
if cc.compiles('''#ifndef __ARM_ARCH_ISA_ARM
|
||||||
@ -140,10 +141,19 @@ if host_machine.cpu_family() == 'mips64'
|
|||||||
endif
|
endif
|
||||||
if ['x86', 'x86_64'].contains(host_machine.cpu_family())
|
if ['x86', 'x86_64'].contains(host_machine.cpu_family())
|
||||||
have_x86 = true
|
have_x86 = true
|
||||||
# This is unconditionally enabled for now, actual usage is determined by
|
# AVX2 support is unconditionally available, since all the code (compiled
|
||||||
# runtime CPU detection, so we're just assuming the compiler supports avx2
|
# with -mavx2) is in separate files from runtime detection (which should not
|
||||||
|
# be compiled with SIMD flags for cases where the CPU does not support it).
|
||||||
|
# Unfortunately, a bunch of SSE code is inline with the runtime detection,
|
||||||
|
# and we can't support that on systems that don't support SSE.
|
||||||
have_avx2 = true
|
have_avx2 = true
|
||||||
arch_cflags += ['-DWEBRTC_ENABLE_AVX2']
|
arch_cflags += ['-DWEBRTC_ENABLE_AVX2']
|
||||||
|
if get_option('inline-sse')
|
||||||
|
have_inline_sse = true
|
||||||
|
else
|
||||||
|
have_inline_sse = false
|
||||||
|
arch_cflags += ['-DWAP_DISABLE_INLINE_SSE']
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
neon_opt = get_option('neon')
|
neon_opt = get_option('neon')
|
||||||
|
@ -3,4 +3,7 @@ option('gnustl', type: 'feature',
|
|||||||
description: 'Use gnustl for a c++ library implementation (only used on Android)')
|
description: 'Use gnustl for a c++ library implementation (only used on Android)')
|
||||||
option('neon', type: 'combo',
|
option('neon', type: 'combo',
|
||||||
choices: ['no', 'yes', 'auto', 'runtime'],
|
choices: ['no', 'yes', 'auto', 'runtime'],
|
||||||
description: '')
|
description: 'Enable NEON optimisations')
|
||||||
|
option('inline-sse', type: 'boolean',
|
||||||
|
value: true,
|
||||||
|
description: 'Enable inline SSE/SSE2 optimisations (i.e. assume CPU supports SSE/SSE2)')
|
||||||
|
@ -16,7 +16,7 @@
|
|||||||
#if defined(WEBRTC_HAS_NEON)
|
#if defined(WEBRTC_HAS_NEON)
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
@ -88,7 +88,7 @@ void ComputeFrequencyResponse_Neon(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
// Computes and stores the frequency response of the filter.
|
// Computes and stores the frequency response of the filter.
|
||||||
void ComputeFrequencyResponse_Sse2(
|
void ComputeFrequencyResponse_Sse2(
|
||||||
size_t num_partitions,
|
size_t num_partitions,
|
||||||
@ -212,7 +212,7 @@ void AdaptPartitions_Neon(const RenderBuffer& render_buffer,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
// Adapts the filter partitions. (SSE2 variant)
|
// Adapts the filter partitions. (SSE2 variant)
|
||||||
void AdaptPartitions_Sse2(const RenderBuffer& render_buffer,
|
void AdaptPartitions_Sse2(const RenderBuffer& render_buffer,
|
||||||
const FftData& G,
|
const FftData& G,
|
||||||
@ -377,7 +377,7 @@ void ApplyFilter_Neon(const RenderBuffer& render_buffer,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
// Produces the filter output (SSE2 variant).
|
// Produces the filter output (SSE2 variant).
|
||||||
void ApplyFilter_Sse2(const RenderBuffer& render_buffer,
|
void ApplyFilter_Sse2(const RenderBuffer& render_buffer,
|
||||||
size_t num_partitions,
|
size_t num_partitions,
|
||||||
@ -557,9 +557,11 @@ void AdaptiveFirFilter::Filter(const RenderBuffer& render_buffer,
|
|||||||
RTC_DCHECK(S);
|
RTC_DCHECK(S);
|
||||||
switch (optimization_) {
|
switch (optimization_) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
|
#if !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
case Aec3Optimization::kSse2:
|
case Aec3Optimization::kSse2:
|
||||||
aec3::ApplyFilter_Sse2(render_buffer, current_size_partitions_, H_, S);
|
aec3::ApplyFilter_Sse2(render_buffer, current_size_partitions_, H_, S);
|
||||||
break;
|
break;
|
||||||
|
#endif
|
||||||
case Aec3Optimization::kAvx2:
|
case Aec3Optimization::kAvx2:
|
||||||
aec3::ApplyFilter_Avx2(render_buffer, current_size_partitions_, H_, S);
|
aec3::ApplyFilter_Avx2(render_buffer, current_size_partitions_, H_, S);
|
||||||
break;
|
break;
|
||||||
@ -601,9 +603,11 @@ void AdaptiveFirFilter::ComputeFrequencyResponse(
|
|||||||
|
|
||||||
switch (optimization_) {
|
switch (optimization_) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
|
#if !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
case Aec3Optimization::kSse2:
|
case Aec3Optimization::kSse2:
|
||||||
aec3::ComputeFrequencyResponse_Sse2(current_size_partitions_, H_, H2);
|
aec3::ComputeFrequencyResponse_Sse2(current_size_partitions_, H_, H2);
|
||||||
break;
|
break;
|
||||||
|
#endif
|
||||||
case Aec3Optimization::kAvx2:
|
case Aec3Optimization::kAvx2:
|
||||||
aec3::ComputeFrequencyResponse_Avx2(current_size_partitions_, H_, H2);
|
aec3::ComputeFrequencyResponse_Avx2(current_size_partitions_, H_, H2);
|
||||||
break;
|
break;
|
||||||
@ -626,10 +630,12 @@ void AdaptiveFirFilter::AdaptAndUpdateSize(const RenderBuffer& render_buffer,
|
|||||||
// Adapt the filter.
|
// Adapt the filter.
|
||||||
switch (optimization_) {
|
switch (optimization_) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
|
#if !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
case Aec3Optimization::kSse2:
|
case Aec3Optimization::kSse2:
|
||||||
aec3::AdaptPartitions_Sse2(render_buffer, G, current_size_partitions_,
|
aec3::AdaptPartitions_Sse2(render_buffer, G, current_size_partitions_,
|
||||||
&H_);
|
&H_);
|
||||||
break;
|
break;
|
||||||
|
#endif
|
||||||
case Aec3Optimization::kAvx2:
|
case Aec3Optimization::kAvx2:
|
||||||
aec3::AdaptPartitions_Avx2(render_buffer, G, current_size_partitions_,
|
aec3::AdaptPartitions_Avx2(render_buffer, G, current_size_partitions_,
|
||||||
&H_);
|
&H_);
|
||||||
|
@ -16,7 +16,7 @@
|
|||||||
#if defined(WEBRTC_HAS_NEON)
|
#if defined(WEBRTC_HAS_NEON)
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -54,7 +54,7 @@ void ErlComputer_NEON(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
// Computes and stores the echo return loss estimate of the filter, which is the
|
// Computes and stores the echo return loss estimate of the filter, which is the
|
||||||
// sum of the partition frequency responses.
|
// sum of the partition frequency responses.
|
||||||
void ErlComputer_SSE2(
|
void ErlComputer_SSE2(
|
||||||
@ -82,9 +82,11 @@ void ComputeErl(const Aec3Optimization& optimization,
|
|||||||
// Update the frequency response and echo return loss for the filter.
|
// Update the frequency response and echo return loss for the filter.
|
||||||
switch (optimization) {
|
switch (optimization) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
|
#if !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
case Aec3Optimization::kSse2:
|
case Aec3Optimization::kSse2:
|
||||||
aec3::ErlComputer_SSE2(H2, erl);
|
aec3::ErlComputer_SSE2(H2, erl);
|
||||||
break;
|
break;
|
||||||
|
#endif
|
||||||
case Aec3Optimization::kAvx2:
|
case Aec3Optimization::kAvx2:
|
||||||
aec3::ErlComputer_AVX2(H2, erl);
|
aec3::ErlComputer_AVX2(H2, erl);
|
||||||
break;
|
break;
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
|
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
|
||||||
#include "rtc_base/system/arch.h"
|
#include "rtc_base/system/arch.h"
|
||||||
|
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -49,6 +49,7 @@ struct FftData {
|
|||||||
RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size());
|
RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size());
|
||||||
switch (optimization) {
|
switch (optimization) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
|
#if !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
case Aec3Optimization::kSse2: {
|
case Aec3Optimization::kSse2: {
|
||||||
constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
|
constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
|
||||||
constexpr int kLimit = kNumFourBinBands * 4;
|
constexpr int kLimit = kNumFourBinBands * 4;
|
||||||
@ -63,6 +64,7 @@ struct FftData {
|
|||||||
power_spectrum[kFftLengthBy2] = re[kFftLengthBy2] * re[kFftLengthBy2] +
|
power_spectrum[kFftLengthBy2] = re[kFftLengthBy2] * re[kFftLengthBy2] +
|
||||||
im[kFftLengthBy2] * im[kFftLengthBy2];
|
im[kFftLengthBy2] * im[kFftLengthBy2];
|
||||||
} break;
|
} break;
|
||||||
|
#endif
|
||||||
case Aec3Optimization::kAvx2:
|
case Aec3Optimization::kAvx2:
|
||||||
SpectrumAVX2(power_spectrum);
|
SpectrumAVX2(power_spectrum);
|
||||||
break;
|
break;
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
#if defined(WEBRTC_HAS_NEON)
|
#if defined(WEBRTC_HAS_NEON)
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -286,7 +286,7 @@ void MatchedFilterCore_NEON(size_t x_start_index,
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
|
|
||||||
void MatchedFilterCore_AccumulatedError_SSE2(
|
void MatchedFilterCore_AccumulatedError_SSE2(
|
||||||
size_t x_start_index,
|
size_t x_start_index,
|
||||||
@ -695,12 +695,14 @@ void MatchedFilter::Update(const DownsampledRenderBuffer& render_buffer,
|
|||||||
|
|
||||||
switch (optimization_) {
|
switch (optimization_) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
|
#if !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
case Aec3Optimization::kSse2:
|
case Aec3Optimization::kSse2:
|
||||||
aec3::MatchedFilterCore_SSE2(
|
aec3::MatchedFilterCore_SSE2(
|
||||||
x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y,
|
x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y,
|
||||||
filters_[n], &filters_updated, &error_sum, compute_pre_echo,
|
filters_[n], &filters_updated, &error_sum, compute_pre_echo,
|
||||||
instantaneous_accumulated_error_, scratch_memory_);
|
instantaneous_accumulated_error_, scratch_memory_);
|
||||||
break;
|
break;
|
||||||
|
#endif
|
||||||
case Aec3Optimization::kAvx2:
|
case Aec3Optimization::kAvx2:
|
||||||
aec3::MatchedFilterCore_AVX2(
|
aec3::MatchedFilterCore_AVX2(
|
||||||
x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y,
|
x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y,
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
#if defined(WEBRTC_HAS_NEON)
|
#if defined(WEBRTC_HAS_NEON)
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
@ -43,7 +43,7 @@ class VectorMath {
|
|||||||
void SqrtAVX2(rtc::ArrayView<float> x);
|
void SqrtAVX2(rtc::ArrayView<float> x);
|
||||||
void Sqrt(rtc::ArrayView<float> x) {
|
void Sqrt(rtc::ArrayView<float> x) {
|
||||||
switch (optimization_) {
|
switch (optimization_) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
case Aec3Optimization::kSse2: {
|
case Aec3Optimization::kSse2: {
|
||||||
const int x_size = static_cast<int>(x.size());
|
const int x_size = static_cast<int>(x.size());
|
||||||
const int vector_limit = x_size >> 2;
|
const int vector_limit = x_size >> 2;
|
||||||
@ -123,7 +123,7 @@ class VectorMath {
|
|||||||
RTC_DCHECK_EQ(z.size(), x.size());
|
RTC_DCHECK_EQ(z.size(), x.size());
|
||||||
RTC_DCHECK_EQ(z.size(), y.size());
|
RTC_DCHECK_EQ(z.size(), y.size());
|
||||||
switch (optimization_) {
|
switch (optimization_) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
case Aec3Optimization::kSse2: {
|
case Aec3Optimization::kSse2: {
|
||||||
const int x_size = static_cast<int>(x.size());
|
const int x_size = static_cast<int>(x.size());
|
||||||
const int vector_limit = x_size >> 2;
|
const int vector_limit = x_size >> 2;
|
||||||
@ -174,6 +174,7 @@ class VectorMath {
|
|||||||
RTC_DCHECK_EQ(z.size(), x.size());
|
RTC_DCHECK_EQ(z.size(), x.size());
|
||||||
switch (optimization_) {
|
switch (optimization_) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
|
#if !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
case Aec3Optimization::kSse2: {
|
case Aec3Optimization::kSse2: {
|
||||||
const int x_size = static_cast<int>(x.size());
|
const int x_size = static_cast<int>(x.size());
|
||||||
const int vector_limit = x_size >> 2;
|
const int vector_limit = x_size >> 2;
|
||||||
@ -190,6 +191,7 @@ class VectorMath {
|
|||||||
z[j] += x[j];
|
z[j] += x[j];
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
#endif
|
||||||
case Aec3Optimization::kAvx2:
|
case Aec3Optimization::kAvx2:
|
||||||
AccumulateAVX2(x, z);
|
AccumulateAVX2(x, z);
|
||||||
break;
|
break;
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
#if defined(WEBRTC_HAS_NEON)
|
#if defined(WEBRTC_HAS_NEON)
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -47,6 +47,7 @@ class VectorMath {
|
|||||||
if (cpu_features_.avx2) {
|
if (cpu_features_.avx2) {
|
||||||
return DotProductAvx2(x, y);
|
return DotProductAvx2(x, y);
|
||||||
} else if (cpu_features_.sse2) {
|
} else if (cpu_features_.sse2) {
|
||||||
|
#if !defined(WAP_DISABLE_INLINE_SSE)
|
||||||
__m128 accumulator = _mm_setzero_ps();
|
__m128 accumulator = _mm_setzero_ps();
|
||||||
constexpr int kBlockSizeLog2 = 2;
|
constexpr int kBlockSizeLog2 = 2;
|
||||||
constexpr int kBlockSize = 1 << kBlockSizeLog2;
|
constexpr int kBlockSize = 1 << kBlockSizeLog2;
|
||||||
@ -72,6 +73,7 @@ class VectorMath {
|
|||||||
dot_product += x[i] * y[i];
|
dot_product += x[i] * y[i];
|
||||||
}
|
}
|
||||||
return dot_product;
|
return dot_product;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#elif defined(WEBRTC_HAS_NEON) && defined(WEBRTC_ARCH_ARM64)
|
#elif defined(WEBRTC_HAS_NEON) && defined(WEBRTC_ARCH_ARM64)
|
||||||
if (cpu_features_.neon) {
|
if (cpu_features_.neon) {
|
||||||
|
2
webrtc/third_party/pffft/meson.build
vendored
2
webrtc/third_party/pffft/meson.build
vendored
@ -4,7 +4,7 @@ pffft_sources = [
|
|||||||
|
|
||||||
pffft_cflags = [ '-D_GNU_SOURCE' ]
|
pffft_cflags = [ '-D_GNU_SOURCE' ]
|
||||||
|
|
||||||
if (have_arm and not have_neon) or (have_mips and host_machine.endian() == 'little') or have_mips64
|
if not have_inline_sse or (have_arm and not have_neon) or (have_mips and host_machine.endian() == 'little') or have_mips64
|
||||||
pffft_cflags += [ '-DPFFFT_SIMD_DISABLE' ]
|
pffft_cflags += [ '-DPFFFT_SIMD_DISABLE' ]
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user