From fed81a77c9a9bc366556f732324cdc5f9e7b09e9 Mon Sep 17 00:00:00 2001 From: Arun Raghavan Date: Thu, 26 Dec 2024 14:24:40 -0500 Subject: [PATCH] Allow disabling inline SSE Should make building on i686 without SSE feasible. Fixes: https://gitlab.freedesktop.org/pulseaudio/webrtc-audio-processing/-/issues/5 --- meson.build | 14 ++++++++++++-- meson_options.txt | 5 ++++- .../audio_processing/aec3/adaptive_fir_filter.cc | 14 ++++++++++---- .../aec3/adaptive_fir_filter_erl.cc | 6 ++++-- webrtc/modules/audio_processing/aec3/fft_data.h | 4 +++- .../audio_processing/aec3/matched_filter.cc | 6 ++++-- webrtc/modules/audio_processing/aec3/vector_math.h | 8 +++++--- .../audio_processing/agc2/rnn_vad/vector_math.h | 4 +++- webrtc/third_party/pffft/meson.build | 2 +- 9 files changed, 46 insertions(+), 17 deletions(-) diff --git a/meson.build b/meson.build index 811d795..ebf053a 100644 --- a/meson.build +++ b/meson.build @@ -110,6 +110,7 @@ have_neon = false have_mips = false have_mips64 = false have_x86 = false +have_inline_sse = false have_avx2 = false if host_machine.cpu_family() == 'arm' if cc.compiles('''#ifndef __ARM_ARCH_ISA_ARM @@ -140,10 +141,19 @@ if host_machine.cpu_family() == 'mips64' endif if ['x86', 'x86_64'].contains(host_machine.cpu_family()) have_x86 = true - # This is unconditionally enabled for now, actual usage is determined by - # runtime CPU detection, so we're just assuming the compiler supports avx2 + # AVX2 support is unconditionally available, since all the code (compiled + # with -mavx2) is in separate files from runtime detection (which should not + # be compiled with SIMD flags for cases where the CPU does not support it). + # Unfortunately, a bunch of SSE code is inline with the runtime detection, + # and we can't support that on systems that don't support SSE. have_avx2 = true arch_cflags += ['-DWEBRTC_ENABLE_AVX2'] + if get_option('inline-sse') + have_inline_sse = true + else + have_inline_sse = false + arch_cflags += ['-DWAP_DISABLE_INLINE_SSE'] + endif endif neon_opt = get_option('neon') diff --git a/meson_options.txt b/meson_options.txt index c939fb9..d08f356 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -3,4 +3,7 @@ option('gnustl', type: 'feature', description: 'Use gnustl for a c++ library implementation (only used on Android)') option('neon', type: 'combo', choices: ['no', 'yes', 'auto', 'runtime'], - description: '') + description: 'Enable NEON optimisations') +option('inline-sse', type: 'boolean', + value: true, + description: 'Enable inline SSE/SSE2 optimisations (i.e. assume CPU supports SSE/SSE2)') diff --git a/webrtc/modules/audio_processing/aec3/adaptive_fir_filter.cc b/webrtc/modules/audio_processing/aec3/adaptive_fir_filter.cc index 917aa95..ded0511 100644 --- a/webrtc/modules/audio_processing/aec3/adaptive_fir_filter.cc +++ b/webrtc/modules/audio_processing/aec3/adaptive_fir_filter.cc @@ -16,7 +16,7 @@ #if defined(WEBRTC_HAS_NEON) #include #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) #include #endif #include @@ -88,7 +88,7 @@ void ComputeFrequencyResponse_Neon( } #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) // Computes and stores the frequency response of the filter. void ComputeFrequencyResponse_Sse2( size_t num_partitions, @@ -212,7 +212,7 @@ void AdaptPartitions_Neon(const RenderBuffer& render_buffer, } #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) // Adapts the filter partitions. (SSE2 variant) void AdaptPartitions_Sse2(const RenderBuffer& render_buffer, const FftData& G, @@ -377,7 +377,7 @@ void ApplyFilter_Neon(const RenderBuffer& render_buffer, } #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) // Produces the filter output (SSE2 variant). void ApplyFilter_Sse2(const RenderBuffer& render_buffer, size_t num_partitions, @@ -557,9 +557,11 @@ void AdaptiveFirFilter::Filter(const RenderBuffer& render_buffer, RTC_DCHECK(S); switch (optimization_) { #if defined(WEBRTC_ARCH_X86_FAMILY) +#if !defined(WAP_DISABLE_INLINE_SSE) case Aec3Optimization::kSse2: aec3::ApplyFilter_Sse2(render_buffer, current_size_partitions_, H_, S); break; +#endif case Aec3Optimization::kAvx2: aec3::ApplyFilter_Avx2(render_buffer, current_size_partitions_, H_, S); break; @@ -601,9 +603,11 @@ void AdaptiveFirFilter::ComputeFrequencyResponse( switch (optimization_) { #if defined(WEBRTC_ARCH_X86_FAMILY) +#if !defined(WAP_DISABLE_INLINE_SSE) case Aec3Optimization::kSse2: aec3::ComputeFrequencyResponse_Sse2(current_size_partitions_, H_, H2); break; +#endif case Aec3Optimization::kAvx2: aec3::ComputeFrequencyResponse_Avx2(current_size_partitions_, H_, H2); break; @@ -626,10 +630,12 @@ void AdaptiveFirFilter::AdaptAndUpdateSize(const RenderBuffer& render_buffer, // Adapt the filter. switch (optimization_) { #if defined(WEBRTC_ARCH_X86_FAMILY) +#if !defined(WAP_DISABLE_INLINE_SSE) case Aec3Optimization::kSse2: aec3::AdaptPartitions_Sse2(render_buffer, G, current_size_partitions_, &H_); break; +#endif case Aec3Optimization::kAvx2: aec3::AdaptPartitions_Avx2(render_buffer, G, current_size_partitions_, &H_); diff --git a/webrtc/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc b/webrtc/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc index 45b8813..920d51c 100644 --- a/webrtc/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc +++ b/webrtc/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc @@ -16,7 +16,7 @@ #if defined(WEBRTC_HAS_NEON) #include #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) #include #endif @@ -54,7 +54,7 @@ void ErlComputer_NEON( } #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) // Computes and stores the echo return loss estimate of the filter, which is the // sum of the partition frequency responses. void ErlComputer_SSE2( @@ -82,9 +82,11 @@ void ComputeErl(const Aec3Optimization& optimization, // Update the frequency response and echo return loss for the filter. switch (optimization) { #if defined(WEBRTC_ARCH_X86_FAMILY) +#if !defined(WAP_DISABLE_INLINE_SSE) case Aec3Optimization::kSse2: aec3::ErlComputer_SSE2(H2, erl); break; +#endif case Aec3Optimization::kAvx2: aec3::ErlComputer_AVX2(H2, erl); break; diff --git a/webrtc/modules/audio_processing/aec3/fft_data.h b/webrtc/modules/audio_processing/aec3/fft_data.h index 9c25e78..892407d 100644 --- a/webrtc/modules/audio_processing/aec3/fft_data.h +++ b/webrtc/modules/audio_processing/aec3/fft_data.h @@ -14,7 +14,7 @@ // Defines WEBRTC_ARCH_X86_FAMILY, used below. #include "rtc_base/system/arch.h" -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) #include #endif #include @@ -49,6 +49,7 @@ struct FftData { RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size()); switch (optimization) { #if defined(WEBRTC_ARCH_X86_FAMILY) +#if !defined(WAP_DISABLE_INLINE_SSE) case Aec3Optimization::kSse2: { constexpr int kNumFourBinBands = kFftLengthBy2 / 4; constexpr int kLimit = kNumFourBinBands * 4; @@ -63,6 +64,7 @@ struct FftData { power_spectrum[kFftLengthBy2] = re[kFftLengthBy2] * re[kFftLengthBy2] + im[kFftLengthBy2] * im[kFftLengthBy2]; } break; +#endif case Aec3Optimization::kAvx2: SpectrumAVX2(power_spectrum); break; diff --git a/webrtc/modules/audio_processing/aec3/matched_filter.cc b/webrtc/modules/audio_processing/aec3/matched_filter.cc index 59a3b46..86f365a 100644 --- a/webrtc/modules/audio_processing/aec3/matched_filter.cc +++ b/webrtc/modules/audio_processing/aec3/matched_filter.cc @@ -15,7 +15,7 @@ #if defined(WEBRTC_HAS_NEON) #include #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) #include #endif #include @@ -286,7 +286,7 @@ void MatchedFilterCore_NEON(size_t x_start_index, #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) void MatchedFilterCore_AccumulatedError_SSE2( size_t x_start_index, @@ -695,12 +695,14 @@ void MatchedFilter::Update(const DownsampledRenderBuffer& render_buffer, switch (optimization_) { #if defined(WEBRTC_ARCH_X86_FAMILY) +#if !defined(WAP_DISABLE_INLINE_SSE) case Aec3Optimization::kSse2: aec3::MatchedFilterCore_SSE2( x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y, filters_[n], &filters_updated, &error_sum, compute_pre_echo, instantaneous_accumulated_error_, scratch_memory_); break; +#endif case Aec3Optimization::kAvx2: aec3::MatchedFilterCore_AVX2( x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y, diff --git a/webrtc/modules/audio_processing/aec3/vector_math.h b/webrtc/modules/audio_processing/aec3/vector_math.h index e4d1381..1506a44 100644 --- a/webrtc/modules/audio_processing/aec3/vector_math.h +++ b/webrtc/modules/audio_processing/aec3/vector_math.h @@ -17,7 +17,7 @@ #if defined(WEBRTC_HAS_NEON) #include #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) #include #endif #include @@ -43,7 +43,7 @@ class VectorMath { void SqrtAVX2(rtc::ArrayView x); void Sqrt(rtc::ArrayView x) { switch (optimization_) { -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) case Aec3Optimization::kSse2: { const int x_size = static_cast(x.size()); const int vector_limit = x_size >> 2; @@ -123,7 +123,7 @@ class VectorMath { RTC_DCHECK_EQ(z.size(), x.size()); RTC_DCHECK_EQ(z.size(), y.size()); switch (optimization_) { -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) case Aec3Optimization::kSse2: { const int x_size = static_cast(x.size()); const int vector_limit = x_size >> 2; @@ -174,6 +174,7 @@ class VectorMath { RTC_DCHECK_EQ(z.size(), x.size()); switch (optimization_) { #if defined(WEBRTC_ARCH_X86_FAMILY) +#if !defined(WAP_DISABLE_INLINE_SSE) case Aec3Optimization::kSse2: { const int x_size = static_cast(x.size()); const int vector_limit = x_size >> 2; @@ -190,6 +191,7 @@ class VectorMath { z[j] += x[j]; } } break; +#endif case Aec3Optimization::kAvx2: AccumulateAVX2(x, z); break; diff --git a/webrtc/modules/audio_processing/agc2/rnn_vad/vector_math.h b/webrtc/modules/audio_processing/agc2/rnn_vad/vector_math.h index 47f6811..f965086 100644 --- a/webrtc/modules/audio_processing/agc2/rnn_vad/vector_math.h +++ b/webrtc/modules/audio_processing/agc2/rnn_vad/vector_math.h @@ -17,7 +17,7 @@ #if defined(WEBRTC_HAS_NEON) #include #endif -#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE) #include #endif @@ -47,6 +47,7 @@ class VectorMath { if (cpu_features_.avx2) { return DotProductAvx2(x, y); } else if (cpu_features_.sse2) { +#if !defined(WAP_DISABLE_INLINE_SSE) __m128 accumulator = _mm_setzero_ps(); constexpr int kBlockSizeLog2 = 2; constexpr int kBlockSize = 1 << kBlockSizeLog2; @@ -72,6 +73,7 @@ class VectorMath { dot_product += x[i] * y[i]; } return dot_product; +#endif } #elif defined(WEBRTC_HAS_NEON) && defined(WEBRTC_ARCH_ARM64) if (cpu_features_.neon) { diff --git a/webrtc/third_party/pffft/meson.build b/webrtc/third_party/pffft/meson.build index c1eb5c6..cf4c9c7 100644 --- a/webrtc/third_party/pffft/meson.build +++ b/webrtc/third_party/pffft/meson.build @@ -4,7 +4,7 @@ pffft_sources = [ pffft_cflags = [ '-D_GNU_SOURCE' ] -if (have_arm and not have_neon) or (have_mips and host_machine.endian() == 'little') or have_mips64 +if not have_inline_sse or (have_arm and not have_neon) or (have_mips and host_machine.endian() == 'little') or have_mips64 pffft_cflags += [ '-DPFFFT_SIMD_DISABLE' ] endif