From 14f0afa2d8295af6f746d9edd3264a43d2c04623 Mon Sep 17 00:00:00 2001 From: Mark Reid Date: Wed, 13 Sep 2023 18:53:37 -0700 Subject: [PATCH] Add some small arm neon optimizations (#1847) * Remove unused includes Signed-off-by: Mark Reid * Use neon hardware support for f16 conversions Signed-off-by: Mark Reid * Add some small neon optimizations use blendv, floor and fma intrinsics where possible Signed-off-by: Mark Reid --------- Signed-off-by: Mark Reid --- src/OpenColorIO/AVX.h | 1 - src/OpenColorIO/AVX2.h | 1 - src/OpenColorIO/CPUInfoConfig.h.in | 2 + src/OpenColorIO/SSE2.h | 40 ++++++++++++++++--- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp | 10 +++++ src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp | 14 +++++++ 6 files changed, 61 insertions(+), 7 deletions(-) diff --git a/src/OpenColorIO/AVX.h b/src/OpenColorIO/AVX.h index 6cb2ea588..b4184f89d 100644 --- a/src/OpenColorIO/AVX.h +++ b/src/OpenColorIO/AVX.h @@ -9,7 +9,6 @@ #if OCIO_USE_AVX #include -#include #include #include "BitDepthUtils.h" diff --git a/src/OpenColorIO/AVX2.h b/src/OpenColorIO/AVX2.h index 3237533bc..85bf48dc8 100644 --- a/src/OpenColorIO/AVX2.h +++ b/src/OpenColorIO/AVX2.h @@ -9,7 +9,6 @@ #if OCIO_USE_AVX2 #include -#include #include #include "BitDepthUtils.h" diff --git a/src/OpenColorIO/CPUInfoConfig.h.in b/src/OpenColorIO/CPUInfoConfig.h.in index 472af56a4..b8f5045d2 100644 --- a/src/OpenColorIO/CPUInfoConfig.h.in +++ b/src/OpenColorIO/CPUInfoConfig.h.in @@ -8,6 +8,8 @@ // Relevant only for arm64 architecture. #if defined(__aarch64__) #cmakedefine01 OCIO_USE_SSE2NEON +#else + #define OCIO_USE_SSE2NEON 0 #endif // On the Apple platform, a universal build is created for both x86_64 and arm64 architectures. 
diff --git a/src/OpenColorIO/SSE2.h b/src/OpenColorIO/SSE2.h index 2527ff084..918694fc8 100644 --- a/src/OpenColorIO/SSE2.h +++ b/src/OpenColorIO/SSE2.h @@ -18,8 +18,6 @@ #endif #endif -#include - #include #include "BitDepthUtils.h" @@ -76,6 +74,8 @@ static inline void sse2RGBATranspose_4x4(__m128 row0, __m128 row1, __m128 row2, out_a = _mm_movehl_ps(tmp3, tmp1); } +#if !OCIO_USE_SSE2NEON + static inline __m128i sse2_blendv(__m128i a, __m128i b, __m128i mask) { return _mm_xor_si128(_mm_and_si128(_mm_xor_si128(a, b), mask), a); @@ -164,6 +164,8 @@ static inline __m128 sse2_cvtph_ps(__m128i a) return _mm_or_ps(o, sign); } +#endif + // Note Packing functions perform no 0.0 - 1.0 normalization // but perform 0 - max value clamping for integer formats template struct SSE2RGBAPack {}; @@ -290,21 +292,48 @@ struct SSE2RGBAPack __m128i rgba_00_01 = _mm_loadu_si128((const __m128i*)(in + 0)); __m128i rgba_02_03 = _mm_loadu_si128((const __m128i*)(in + 8)); +#if OCIO_USE_SSE2NEON + // use neon hardware support for f16 to f32 + __m128 rgba0 = vreinterpretq_m128_f32( + vcvt_f32_f16(vget_low_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_00_01)))) + ); + __m128 rgba1 = vreinterpretq_m128_f32( + vcvt_f32_f16(vget_high_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_00_01)))) + ); + __m128 rgba2 = vreinterpretq_m128_f32( + vcvt_f32_f16(vget_low_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_02_03)))) + ); + __m128 rgba3 = vreinterpretq_m128_f32( + vcvt_f32_f16(vget_high_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_02_03)))) + ); +#else __m128 rgba0 = sse2_cvtph_ps(rgba_00_01); __m128 rgba1 = sse2_cvtph_ps(_mm_shuffle_epi32(rgba_00_01, _MM_SHUFFLE(1,0,3,2))); __m128 rgba2 = sse2_cvtph_ps(rgba_02_03); __m128 rgba3 = sse2_cvtph_ps(_mm_shuffle_epi32(rgba_02_03, _MM_SHUFFLE(1,0,3,2))); - +#endif sse2RGBATranspose_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); } static inline void Store(half *out, __m128 r, __m128 g, __m128 b, __m128 a) { __m128 rgba0, 
rgba1, rgba2, rgba3; - __m128i rgba; - sse2RGBATranspose_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); +#if OCIO_USE_SSE2NEON + // use neon hardware support for f32 to f16 + float16x8_t rgba; + float16x4_t rgba00_01 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba0)); + float16x4_t rgba03_03 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba1)); + float16x4_t rgba04_05 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba2)); + float16x4_t rgba06_07 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba3)); + rgba = vcombine_f16(rgba00_01, rgba03_03); + vst1q_f16((float16_t *)(out+0), rgba); + + rgba = vcombine_f16(rgba04_05, rgba06_07); + vst1q_f16((float16_t *)(out+8), rgba); +#else + __m128i rgba; __m128i rgba00_01 = sse2_cvtps_ph(rgba0); __m128i rgba02_03 = sse2_cvtps_ph(rgba1); __m128i rgba04_05 = sse2_cvtps_ph(rgba2); @@ -315,6 +344,7 @@ struct SSE2RGBAPack rgba = _mm_xor_si128(rgba04_05, _mm_shuffle_epi32(rgba06_07, _MM_SHUFFLE(1,0,3,2))); _mm_storeu_si128((__m128i*)(out+8), rgba); +#endif } }; diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp index d7ecfe49c..c170e1791 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp @@ -24,13 +24,23 @@ namespace { static inline __m128 fmadd_ps_sse2(__m128 a, __m128 b, __m128 c) { +#if OCIO_USE_SSE2NEON + return vreinterpretq_m128_f32( + vfmaq_f32(vreinterpretq_f32_m128(c), vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)) + ); +#else return _mm_add_ps(_mm_mul_ps(a, b), c); +#endif } static inline __m128 floor_ps_sse2(__m128 v) { +#if OCIO_USE_SSE2NEON + return _mm_floor_ps(v); +#else // NOTE: using truncate cvtt return _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); +#endif } diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp index e98a0470d..e7b338d36 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp @@ -38,18 +38,32 @@ struct 
rgbavec_sse2 { static inline __m128 floor_ps_sse2(__m128 v) { +#if OCIO_USE_SSE2NEON + return _mm_floor_ps(v); +#else // NOTE: using truncate cvtt return _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); +#endif } static inline __m128 blendv_ps_sse2(__m128 a, __m128 b, __m128 mask) { +#if OCIO_USE_SSE2NEON + return _mm_blendv_ps(a, b, mask); +#else return _mm_xor_ps(_mm_and_ps(_mm_xor_ps(a, b), mask), a); +#endif } static inline __m128 fmadd_ps_sse2(__m128 a, __m128 b, __m128 c) { +#if OCIO_USE_SSE2NEON + return vreinterpretq_m128_f32( + vfmaq_f32(vreinterpretq_f32_m128(c), vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)) + ); +#else return _mm_add_ps(_mm_mul_ps(a, b), c); +#endif } static inline rgbavec_sse2 interp_tetrahedral_sse2(const Lut3DContextSSE2 &ctx, __m128 r, __m128 g, __m128 b, __m128 a)