Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add some small arm neon optimizations #1847

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/OpenColorIO/AVX.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#if OCIO_USE_AVX

#include <immintrin.h>
#include <stdio.h>

#include <OpenColorIO/OpenColorIO.h>
#include "BitDepthUtils.h"
Expand Down
1 change: 0 additions & 1 deletion src/OpenColorIO/AVX2.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#if OCIO_USE_AVX2

#include <immintrin.h>
#include <stdio.h>

#include <OpenColorIO/OpenColorIO.h>
#include "BitDepthUtils.h"
Expand Down
2 changes: 2 additions & 0 deletions src/OpenColorIO/CPUInfoConfig.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
// Relevant only for arm64 architecture.
#if defined(__aarch64__)
#cmakedefine01 OCIO_USE_SSE2NEON
#else
#define OCIO_USE_SSE2NEON 0
#endif

// On the Apple platform, a universal build is created for both x86_64 and arm64 architectures.
Expand Down
40 changes: 35 additions & 5 deletions src/OpenColorIO/SSE2.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
#endif
#endif

#include <stdio.h>

#include <OpenColorIO/OpenColorIO.h>
#include "BitDepthUtils.h"

Expand Down Expand Up @@ -76,6 +74,8 @@ static inline void sse2RGBATranspose_4x4(__m128 row0, __m128 row1, __m128 row2,
out_a = _mm_movehl_ps(tmp3, tmp1);
}

#if !OCIO_USE_SSE2NEON

static inline __m128i sse2_blendv(__m128i a, __m128i b, __m128i mask)
{
return _mm_xor_si128(_mm_and_si128(_mm_xor_si128(a, b), mask), a);
Expand Down Expand Up @@ -164,6 +164,8 @@ static inline __m128 sse2_cvtph_ps(__m128i a)
return _mm_or_ps(o, sign);
}

#endif

// Note Packing functions perform no 0.0 - 1.0 normalization
// but perform 0 - max value clamping for integer formats
template<BitDepth BD> struct SSE2RGBAPack {};
Expand Down Expand Up @@ -290,21 +292,48 @@ struct SSE2RGBAPack<BIT_DEPTH_F16>
__m128i rgba_00_01 = _mm_loadu_si128((const __m128i*)(in + 0));
__m128i rgba_02_03 = _mm_loadu_si128((const __m128i*)(in + 8));

#if OCIO_USE_SSE2NEON
// use neon hardware support for f16 to f32
__m128 rgba0 = vreinterpretq_m128_f32(
vcvt_f32_f16(vget_low_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_00_01))))
);
__m128 rgba1 = vreinterpretq_m128_f32(
vcvt_f32_f16(vget_high_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_00_01))))
);
__m128 rgba2 = vreinterpretq_m128_f32(
vcvt_f32_f16(vget_low_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_02_03))))
);
__m128 rgba3 = vreinterpretq_m128_f32(
vcvt_f32_f16(vget_high_f16(vreinterpretq_f16_s64(vreinterpretq_s64_m128i(rgba_02_03))))
);
#else
__m128 rgba0 = sse2_cvtph_ps(rgba_00_01);
__m128 rgba1 = sse2_cvtph_ps(_mm_shuffle_epi32(rgba_00_01, _MM_SHUFFLE(1,0,3,2)));
__m128 rgba2 = sse2_cvtph_ps(rgba_02_03);
__m128 rgba3 = sse2_cvtph_ps(_mm_shuffle_epi32(rgba_02_03, _MM_SHUFFLE(1,0,3,2)));

#endif
sse2RGBATranspose_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a);
}

static inline void Store(half *out, __m128 r, __m128 g, __m128 b, __m128 a)
{
__m128 rgba0, rgba1, rgba2, rgba3;
__m128i rgba;

sse2RGBATranspose_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3);

#if OCIO_USE_SSE2NEON
// use neon hardware support for f32 to f16
float16x8_t rgba;
float16x4_t rgba00_01 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba0));
float16x4_t rgba03_03 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba1));
float16x4_t rgba04_05 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba2));
float16x4_t rgba06_07 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba3));
rgba = vcombine_f16(rgba00_01, rgba03_03);
vst1q_f16((float16_t *)(out+0), rgba);

rgba = vcombine_f16(rgba04_05, rgba06_07);
vst1q_f16((float16_t *)(out+8), rgba);
#else
__m128i rgba;
__m128i rgba00_01 = sse2_cvtps_ph(rgba0);
__m128i rgba02_03 = sse2_cvtps_ph(rgba1);
__m128i rgba04_05 = sse2_cvtps_ph(rgba2);
Expand All @@ -315,6 +344,7 @@ struct SSE2RGBAPack<BIT_DEPTH_F16>

rgba = _mm_xor_si128(rgba04_05, _mm_shuffle_epi32(rgba06_07, _MM_SHUFFLE(1,0,3,2)));
_mm_storeu_si128((__m128i*)(out+8), rgba);
#endif
}
};

Expand Down
10 changes: 10 additions & 0 deletions src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,23 @@ namespace {

static inline __m128 fmadd_ps_sse2(__m128 a, __m128 b, __m128 c)
{
#if OCIO_USE_SSE2NEON
return vreinterpretq_m128_f32(
vfmaq_f32(vreinterpretq_f32_m128(c), vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))
);
#else
return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
}

static inline __m128 floor_ps_sse2(__m128 v)
{
#if OCIO_USE_SSE2NEON
return _mm_floor_ps(v);
#else
// NOTE: using truncate cvtt
return _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
#endif
}


Expand Down
14 changes: 14 additions & 0 deletions src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,32 @@ struct rgbavec_sse2 {

static inline __m128 floor_ps_sse2(__m128 v)
{
#if OCIO_USE_SSE2NEON
return _mm_floor_ps(v);
#else
// NOTE: using truncate cvtt
return _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
#endif
}

static inline __m128 blendv_ps_sse2(__m128 a, __m128 b, __m128 mask)
{
#if OCIO_USE_SSE2NEON
return _mm_blendv_ps(a, b, mask);
#else
return _mm_xor_ps(_mm_and_ps(_mm_xor_ps(a, b), mask), a);
#endif
}

static inline __m128 fmadd_ps_sse2(__m128 a, __m128 b, __m128 c)
{
#if OCIO_USE_SSE2NEON
return vreinterpretq_m128_f32(
vfmaq_f32(vreinterpretq_f32_m128(c), vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))
);
#else
return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
}

static inline rgbavec_sse2 interp_tetrahedral_sse2(const Lut3DContextSSE2 &ctx, __m128 r, __m128 g, __m128 b, __m128 a)
Expand Down