aboutsummaryrefslogtreecommitdiffstats
path: root/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch')
-rw-r--r--meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch496
1 files changed, 496 insertions, 0 deletions
diff --git a/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch b/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch
new file mode 100644
index 0000000000..76ec7136b3
--- /dev/null
+++ b/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch
@@ -0,0 +1,496 @@
+From 99785aabc685a94415fcd445345c093488e10350 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@googlemail.com>
+Date: Fri, 13 Jan 2017 22:42:11 +0100
+Subject: [PATCH 1/2] Add ARM-NEON acceleration for all non-dithering sample
+ conversion functions
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Upstream-Status: Submitted [1]
+
+[1] https://github.com/jackaudio/jack2/pull/250
+
+Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com>
+---
+ common/memops.c | 356 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 351 insertions(+), 5 deletions(-)
+
+diff --git a/common/memops.c b/common/memops.c
+index 2ff0792..8f9ece2 100644
+--- a/common/memops.c
++++ b/common/memops.c
+@@ -42,6 +42,10 @@
+ #endif
+ #endif
+
++#ifdef __ARM_NEON__
++#include <arm_neon.h>
++#endif
++
+ /* Notes about these *_SCALING values.
+
+ the MAX_<N>BIT values are floating point. when multiplied by
+@@ -193,6 +197,35 @@ static inline __m128i float_24_sse(__m128 s)
+ }
+ #endif
+
++
++#ifdef __ARM_NEON__
++
++static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
++{
++ return vminq_f32(max, vmaxq_f32(s, min));
++}
++
++static inline int32x4_t float_24_neon(float32x4_t s)
++{
++ const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
++ const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);
++
++ float32x4_t clipped = clip(s, lower_bound, upper_bound);
++ float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_24BIT_SCALING));
++ return vcvtq_s32_f32(scaled);
++}
++
++static inline int16x4_t float_16_neon(float32x4_t s)
++{
++ const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
++ const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);
++
++ float32x4_t clipped = clip(s, lower_bound, upper_bound);
++ float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_16BIT_SCALING));
++ return vmovn_s32(vcvtq_s32_f32(scaled));
++}
++#endif
++
+ /* Linear Congruential noise generator. From the music-dsp list
+ * less random than rand(), but good enough and 10x faster
+ */
+@@ -248,6 +281,32 @@ void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsign
+
+ void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
+ {
++#ifdef __ARM_NEON__
++ unsigned long unrolled = nsamples / 4;
++ nsamples = nsamples & 3;
++
++ while (unrolled--) {
++ float32x4_t samples = vld1q_f32(src);
++ int32x4_t converted = float_24_neon(samples);
++ int32x4_t shifted = vshlq_n_s32(converted, 8);
++ shifted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted)));
++
++ switch(dst_skip) {
++ case 4:
++ vst1q_s32((int32_t*)dst, shifted);
++ break;
++ default:
++ vst1q_lane_s32((int32_t*)(dst), shifted, 0);
++ vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
++ vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
++ vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
++ break;
++ }
++ dst += 4*dst_skip;
++ src+= 4;
++ }
++#endif
++
+ int32_t z;
+
+ while (nsamples--) {
+@@ -321,7 +380,33 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne
+ src++;
+ }
+
+-#else
++#elif defined(__ARM_NEON__)
++ unsigned long unrolled = nsamples / 4;
++ nsamples = nsamples & 3;
++
++ while (unrolled--) {
++ float32x4_t samples = vld1q_f32(src);
++ int32x4_t converted = float_24_neon(samples);
++ int32x4_t shifted = vshlq_n_s32(converted, 8);
++
++ switch(dst_skip) {
++ case 4:
++ vst1q_s32((int32_t*)dst, shifted);
++ break;
++ default:
++ vst1q_lane_s32((int32_t*)(dst), shifted, 0);
++ vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
++ vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
++ vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
++ break;
++ }
++ dst += 4*dst_skip;
++
++ src+= 4;
++ }
++#endif
++
++#if !defined (__SSE2__)
+ while (nsamples--) {
+ float_24u32 (*src, *((int32_t*) dst));
+ dst += dst_skip;
+@@ -332,6 +417,38 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne
+
+ void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
+ {
++#ifdef __ARM_NEON__
++ float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
++ unsigned long unrolled = nsamples / 4;
++ while (unrolled--) {
++ int32x4_t src128;
++ switch(src_skip)
++ {
++ case 4:
++ src128 = vld1q_s32((int32_t*)src);
++ break;
++ case 8:
++ src128 = vld2q_s32((int32_t*)src).val[0];
++ break;
++ default:
++ src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
++ src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
++ src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
++ src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
++ break;
++ }
++ src128 = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128)));
++ int32x4_t shifted = vshrq_n_s32(src128, 8);
++ float32x4_t as_float = vcvtq_f32_s32(shifted);
++ float32x4_t divided = vmulq_f32(as_float, factor);
++ vst1q_f32(dst, divided);
++
++ src += 4*src_skip;
++ dst += 4;
++ }
++ nsamples = nsamples & 3;
++#endif
++
+ /* ALERT: signed sign-extension portability !!! */
+
+ const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
+@@ -389,6 +506,34 @@ void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigne
+ dst += 4;
+ }
+ nsamples = nsamples & 3;
++#elif defined(__ARM_NEON__)
++ unsigned long unrolled = nsamples / 4;
++ float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
++ while (unrolled--) {
++ int32x4_t src128;
++ switch(src_skip) {
++ case 4:
++ src128 = vld1q_s32((int32_t*)src);
++ break;
++ case 8:
++ src128 = vld2q_s32((int32_t*)src).val[0];
++ break;
++ default:
++ src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
++ src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
++ src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
++ src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
++ break;
++ }
++ int32x4_t shifted = vshrq_n_s32(src128, 8);
++ float32x4_t as_float = vcvtq_f32_s32(shifted);
++ float32x4_t divided = vmulq_f32(as_float, factor);
++ vst1q_f32(dst, divided);
++
++ src += 4*src_skip;
++ dst += 4;
++ }
++ nsamples = nsamples & 3;
+ #endif
+
+ /* ALERT: signed sign-extension portability !!! */
+@@ -403,6 +548,24 @@ void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigne
+
+ void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
+ {
++#ifdef __ARM_NEON__
++ unsigned long unrolled = nsamples / 4;
++ while (unrolled--) {
++ int32_t z[4];
++ float32x4_t samples = vld1q_f32(src);
++ int32x4_t converted = float_24_neon(samples);
++ converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));
++ vst1q_s32(z, converted);
++
++ for (int i = 0; i != 4; ++i) {
++ memcpy (dst, ((char*)(z+i))+1, 3);
++ dst += dst_skip;
++ }
++ src += 4;
++ }
++ nsamples = nsamples & 3;
++#endif
++
+ int32_t z;
+
+ while (nsamples--) {
+@@ -426,7 +589,6 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
+ #if defined (__SSE2__) && !defined (__sun__)
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ while (nsamples >= 4) {
+- int i;
+ int32_t z[4];
+ __m128 samples = _mm_loadu_ps(src);
+ __m128i converted = float_24_sse(samples);
+@@ -447,7 +609,7 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
+ _mm_store_ss((float*)z+3, (__m128)shuffled3);
+ #endif
+
+- for (i = 0; i != 4; ++i) {
++ for (int i = 0; i != 4; ++i) {
+ memcpy (dst, z+i, 3);
+ dst += dst_skip;
+ }
+@@ -455,6 +617,22 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
+ nsamples -= 4;
+ src += 4;
+ }
++#elif defined(__ARM_NEON__)
++ unsigned long unrolled = nsamples / 4;
++ while (unrolled--) {
++ int i;
++ int32_t z[4];
++ float32x4_t samples = vld1q_f32(src);
++ int32x4_t converted = float_24_neon(samples);
++ vst1q_s32(z, converted);
++
++ for (i = 0; i != 4; ++i) {
++ memcpy (dst, z+i, 3);
++ dst += dst_skip;
++ }
++ src += 4;
++ }
++ nsamples = nsamples & 3;
+ #endif
+
+ int32_t z;
+@@ -473,9 +651,41 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
+
+ void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
+ {
++ const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
++
++#ifdef __ARM_NEON__
++ // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
++ const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
++ int32_t x[4];
++ memset(x, 0, sizeof(x));
++ unsigned long unrolled = nsamples / 4;
++ while (unrolled--) {
++#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
++ // right aligned / inverse sequence below -> *256
++ memcpy(((char*)&x[0])+1, src, 3);
++ memcpy(((char*)&x[1])+1, src+src_skip, 3);
++ memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
++ memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
++#else
++ memcpy(&x[0], src, 3);
++ memcpy(&x[1], src+src_skip, 3);
++ memcpy(&x[2], src+2*src_skip, 3);
++ memcpy(&x[3], src+3*src_skip, 3);
++#endif
++ src += 4 * src_skip;
++
++ int32x4_t source = vld1q_s32(x);
++ source = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source)));
++ float32x4_t converted = vcvtq_f32_s32(source);
++ float32x4_t scaled = vmulq_f32(converted, vscaling);
++ vst1q_f32(dst, scaled);
++ dst += 4;
++ }
++ nsamples = nsamples & 3;
++#endif
++
+ /* ALERT: signed sign-extension portability !!! */
+
+- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
+ while (nsamples--) {
+ int x;
+ #if __BYTE_ORDER == __LITTLE_ENDIAN
+@@ -528,6 +738,34 @@ void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned l
+ dst += 4;
+ nsamples -= 4;
+ }
++#elif defined(__ARM_NEON__)
++ // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
++ const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
++ int32_t x[4];
++ memset(x, 0, sizeof(x));
++ unsigned long unrolled = nsamples / 4;
++ while (unrolled--) {
++#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
++ // left aligned -> *256
++ memcpy(&x[0], src, 3);
++ memcpy(&x[1], src+src_skip, 3);
++ memcpy(&x[2], src+2*src_skip, 3);
++ memcpy(&x[3], src+3*src_skip, 3);
++#else
++ memcpy(((char*)&x[0])+1, src, 3);
++ memcpy(((char*)&x[1])+1, src+src_skip, 3);
++ memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
++ memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
++#endif
++ src += 4 * src_skip;
++
++ int32x4_t source = vld1q_s32(x);
++ float32x4_t converted = vcvtq_f32_s32(source);
++ float32x4_t scaled = vmulq_f32(converted, vscaling);
++ vst1q_f32(dst, scaled);
++ dst += 4;
++ }
++ nsamples = nsamples & 3;
+ #endif
+
+ while (nsamples--) {
+@@ -547,6 +785,30 @@ void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned l
+
+ void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
+ {
++#ifdef __ARM_NEON__
++ unsigned long unrolled = nsamples / 4;
++ nsamples = nsamples & 3;
++
++ while (unrolled--) {
++ float32x4_t samples = vld1q_f32(src);
++ int16x4_t converted = float_16_neon(samples);
++ converted = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted)));
++
++ switch(dst_skip) {
++ case 2:
++ vst1_s16((int16_t*)dst, converted);
++ break;
++ default:
++ vst1_lane_s16((int16_t*)(dst), converted, 0);
++ vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
++ vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
++ vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
++ break;
++ }
++ dst += 4*dst_skip;
++ src+= 4;
++ }
++#endif
+ int16_t tmp;
+
+ while (nsamples--) {
+@@ -574,6 +836,29 @@ void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned
+
+ void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
+ {
++#ifdef __ARM_NEON__
++ unsigned long unrolled = nsamples / 4;
++ nsamples = nsamples & 3;
++
++ while (unrolled--) {
++ float32x4_t samples = vld1q_f32(src);
++ int16x4_t converted = float_16_neon(samples);
++
++ switch(dst_skip) {
++ case 2:
++ vst1_s16((int16_t*)dst, converted);
++ break;
++ default:
++ vst1_lane_s16((int16_t*)(dst), converted, 0);
++ vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
++ vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
++ vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
++ break;
++ }
++ dst += 4*dst_skip;
++ src+= 4;
++ }
++#endif
+ while (nsamples--) {
+ float_16 (*src, *((int16_t*) dst));
+ dst += dst_skip;
+@@ -728,8 +1013,39 @@ void sample_move_dither_shaped_d16_sS (char *dst, jack_default_audio_sample_t *
+
+ void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
+ {
+- short z;
+ const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
++#ifdef __ARM_NEON__
++ const float32x4_t vscaling = vdupq_n_f32(scaling);
++ unsigned long unrolled = nsamples / 4;
++ while (unrolled--) {
++ int16x4_t source16x4;
++ switch(src_skip) {
++ case 2:
++ source16x4 = vld1_s16((int16_t*)src);
++ break;
++ case 4:
++ source16x4 = vld2_s16((int16_t*)src).val[0];
++ break;
++ default:
++ source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
++ source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
++ source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
++ source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
++ break;
++ }
++ source16x4 = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4)));
++ int32x4_t source32x4 = vmovl_s16(source16x4);
++ src += 4 * src_skip;
++
++ float32x4_t converted = vcvtq_f32_s32(source32x4);
++ float32x4_t scaled = vmulq_f32(converted, vscaling);
++ vst1q_f32(dst, scaled);
++ dst += 4;
++ }
++ nsamples = nsamples & 3;
++#endif
++
++ short z;
+
+ /* ALERT: signed sign-extension portability !!! */
+ while (nsamples--) {
+@@ -752,6 +1068,36 @@ void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned l
+ {
+ /* ALERT: signed sign-extension portability !!! */
+ const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
++#ifdef __ARM_NEON__
++ const float32x4_t vscaling = vdupq_n_f32(scaling);
++ unsigned long unrolled = nsamples / 4;
++ while (unrolled--) {
++ int16x4_t source16x4;
++ switch(src_skip) {
++ case 2:
++ source16x4 = vld1_s16((int16_t*)src);
++ break;
++ case 4:
++ source16x4 = vld2_s16((int16_t*)src).val[0];
++ break;
++ default:
++ source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
++ source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
++ source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
++ source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
++ break;
++ }
++ int32x4_t source32x4 = vmovl_s16(source16x4);
++ src += 4 * src_skip;
++
++ float32x4_t converted = vcvtq_f32_s32(source32x4);
++ float32x4_t scaled = vmulq_f32(converted, vscaling);
++ vst1q_f32(dst, scaled);
++ dst += 4;
++ }
++ nsamples = nsamples & 3;
++#endif
++
+ while (nsamples--) {
+ *dst = (*((short *) src)) * scaling;
+ dst++;
+--
+2.5.5
+