jack: add ARM NEON support for sample conversions

Add a test application checking accurracy and performance win of accelerated code. Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com> Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
author: Andreas Müller <schnitzeltony@googlemail.com> 2017-01-25 21:06:02 +0100
committer: Martin Jansa <Martin.Jansa@gmail.com> 2017-02-13 18:43:23 +0100
commit: d307c4f59deb22cc8dfecb88720b5162f39d895c (patch)
tree: 41bcc9e75f11dbf6133fd65d2c091fd76a51eaee /meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch
parent: 0db057da47354a7a1184ebf1d0c3c6d0ecdc89aa (diff)
download: meta-openembedded-contrib-d307c4f59deb22cc8dfecb88720b5162f39d895c.tar.gz
1 files changed, 433 insertions, 0 deletions
diff --git a/meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch b/meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch
new file mode 100644
index 0000000000..e0c9e8ca87
--- /dev/null
+++ b/meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch
@@ -0,0 +1,433 @@
+From d0543c0628d2c0a6d898c694003e941fa189b393 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@googlemail.com>
+Date: Sun, 15 Jan 2017 20:52:20 +0100
+Subject: [PATCH 2/2] jack_simdtests: add application checking accurracy and
+ performance of SIMD optimizations
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Upstream-Status: Submitted [1]
+
+[1] https://github.com/jackaudio/jack2/pull/250
+
+Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com>
+---
+ example-clients/simdtests.cpp | 390 ++++++++++++++++++++++++++++++++++++++++++
+ example-clients/wscript       |   3 +-
+ 2 files changed, 392 insertions(+), 1 deletion(-)
+ create mode 100644 example-clients/simdtests.cpp
+
+diff --git a/example-clients/simdtests.cpp b/example-clients/simdtests.cpp
+new file mode 100644
+index 0000000..b74d50a
+--- /dev/null
++++ b/example-clients/simdtests.cpp
+@@ -0,0 +1,390 @@
++/*
++ *  simdtests.c -- test accuraccy and performance of simd optimizations
++ *
++ *  Copyright (C) 2017 Andreas Mueller.
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License as published by
++ *  the Free Software Foundation; either version 2 of the License, or
++ *  (at your option) any later version.
++ *
++ *  This program is distributed in the hope that it will be useful,
++ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
++ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *  GNU General Public License for more details.
++ *
++ *  You should have received a copy of the GNU General Public License
++ *  along with this program; if not, write to the Free Software
++ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++ */
++
++/* We must include all headers memops.c includes to avoid trouble with
++ * out namespace game below.
++ */
++#include <stdio.h>
++#include <string.h>
++#include <math.h>
++#include <memory.h>
++#include <stdlib.h>
++#include <stdint.h>
++#include <limits.h>
++#ifdef __linux__
++#include <endian.h>
++#endif
++#include "memops.h"
++
++#if defined (__SSE2__) && !defined (__sun__)
++#include <emmintrin.h>
++#ifdef __SSE4_1__
++#include <smmintrin.h>
++#endif
++#endif
++
++#ifdef __ARM_NEON__
++#include <arm_neon.h>
++#endif
++
++// our additional headers
++#include <time.h>
++
++/* Dirty: include mempos.c twice the second time with SIMD disabled
++ * so we can compare aceelerated non accelerated
++ */
++namespace accelerated {
++#include "../common/memops.c"
++}
++
++namespace origerated {
++#ifdef __SSE2__
++#undef __SSE2__
++#endif
++
++#ifdef __ARM_NEON__
++#undef __ARM_NEON__
++#endif
++
++#include "../common/memops.c"
++}
++
++// define conversion function types
++typedef void (*t_jack_to_integer)(
++	char *dst,
++	jack_default_audio_sample_t *src,
++	unsigned long nsamples,
++	unsigned long dst_skip,
++	dither_state_t *state);
++
++typedef void (*t_integer_to_jack)(
++	jack_default_audio_sample_t *dst,
++	char *src,
++	unsigned long nsamples,
++	unsigned long src_skip);
++
++// define/setup test case data
++typedef struct test_case_data {
++	uint32_t frame_size;
++	uint32_t sample_size;
++	bool reverse;
++	t_jack_to_integer jack_to_integer_accel;
++	t_jack_to_integer jack_to_integer_orig;
++	t_integer_to_jack integer_to_jack_accel;
++	t_integer_to_jack integer_to_jack_orig;
++	dither_state_t *ditherstate;
++	const char *name;
++} test_case_data_t;
++
++test_case_data_t test_cases[] = {
++	{
++		4,
++		3,
++		true,
++		accelerated::sample_move_d32u24_sSs,
++		origerated::sample_move_d32u24_sSs,
++		accelerated::sample_move_dS_s32u24s,
++		origerated::sample_move_dS_s32u24s,
++		NULL,
++		"32u24s" },
++	{
++		4,
++		3,
++		false,
++		accelerated::sample_move_d32u24_sS,
++		origerated::sample_move_d32u24_sS,
++		accelerated::sample_move_dS_s32u24,
++		origerated::sample_move_dS_s32u24,
++		NULL,
++		"32u24" },
++	{
++		3,
++		3,
++		true,
++		accelerated::sample_move_d24_sSs,
++		origerated::sample_move_d24_sSs,
++		accelerated::sample_move_dS_s24s,
++		origerated::sample_move_dS_s24s,
++		NULL,
++		"24s" },
++	{
++		3,
++		3,
++		false,
++		accelerated::sample_move_d24_sS,
++		origerated::sample_move_d24_sS,
++		accelerated::sample_move_dS_s24,
++		origerated::sample_move_dS_s24,
++		NULL,
++		"24" },
++	{
++		2,
++		2,
++		true,
++		accelerated::sample_move_d16_sSs,
++		origerated::sample_move_d16_sSs,
++		accelerated::sample_move_dS_s16s,
++		origerated::sample_move_dS_s16s,
++		NULL,
++		"16s" },
++	{
++		2,
++		2,
++		false,
++		accelerated::sample_move_d16_sS,
++		origerated::sample_move_d16_sS,
++		accelerated::sample_move_dS_s16,
++		origerated::sample_move_dS_s16,
++		NULL,
++		"16" },
++};
++
++// we need to repeat for better accuracy at time measurement
++const uint32_t retry_per_case = 1000;
++
++// setup test buffers
++#define TESTBUFF_SIZE 1024
++jack_default_audio_sample_t jackbuffer_source[TESTBUFF_SIZE];
++// integer buffers: max 4 bytes per value / * 2 for stereo
++char integerbuffer_accel[TESTBUFF_SIZE*4*2];
++char integerbuffer_orig[TESTBUFF_SIZE*4*2];
++// float buffers
++jack_default_audio_sample_t jackfloatbuffer_accel[TESTBUFF_SIZE];
++jack_default_audio_sample_t jackfloatbuffer_orig[TESTBUFF_SIZE];
++
++// comparing unsigned makes life easier
++uint32_t extract_integer(
++	char* buff,
++	uint32_t offset,
++	uint32_t frame_size,
++	uint32_t sample_size,
++	bool big_endian)
++{
++	uint32_t retval = 0;
++	unsigned char* curr;
++	uint32_t mult = 1;
++	if(big_endian) {
++		curr = (unsigned char*)buff + offset + sample_size-1;
++		for(uint32_t i=0; i<sample_size; i++) {
++			retval += *(curr--) * mult;
++			mult*=256;
++		}
++	}
++	else {
++		curr = (unsigned char*)buff + offset + frame_size-sample_size;
++		for(uint32_t i=0; i<sample_size; i++) {
++			retval += *(curr++) * mult;
++			mult*=256;
++		}
++	}
++	return retval;
++}
++
++int main(int argc, char *argv[])
++{
++//	parse_arguments(argc, argv);
++	uint32_t maxerr_displayed = 10;
++
++	// fill jackbuffer
++	for(int i=0; i<TESTBUFF_SIZE; i++) {
++		// ramp
++		jack_default_audio_sample_t value =
++			((jack_default_audio_sample_t)((i % TESTBUFF_SIZE) - TESTBUFF_SIZE/2)) / (TESTBUFF_SIZE/2);
++		// force clipping
++		value *= 1.02;
++		jackbuffer_source[i] = value;
++	}
++
++	for(uint32_t testcase=0; testcase<sizeof(test_cases)/sizeof(test_case_data_t); testcase++) {
++		// test mono/stereo
++		for(uint32_t channels=1; channels<=2; channels++) {
++			//////////////////////////////////////////////////////////////////////////////
++			// jackfloat -> integer
++
++			// clean target buffers
++			memset(integerbuffer_accel, 0, sizeof(integerbuffer_accel));
++			memset(integerbuffer_orig, 0, sizeof(integerbuffer_orig));
++			// accel
++			clock_t time_to_integer_accel = clock();
++			for(uint32_t repetition=0; repetition<retry_per_case; repetition++)
++			{
++				test_cases[testcase].jack_to_integer_accel(
++					integerbuffer_accel,
++					jackbuffer_source,
++					TESTBUFF_SIZE,
++					test_cases[testcase].frame_size*channels,
++					test_cases[testcase].ditherstate);
++			}
++			float timediff_to_integer_accel = ((float)(clock() - time_to_integer_accel)) / CLOCKS_PER_SEC;
++			// orig
++			clock_t time_to_integer_orig = clock();
++			for(uint32_t repetition=0; repetition<retry_per_case; repetition++)
++			{
++				test_cases[testcase].jack_to_integer_orig(
++					integerbuffer_orig,
++					jackbuffer_source,
++					TESTBUFF_SIZE,
++					test_cases[testcase].frame_size*channels,
++					test_cases[testcase].ditherstate);
++			}
++			float timediff_to_integer_orig = ((float)(clock() - time_to_integer_orig)) / CLOCKS_PER_SEC;
++			// output performance results
++			printf(
++				"JackFloat->Integer @%7.7s/%u: Orig %7.6f sec / Accel %7.6f sec -> Win: %5.2f %%\n",
++				test_cases[testcase].name,
++				channels,
++				timediff_to_integer_orig,
++				timediff_to_integer_accel,
++				(timediff_to_integer_orig/timediff_to_integer_accel-1)*100.0);
++			uint32_t int_deviation_max = 0;
++			uint32_t int_error_count = 0;
++			// output error (avoid spam -> limit error lines per test case)
++			for(uint32_t sample=0; sample<TESTBUFF_SIZE; sample++) {
++				uint32_t sample_offset = sample*test_cases[testcase].frame_size*channels;
++				// compare both results
++				uint32_t intval_accel=extract_integer(
++					integerbuffer_accel,
++					sample_offset,
++					test_cases[testcase].frame_size,
++					test_cases[testcase].sample_size,
++#if __BYTE_ORDER == __BIG_ENDIAN
++					!test_cases[testcase].reverse);
++#else
++					test_cases[testcase].reverse);
++#endif
++				uint32_t intval_orig=extract_integer(
++					integerbuffer_orig,
++					sample_offset,
++					test_cases[testcase].frame_size,
++					test_cases[testcase].sample_size,
++#if __BYTE_ORDER == __BIG_ENDIAN
++					!test_cases[testcase].reverse);
++#else
++					test_cases[testcase].reverse);
++#endif
++				if(intval_accel != intval_orig) {
++					if(int_error_count<maxerr_displayed) {
++						printf("Value error sample %u:", sample);
++						printf(" Orig 0x");
++						char formatstr[10];
++						sprintf(formatstr, "%%0%uX", test_cases[testcase].sample_size*2);
++						printf(formatstr, intval_orig);
++						printf(" Accel 0x");
++						printf(formatstr, intval_accel);
++						printf("\n");
++					}
++					int_error_count++;
++					uint32_t int_deviation;
++					if(intval_accel > intval_orig)
++						int_deviation = intval_accel-intval_orig;
++					else
++						int_deviation = intval_orig-intval_accel;
++					if(int_deviation > int_deviation_max)
++						int_deviation_max = int_deviation;
++				}
++			}
++			printf(
++				"JackFloat->Integer @%7.7s/%u: Errors: %u Max deviation %u\n",
++				test_cases[testcase].name,
++				channels,
++				int_error_count,
++				int_deviation_max);
++
++			//////////////////////////////////////////////////////////////////////////////
++			// integer -> jackfloat
++
++			// clean target buffers
++			memset(jackfloatbuffer_accel, 0, sizeof(jackfloatbuffer_accel));
++			memset(jackfloatbuffer_orig, 0, sizeof(jackfloatbuffer_orig));
++			// accel
++			clock_t time_to_float_accel = clock();
++			for(uint32_t repetition=0; repetition<retry_per_case; repetition++)
++			{
++				test_cases[testcase].integer_to_jack_accel(
++					jackfloatbuffer_accel,
++					integerbuffer_orig,
++					TESTBUFF_SIZE,
++					test_cases[testcase].frame_size*channels);
++			}
++			float timediff_to_float_accel = ((float)(clock() - time_to_float_accel)) / CLOCKS_PER_SEC;
++			// orig
++			clock_t time_to_float_orig = clock();
++			for(uint32_t repetition=0; repetition<retry_per_case; repetition++)
++			{
++				test_cases[testcase].integer_to_jack_orig(
++					jackfloatbuffer_orig,
++					integerbuffer_orig,
++					TESTBUFF_SIZE,
++					test_cases[testcase].frame_size*channels);
++			}
++			float timediff_to_float_orig = ((float)(clock() - time_to_float_orig)) / CLOCKS_PER_SEC;
++			// output performance results
++			printf(
++				"Integer->JackFloat @%7.7s/%u: Orig %7.6f sec / Accel %7.6f sec -> Win: %5.2f %%\n",
++				test_cases[testcase].name,
++				channels,
++				timediff_to_float_orig,
++				timediff_to_float_accel,
++				(timediff_to_float_orig/timediff_to_float_accel-1)*100.0);
++			jack_default_audio_sample_t float_deviation_max = 0.0;
++			uint32_t float_error_count = 0;
++			// output error (avoid spam -> limit error lines per test case)
++			for(uint32_t sample=0; sample<TESTBUFF_SIZE; sample++) {
++				// For easier estimation/readabilty we scale floats back to integer
++				jack_default_audio_sample_t sample_scaling;
++				switch(test_cases[testcase].sample_size) {
++					case 2:
++						sample_scaling = SAMPLE_16BIT_SCALING;
++						break;
++					default:
++						sample_scaling = SAMPLE_24BIT_SCALING;
++						break;
++				}
++				jack_default_audio_sample_t floatval_accel = jackfloatbuffer_accel[sample] * sample_scaling;
++				jack_default_audio_sample_t floatval_orig = jackfloatbuffer_orig[sample] * sample_scaling;
++				// compare both results
++				jack_default_audio_sample_t float_deviation;
++				if(floatval_accel > floatval_orig)
++					float_deviation = floatval_accel-floatval_orig;
++				else
++					float_deviation = floatval_orig-floatval_accel;
++				if(float_deviation > float_deviation_max)
++					float_deviation_max = float_deviation;
++				// deviation > half bit => error
++				if(float_deviation > 0.5) {
++					if(float_error_count<maxerr_displayed) {
++						printf("Value error sample %u:", sample);
++						printf(" Orig %8.1f Accel %8.1f\n", floatval_orig, floatval_accel);
++					}
++					float_error_count++;
++				}
++			}
++			printf(
++				"Integer->JackFloat @%7.7s/%u: Errors: %u Max deviation %f\n",
++				test_cases[testcase].name,
++				channels,
++				float_error_count,
++				float_deviation_max);
++
++			printf("\n");
++		}
++	}
++	return 0;
++}
+diff --git a/example-clients/wscript b/example-clients/wscript
+index ba67614..1b2f674 100644
+--- a/example-clients/wscript
++++ b/example-clients/wscript
+@@ -28,7 +28,8 @@ example_programs = {
+     'jack_net_master' : 'netmaster.c',
+     'jack_latent_client' : 'latent_client.c',
+     'jack_midi_dump' : 'midi_dump.c',
+-    'jack_midi_latency_test' : 'midi_latency_test.c'
++    'jack_midi_latency_test' : 'midi_latency_test.c',
++    'jack_simdtests' : 'simdtests.cpp'
+     }
+ 
+ example_libs = {
+-- 
+2.5.5
+
author	Andreas Müller <schnitzeltony@googlemail.com>	2017-01-25 21:06:02 +0100
committer	Martin Jansa <Martin.Jansa@gmail.com>	2017-02-13 18:43:23 +0100
commit	d307c4f59deb22cc8dfecb88720b5162f39d895c (patch)
tree	41bcc9e75f11dbf6133fd65d2c091fd76a51eaee /meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch
parent	0db057da47354a7a1184ebf1d0c3c6d0ecdc89aa (diff)
download	meta-openembedded-contrib-d307c4f59deb22cc8dfecb88720b5162f39d895c.tar.gz