diff options
author | Koen Kooi <koen@openembedded.org> | 2011-04-05 13:00:12 +0200 |
---|---|---|
committer | Koen Kooi <koen@openembedded.org> | 2011-04-05 15:07:59 +0200 |
commit | c3265b14b23e1aec54f7794e753b28f0d0622d86 (patch) | |
tree | c84f6cda614e47d02328b19eccb99bb6c6b34aeb /recipes/xorg-lib/pixman-0.21.6 | |
parent | 84f0436d63aef5fce34eb0c6d5b07a4e312b7049 (diff) | |
download | openembedded-c3265b14b23e1aec54f7794e753b28f0d0622d86.tar.gz |
pixman: add 0.21.6 + fixes
Signed-off-by: Koen Kooi <koen@openembedded.org>
Acked-by: Martin Jansa <Martin.Jansa@gmail.com>
Diffstat (limited to 'recipes/xorg-lib/pixman-0.21.6')
39 files changed, 5504 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman-0.21.6/0002-Fix-compilation-on-Win32.patch b/recipes/xorg-lib/pixman-0.21.6/0002-Fix-compilation-on-Win32.patch new file mode 100644 index 0000000000..16b6ff13f9 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0002-Fix-compilation-on-Win32.patch @@ -0,0 +1,42 @@ +From 20ed723a5a42fb8636bc9a5f32974dec1b66a785 Mon Sep 17 00:00:00 2001 +From: Andrea Canciani <ranma42@gmail.com> +Date: Thu, 24 Feb 2011 10:44:04 +0100 +Subject: [PATCH 02/40] Fix compilation on Win32 + +Makefile.win32 contained a typo and was missing the dependency from +the built sources. +--- + pixman/Makefile.win32 | 6 ++++-- + 1 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/pixman/Makefile.win32 b/pixman/Makefile.win32 +index 775fb5e..b5f9397 100644 +--- a/pixman/Makefile.win32 ++++ b/pixman/Makefile.win32 +@@ -56,6 +56,8 @@ SOURCES = \ + pixman-general.c \ + $(NULL) + ++BUILT_SOURCES = pixman-combine32.h pixman-combine32.c pixman-combine64.h pixman-combine64.c ++ + # MMX compilation flags + ifeq ($(MMX_VAR),on) + CFLAGS += $(MMX_CFLAGS) +@@ -122,7 +124,7 @@ endif + endif + + # pixman compilation and linking +-$(CFG_VAR)/%.obj: %.c ++$(CFG_VAR)/%.obj: %.c $(BUILT_SOURCES) + @mkdir -p $(CFG_VAR) + @$(CC) -c $(CFLAGS) -Fo"$@" $< + +@@ -141,4 +143,4 @@ pixman-combine64.h: pixman-combine.h.template make-combine.pl + + clean_r: + @rm -f $(CFG_VAR)/*.obj $(CFG_VAR)/*.lib $(CFG_VAR)/*.pdb $(CFG)/*.ilk || exit 0 +- @rm -f $(CFG)/*.obj $(CFG)/*.lib $(CFG)/*.pdb $(CFG)/*.ilk pixman-combine32.c pixman-combine64.c pixman-combine64.c pixman-combine64.h || exit 0 ++ @rm -f $(CFG)/*.obj $(CFG)/*.lib $(CFG)/*.pdb $(CFG)/*.ilk $(BUILT_SOURCES) || exit 0 +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0003-test-Fix-tests-for-compilation-on-Windows.patch b/recipes/xorg-lib/pixman-0.21.6/0003-test-Fix-tests-for-compilation-on-Windows.patch new file mode 100644 index 0000000000..33351a991c --- /dev/null +++ 
b/recipes/xorg-lib/pixman-0.21.6/0003-test-Fix-tests-for-compilation-on-Windows.patch @@ -0,0 +1,232 @@ +From 11305b4ecdd36a17592c5c75de9157874853ab20 Mon Sep 17 00:00:00 2001 +From: Andrea Canciani <ranma42@gmail.com> +Date: Tue, 22 Feb 2011 21:46:37 +0100 +Subject: [PATCH 03/40] test: Fix tests for compilation on Windows + +The Microsoft C compiler cannot handle subobject initialization and +Win32 does not provide snprintf. + +Work around these limitations by using normal struct initialization +and using sprintf (a manual check shows that the buffer size is +sufficient). +--- + test/composite.c | 29 +++++++++++++-------------- + test/fetch-test.c | 52 ++++++++++++++++++++++---------------------------- + test/trap-crasher.c | 20 +++++++++--------- + 3 files changed, 47 insertions(+), 54 deletions(-) + +diff --git a/test/composite.c b/test/composite.c +index e14f954..08c6689 100644 +--- a/test/composite.c ++++ b/test/composite.c +@@ -617,18 +617,18 @@ eval_diff (color_t *expected, color_t *test, pixman_format_code_t format) + } + + static char * +-describe_image (image_t *info, char *buf, int buflen) ++describe_image (image_t *info, char *buf) + { + if (info->size) + { +- snprintf (buf, buflen, "%s %dx%d%s", +- info->format->name, +- info->size, info->size, +- info->repeat ? "R" :""); ++ sprintf (buf, "%s %dx%d%s", ++ info->format->name, ++ info->size, info->size, ++ info->repeat ? "R" :""); + } + else + { +- snprintf (buf, buflen, "solid"); ++ sprintf (buf, "solid"); + } + + return buf; +@@ -710,10 +710,9 @@ composite_test (image_t *dst, + { + char buf[40]; + +- snprintf (buf, sizeof (buf), +- "%s %scomposite", +- op->name, +- component_alpha ? "CA " : ""); ++ sprintf (buf, "%s %scomposite", ++ op->name, ++ component_alpha ? 
"CA " : ""); + + printf ("%s test error of %.4f --\n" + " R G B A\n" +@@ -735,9 +734,9 @@ composite_test (image_t *dst, + mask->color->b, mask->color->a, + dst->color->r, dst->color->g, + dst->color->b, dst->color->a); +- printf ("src: %s, ", describe_image (src, buf, sizeof (buf))); +- printf ("mask: %s, ", describe_image (mask, buf, sizeof (buf))); +- printf ("dst: %s\n\n", describe_image (dst, buf, sizeof (buf))); ++ printf ("src: %s, ", describe_image (src, buf)); ++ printf ("mask: %s, ", describe_image (mask, buf)); ++ printf ("dst: %s\n\n", describe_image (dst, buf)); + } + else + { +@@ -747,8 +746,8 @@ composite_test (image_t *dst, + src->color->b, src->color->a, + dst->color->r, dst->color->g, + dst->color->b, dst->color->a); +- printf ("src: %s, ", describe_image (src, buf, sizeof (buf))); +- printf ("dst: %s\n\n", describe_image (dst, buf, sizeof (buf))); ++ printf ("src: %s, ", describe_image (src, buf)); ++ printf ("dst: %s\n\n", describe_image (dst, buf)); + } + + success = FALSE; +diff --git a/test/fetch-test.c b/test/fetch-test.c +index 2ca16dd..314a072 100644 +--- a/test/fetch-test.c ++++ b/test/fetch-test.c +@@ -8,7 +8,7 @@ + + static pixman_indexed_t mono_palette = + { +- .rgba = { 0x00000000, 0x00ffffff }, ++ 0, { 0x00000000, 0x00ffffff }, + }; + + +@@ -24,57 +24,53 @@ typedef struct { + static testcase_t testcases[] = + { + { +- .format = PIXMAN_a8r8g8b8, +- .width = 2, .height = 2, +- .stride = 8, +- .src = { 0x00112233, 0x44556677, +- 0x8899aabb, 0xccddeeff }, +- .dst = { 0x00112233, 0x44556677, +- 0x8899aabb, 0xccddeeff }, +- .indexed = NULL, ++ PIXMAN_a8r8g8b8, ++ 2, 2, ++ 8, ++ { 0x00112233, 0x44556677, ++ 0x8899aabb, 0xccddeeff }, ++ { 0x00112233, 0x44556677, ++ 0x8899aabb, 0xccddeeff }, ++ NULL, + }, + { +- .format = PIXMAN_g1, +- .width = 8, .height = 2, +- .stride = 4, ++ PIXMAN_g1, ++ 8, 2, ++ 4, + #ifdef WORDS_BIGENDIAN +- .src = + { + 0xaa000000, + 0x55000000 + }, + #else +- .src = + { + 0x00000055, + 0x000000aa + }, + #endif +- .dst 
= + { + 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, + 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff + }, +- .indexed = &mono_palette, ++ &mono_palette, + }, + #if 0 + { +- .format = PIXMAN_g8, +- .width = 4, .height = 2, +- .stride = 4, +- .src = { 0x01234567, +- 0x89abcdef }, +- .dst = { 0x00010101, 0x00232323, 0x00454545, 0x00676767, +- 0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, }, ++ PIXMAN_g8, ++ 4, 2, ++ 4, ++ { 0x01234567, ++ 0x89abcdef }, ++ { 0x00010101, 0x00232323, 0x00454545, 0x00676767, ++ 0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, }, + }, + #endif + /* FIXME: make this work on big endian */ + { +- .format = PIXMAN_yv12, +- .width = 8, .height = 2, +- .stride = 8, ++ PIXMAN_yv12, ++ 8, 2, ++ 8, + #ifdef WORDS_BIGENDIAN +- .src = + { + 0x00ff00ff, 0x00ff00ff, + 0xff00ff00, 0xff00ff00, +@@ -82,7 +78,6 @@ static testcase_t testcases[] = + 0x800080ff + }, + #else +- .src = + { + 0xff00ff00, 0xff00ff00, + 0x00ff00ff, 0x00ff00ff, +@@ -90,7 +85,6 @@ static testcase_t testcases[] = + 0xff800080 + }, + #endif +- .dst = + { + 0xff000000, 0xffffffff, 0xffb80000, 0xffffe113, + 0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff, +diff --git a/test/trap-crasher.c b/test/trap-crasher.c +index 42b82f6..7485e62 100644 +--- a/test/trap-crasher.c ++++ b/test/trap-crasher.c +@@ -7,21 +7,21 @@ main() + pixman_image_t *dst; + pixman_trapezoid_t traps[1] = { + { +- .top = 2147483646, +- .bottom = 2147483647, +- .left = { +- .p1 = { .x = 0, .y = 0 }, +- .p2 = { .x = 0, .y = 2147483647 } ++ 2147483646, ++ 2147483647, ++ { ++ { 0, 0 }, ++ { 0, 2147483647 } + }, +- .right = { +- .p1 = { .x = 65536, .y = 0 }, +- .p2 = { .x = 0, .y = 2147483647 } ++ { ++ { 65536, 0 }, ++ { 0, 2147483647 } + } + }, + }; +- ++ + dst = pixman_image_create_bits (PIXMAN_a8, 1, 1, NULL, -1); +- ++ + pixman_add_trapezoids (dst, 0, 0, sizeof (traps)/sizeof (traps[0]), traps); + return (0); + } +-- 
+1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0004-test-Add-Makefile-for-Win32.patch b/recipes/xorg-lib/pixman-0.21.6/0004-test-Add-Makefile-for-Win32.patch new file mode 100644 index 0000000000..94ed0b4308 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0004-test-Add-Makefile-for-Win32.patch @@ -0,0 +1,92 @@ +From 72f5e5f608506c18c484bc5bc3e58bd83aeb7691 Mon Sep 17 00:00:00 2001 +From: Andrea Canciani <ranma42@gmail.com> +Date: Tue, 22 Feb 2011 22:04:49 +0100 +Subject: [PATCH 04/40] test: Add Makefile for Win32 + +--- + test/Makefile.win32 | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 73 insertions(+), 0 deletions(-) + create mode 100644 test/Makefile.win32 + +diff --git a/test/Makefile.win32 b/test/Makefile.win32 +new file mode 100644 +index 0000000..c71afe1 +--- /dev/null ++++ b/test/Makefile.win32 +@@ -0,0 +1,73 @@ ++CC = cl ++LINK = link ++ ++CFG_VAR = $(CFG) ++ifeq ($(CFG_VAR),) ++CFG_VAR=release ++endif ++ ++CFLAGS = -MD -nologo -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -D_BIND_TO_CURRENT_VCLIBS_VERSION -D_MT -I../pixman -I. 
-I../ ++TEST_LDADD = ../pixman/$(CFG_VAR)/pixman-1.lib ++INCLUDES = -I../pixman -I$(top_builddir)/pixman ++ ++# optimization flags ++ifeq ($(CFG_VAR),debug) ++CFLAGS += -Od -Zi ++else ++CFLAGS += -O2 ++endif ++ ++SOURCES = \ ++ a1-trap-test.c \ ++ pdf-op-test.c \ ++ region-test.c \ ++ region-translate-test.c \ ++ fetch-test.c \ ++ oob-test.c \ ++ trap-crasher.c \ ++ alpha-loop.c \ ++ scaling-crash-test.c \ ++ gradient-crash-test.c \ ++ alphamap.c \ ++ stress-test.c \ ++ composite-traps-test.c \ ++ blitters-test.c \ ++ scaling-test.c \ ++ affine-test.c \ ++ composite.c \ ++ utils.c ++ ++TESTS = \ ++ $(CFG_VAR)/a1-trap-test.exe \ ++ $(CFG_VAR)/pdf-op-test.exe \ ++ $(CFG_VAR)/region-test.exe \ ++ $(CFG_VAR)/region-translate-test.exe \ ++ $(CFG_VAR)/fetch-test.exe \ ++ $(CFG_VAR)/oob-test.exe \ ++ $(CFG_VAR)/trap-crasher.exe \ ++ $(CFG_VAR)/alpha-loop.exe \ ++ $(CFG_VAR)/scaling-crash-test.exe \ ++ $(CFG_VAR)/gradient-crash-test.exe \ ++ $(CFG_VAR)/alphamap.exe \ ++ $(CFG_VAR)/stress-test.exe \ ++ $(CFG_VAR)/composite-traps-test.exe \ ++ $(CFG_VAR)/blitters-test.exe \ ++ $(CFG_VAR)/scaling-test.exe \ ++ $(CFG_VAR)/affine-test.exe \ ++ $(CFG_VAR)/composite.exe ++ ++ ++OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES)) ++ ++$(CFG_VAR)/%.obj: %.c ++ @mkdir -p $(CFG_VAR) ++ @$(CC) -c $(CFLAGS) -Fo"$@" $< ++ ++$(CFG_VAR)/%.exe: $(CFG_VAR)/%.obj ++ $(LINK) /NOLOGO /OUT:$@ $< $(CFG_VAR)/utils.obj $(TEST_LDADD) ++ ++all: $(OBJECTS) $(TESTS) ++ @exit 0 ++ ++clean: ++ @rm -f $(CFG_VAR)/*.obj $(CFG_VAR)/*.pdb || exit 0 +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0005-Do-not-include-unused-headers.patch b/recipes/xorg-lib/pixman-0.21.6/0005-Do-not-include-unused-headers.patch new file mode 100644 index 0000000000..60f9528aa4 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0005-Do-not-include-unused-headers.patch @@ -0,0 +1,40 @@ +From 8868778ea1fdc8e70da76b3b00ea78106c5840d8 Mon Sep 17 00:00:00 2001 +From: Andrea Canciani <ranma42@gmail.com> +Date: Tue, 22 
Feb 2011 22:43:48 +0100 +Subject: [PATCH 05/40] Do not include unused headers + +pixman-combine32.h is included without being used both in +pixman-image.c and in pixman-general.c. +--- + pixman/pixman-general.c | 2 -- + pixman/pixman-image.c | 1 - + 2 files changed, 0 insertions(+), 3 deletions(-) + +diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c +index 16ea3a4..872fb7e 100644 +--- a/pixman/pixman-general.c ++++ b/pixman/pixman-general.c +@@ -36,8 +36,6 @@ + #include <stdlib.h> + #include <string.h> + #include "pixman-private.h" +-#include "pixman-combine32.h" +-#include "pixman-private.h" + + static void + general_src_iter_init (pixman_implementation_t *imp, +diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c +index 9103ca6..84bacf8 100644 +--- a/pixman/pixman-image.c ++++ b/pixman/pixman-image.c +@@ -30,7 +30,6 @@ + #include <assert.h> + + #include "pixman-private.h" +-#include "pixman-combine32.h" + + pixman_bool_t + _pixman_init_gradient (gradient_t * gradient, +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0006-test-Silence-MSVC-warnings.patch b/recipes/xorg-lib/pixman-0.21.6/0006-test-Silence-MSVC-warnings.patch new file mode 100644 index 0000000000..80d7943977 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0006-test-Silence-MSVC-warnings.patch @@ -0,0 +1,63 @@ +From 9ebde285fa990bfa1524f166fbfb1368c346b14a Mon Sep 17 00:00:00 2001 +From: Andrea Canciani <ranma42@gmail.com> +Date: Thu, 24 Feb 2011 12:53:39 +0100 +Subject: [PATCH 06/40] test: Silence MSVC warnings + +MSVC does not notice non-returning functions (abort() / assert(0)) +and warns about paths which end with them in non-void functions: + +c:\cygwin\home\ranma42\code\fdo\pixman\test\fetch-test.c(114) : +warning C4715: 'reader' : not all control paths return a value +c:\cygwin\home\ranma42\code\fdo\pixman\test\stress-test.c(133) : +warning C4715: 'real_reader' : not all control paths return a value +c:\cygwin\home\ranma42\code\fdo\pixman\test\composite.c(431) : 
+warning C4715: 'calc_op' : not all control paths return a value + +These warnings can be silenced by adding a return after the +termination call. +--- + test/composite.c | 1 + + test/fetch-test.c | 1 + + test/stress-test.c | 2 +- + 3 files changed, 3 insertions(+), 1 deletions(-) + +diff --git a/test/composite.c b/test/composite.c +index 08c6689..a86e5ed 100644 +--- a/test/composite.c ++++ b/test/composite.c +@@ -426,6 +426,7 @@ calc_op (pixman_op_t op, double src, double dst, double srca, double dsta) + case PIXMAN_OP_HSL_LUMINOSITY: + default: + abort(); ++ return 0; /* silence MSVC */ + } + #undef mult_chan + } +diff --git a/test/fetch-test.c b/test/fetch-test.c +index 314a072..60bc765 100644 +--- a/test/fetch-test.c ++++ b/test/fetch-test.c +@@ -110,6 +110,7 @@ reader (const void *src, int size) + return *(uint32_t *)src; + default: + assert(0); ++ return 0; /* silence MSVC */ + } + } + +diff --git a/test/stress-test.c b/test/stress-test.c +index bcbc1f8..166dc6d 100644 +--- a/test/stress-test.c ++++ b/test/stress-test.c +@@ -128,7 +128,7 @@ real_reader (const void *src, int size) + return *(uint32_t *)src; + default: + assert (0); +- break; ++ return 0; /* silence MSVC */ + } + } + +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0007-Main-loop-template-for-fast-single-pass-bilinear-sca.patch b/recipes/xorg-lib/pixman-0.21.6/0007-Main-loop-template-for-fast-single-pass-bilinear-sca.patch new file mode 100644 index 0000000000..c5dab5c31f --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0007-Main-loop-template-for-fast-single-pass-bilinear-sca.patch @@ -0,0 +1,466 @@ +From d506bf68fd0e9a1c5dd484daee70631699918387 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Mon, 21 Feb 2011 01:29:02 +0200 +Subject: [PATCH 07/40] Main loop template for fast single pass bilinear scaling + +Can be used for implementing SIMD optimized fast path +functions which work with bilinear scaled source images. 
+ +Similar to the template for nearest scaling main loop, the +following types of mask are supported: +1. no mask +2. non-scaled a8 mask with SAMPLES_COVER_CLIP flag +3. solid mask + +PAD repeat is fully supported. NONE repeat is partially +supported (right now only works if source image has alpha +channel or when alpha channel of the source image does not +have any effect on the compositing operation). +--- + pixman/pixman-fast-path.h | 432 +++++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 432 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h +index d081222..1885d47 100644 +--- a/pixman/pixman-fast-path.h ++++ b/pixman/pixman-fast-path.h +@@ -587,4 +587,436 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func) + ++/*****************************************************************************/ ++ ++/* ++ * Identify 5 zones in each scanline for bilinear scaling. Depending on ++ * whether 2 pixels to be interpolated are fetched from the image itself, ++ * from the padding area around it or from both image and padding area. 
++ */ ++static force_inline void ++bilinear_pad_repeat_get_scanline_bounds (int32_t source_image_width, ++ pixman_fixed_t vx, ++ pixman_fixed_t unit_x, ++ int32_t * left_pad, ++ int32_t * left_tz, ++ int32_t * width, ++ int32_t * right_tz, ++ int32_t * right_pad) ++{ ++ int width1 = *width, left_pad1, right_pad1; ++ int width2 = *width, left_pad2, right_pad2; ++ ++ pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x, ++ &width1, &left_pad1, &right_pad1); ++ pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1, ++ unit_x, &width2, &left_pad2, &right_pad2); ++ ++ *left_pad = left_pad2; ++ *left_tz = left_pad1 - left_pad2; ++ *right_tz = right_pad2 - right_pad1; ++ *right_pad = right_pad1; ++ *width -= *left_pad + *left_tz + *right_tz + *right_pad; ++} ++ ++/* ++ * Main loop template for single pass bilinear scaling. It needs to be ++ * provided with 'scanline_func' which should do the compositing operation. ++ * The needed function has the following prototype: ++ * ++ * scanline_func (dst_type_t * dst, ++ * const mask_type_t * mask, ++ * const src_type_t * src_top, ++ * const src_type_t * src_bottom, ++ * int32_t width, ++ * int weight_top, ++ * int weight_bottom, ++ * pixman_fixed_t vx, ++ * pixman_fixed_t unit_x, ++ * pixman_fixed_t max_vx, ++ * pixman_bool_t zero_src) ++ * ++ * Where: ++ * dst - destination scanline buffer for storing results ++ * mask - mask buffer (or single value for solid mask) ++ * src_top, src_bottom - two source scanlines ++ * width - number of pixels to process ++ * weight_top - weight of the top row for interpolation ++ * weight_bottom - weight of the bottom row for interpolation ++ * vx - initial position for fetching the first pair of ++ * pixels from the source buffer ++ * unit_x - position increment needed to move to the next pair ++ * of pixels ++ * max_vx - image size as a fixed point value, can be used for ++ * implementing NORMAL repeat (when it is supported) ++ * zero_src - boolean hint variable, 
which is set to TRUE when ++ * all source pixels are fetched from zero padding ++ * zone for NONE repeat ++ * ++ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256, ++ * but sometimes it may be less than that for NONE repeat when handling ++ * fuzzy antialiased top or bottom image edges. Also both top and ++ * bottom weight variables are guaranteed to have value in 0-255 ++ * range and can fit into unsigned byte or be used with 8-bit SIMD ++ * multiplication instructions. ++ */ ++#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ ++ dst_type_t, repeat_mode, have_mask, mask_is_solid) \ ++static void \ ++fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp, \ ++ pixman_op_t op, \ ++ pixman_image_t * src_image, \ ++ pixman_image_t * mask_image, \ ++ pixman_image_t * dst_image, \ ++ int32_t src_x, \ ++ int32_t src_y, \ ++ int32_t mask_x, \ ++ int32_t mask_y, \ ++ int32_t dst_x, \ ++ int32_t dst_y, \ ++ int32_t width, \ ++ int32_t height) \ ++{ \ ++ dst_type_t *dst_line; \ ++ mask_type_t *mask_line; \ ++ src_type_t *src_first_line; \ ++ int y1, y2; \ ++ pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \ ++ pixman_vector_t v; \ ++ pixman_fixed_t vx, vy; \ ++ pixman_fixed_t unit_x, unit_y; \ ++ int32_t left_pad, left_tz, right_tz, right_pad; \ ++ \ ++ dst_type_t *dst; \ ++ mask_type_t solid_mask; \ ++ const mask_type_t *mask = &solid_mask; \ ++ int src_stride, mask_stride, dst_stride; \ ++ \ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \ ++ if (have_mask) \ ++ { \ ++ if (mask_is_solid) \ ++ { \ ++ solid_mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format); \ ++ mask_stride = 0; \ ++ } \ ++ else \ ++ { \ ++ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t, \ ++ mask_stride, mask_line, 1); \ ++ } \ ++ } \ ++ /* pass in 0 instead of src_x and src_y because src_x and src_y 
need to be \ ++ * transformed from destination space to source space */ \ ++ PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ ++ \ ++ /* reference point is the center of the pixel */ \ ++ v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ ++ v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \ ++ v.vector[2] = pixman_fixed_1; \ ++ \ ++ if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ ++ return; \ ++ \ ++ unit_x = src_image->common.transform->matrix[0][0]; \ ++ unit_y = src_image->common.transform->matrix[1][1]; \ ++ \ ++ v.vector[0] -= pixman_fixed_1 / 2; \ ++ v.vector[1] -= pixman_fixed_1 / 2; \ ++ \ ++ vy = v.vector[1]; \ ++ \ ++ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ ++ PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ ++ { \ ++ bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x, \ ++ &left_pad, &left_tz, &width, &right_tz, &right_pad); \ ++ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ ++ { \ ++ /* PAD repeat does not need special handling for 'transition zones' and */ \ ++ /* they can be combined with 'padding zones' safely */ \ ++ left_pad += left_tz; \ ++ right_pad += right_tz; \ ++ left_tz = right_tz = 0; \ ++ } \ ++ v.vector[0] += left_pad * unit_x; \ ++ } \ ++ \ ++ while (--height >= 0) \ ++ { \ ++ int weight1, weight2; \ ++ dst = dst_line; \ ++ dst_line += dst_stride; \ ++ vx = v.vector[0]; \ ++ if (have_mask && !mask_is_solid) \ ++ { \ ++ mask = mask_line; \ ++ mask_line += mask_stride; \ ++ } \ ++ \ ++ y1 = pixman_fixed_to_int (vy); \ ++ weight2 = (vy >> 8) & 0xff; \ ++ if (weight2) \ ++ { \ ++ /* normal case, both row weights are in 0-255 range and fit unsigned byte */ \ ++ y2 = y1 + 1; \ ++ weight1 = 256 - weight2; \ ++ } \ ++ else \ ++ { \ ++ /* set both top and bottom row to the same scanline, and weights to 128+128 */ \ ++ y2 = y1; \ ++ weight1 = weight2 = 128; \ ++ } \ ++ vy += unit_y; \ ++ if 
(PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ ++ { \ ++ src_type_t *src1, *src2; \ ++ src_type_t buf1[2]; \ ++ src_type_t buf2[2]; \ ++ repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height); \ ++ repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height); \ ++ src1 = src_first_line + src_stride * y1; \ ++ src2 = src_first_line + src_stride * y2; \ ++ \ ++ if (left_pad > 0) \ ++ { \ ++ buf1[0] = buf1[1] = src1[0]; \ ++ buf2[0] = buf2[1] = src2[0]; \ ++ scanline_func (dst, mask, \ ++ buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE); \ ++ dst += left_pad; \ ++ if (have_mask && !mask_is_solid) \ ++ mask += left_pad; \ ++ } \ ++ if (width > 0) \ ++ { \ ++ scanline_func (dst, mask, \ ++ src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ ++ dst += width; \ ++ if (have_mask && !mask_is_solid) \ ++ mask += width; \ ++ } \ ++ if (right_pad > 0) \ ++ { \ ++ buf1[0] = buf1[1] = src1[src_image->bits.width - 1]; \ ++ buf2[0] = buf2[1] = src2[src_image->bits.width - 1]; \ ++ scanline_func (dst, mask, \ ++ buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE); \ ++ } \ ++ } \ ++ else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ ++ { \ ++ src_type_t *src1, *src2; \ ++ src_type_t buf1[2]; \ ++ src_type_t buf2[2]; \ ++ /* handle top/bottom zero padding by just setting weights to 0 if needed */ \ ++ if (y1 < 0) \ ++ { \ ++ weight1 = 0; \ ++ y1 = 0; \ ++ } \ ++ if (y1 >= src_image->bits.height) \ ++ { \ ++ weight1 = 0; \ ++ y1 = src_image->bits.height - 1; \ ++ } \ ++ if (y2 < 0) \ ++ { \ ++ weight2 = 0; \ ++ y2 = 0; \ ++ } \ ++ if (y2 >= src_image->bits.height) \ ++ { \ ++ weight2 = 0; \ ++ y2 = src_image->bits.height - 1; \ ++ } \ ++ src1 = src_first_line + src_stride * y1; \ ++ src2 = src_first_line + src_stride * y2; \ ++ \ ++ if (left_pad > 0) \ ++ { \ ++ buf1[0] = buf1[1] = 0; \ ++ buf2[0] = buf2[1] = 0; \ ++ scanline_func (dst, mask, \ ++ buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE); \ ++ dst += left_pad; \ ++ if (have_mask && 
!mask_is_solid) \ ++ mask += left_pad; \ ++ } \ ++ if (left_tz > 0) \ ++ { \ ++ buf1[0] = 0; \ ++ buf1[1] = src1[0]; \ ++ buf2[0] = 0; \ ++ buf2[1] = src2[0]; \ ++ scanline_func (dst, mask, \ ++ buf1, buf2, left_tz, weight1, weight2, \ ++ pixman_fixed_frac (vx), unit_x, 0, FALSE); \ ++ dst += left_tz; \ ++ if (have_mask && !mask_is_solid) \ ++ mask += left_tz; \ ++ vx += left_tz * unit_x; \ ++ } \ ++ if (width > 0) \ ++ { \ ++ scanline_func (dst, mask, \ ++ src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ ++ dst += width; \ ++ if (have_mask && !mask_is_solid) \ ++ mask += width; \ ++ vx += width * unit_x; \ ++ } \ ++ if (right_tz > 0) \ ++ { \ ++ buf1[0] = src1[src_image->bits.width - 1]; \ ++ buf1[1] = 0; \ ++ buf2[0] = src2[src_image->bits.width - 1]; \ ++ buf2[1] = 0; \ ++ scanline_func (dst, mask, \ ++ buf1, buf2, right_tz, weight1, weight2, \ ++ pixman_fixed_frac (vx), unit_x, 0, FALSE); \ ++ dst += right_tz; \ ++ if (have_mask && !mask_is_solid) \ ++ mask += right_tz; \ ++ } \ ++ if (right_pad > 0) \ ++ { \ ++ buf1[0] = buf1[1] = 0; \ ++ buf2[0] = buf2[1] = 0; \ ++ scanline_func (dst, mask, \ ++ buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE); \ ++ } \ ++ } \ ++ else \ ++ { \ ++ scanline_func (dst, mask, src_first_line + src_stride * y1, \ ++ src_first_line + src_stride * y2, width, \ ++ weight1, weight2, vx, unit_x, max_vx, FALSE); \ ++ } \ ++ } \ ++} ++ ++/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ ++#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ ++ dst_type_t, repeat_mode, have_mask, mask_is_solid) \ ++ FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\ ++ dst_type_t, repeat_mode, have_mask, mask_is_solid) ++ ++#define SCALED_BILINEAR_FLAGS \ ++ (FAST_PATH_SCALE_TRANSFORM | \ ++ FAST_PATH_NO_ALPHA_MAP | \ ++ FAST_PATH_BILINEAR_FILTER | \ ++ FAST_PATH_NO_ACCESSORS | \ ++ FAST_PATH_NARROW_FORMAT) ++ 
++#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func) \ ++ { PIXMAN_OP_ ## op, \ ++ PIXMAN_ ## s, \ ++ (SCALED_BILINEAR_FLAGS | \ ++ FAST_PATH_PAD_REPEAT | \ ++ FAST_PATH_X_UNIT_POSITIVE), \ ++ PIXMAN_null, 0, \ ++ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ ++ fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \ ++ } ++ ++#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func) \ ++ { PIXMAN_OP_ ## op, \ ++ PIXMAN_ ## s, \ ++ (SCALED_BILINEAR_FLAGS | \ ++ FAST_PATH_NONE_REPEAT | \ ++ FAST_PATH_X_UNIT_POSITIVE), \ ++ PIXMAN_null, 0, \ ++ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ ++ fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \ ++ } ++ ++#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func) \ ++ { PIXMAN_OP_ ## op, \ ++ PIXMAN_ ## s, \ ++ SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ ++ PIXMAN_null, 0, \ ++ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ ++ fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \ ++ } ++ ++#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func) \ ++ { PIXMAN_OP_ ## op, \ ++ PIXMAN_ ## s, \ ++ (SCALED_BILINEAR_FLAGS | \ ++ FAST_PATH_PAD_REPEAT | \ ++ FAST_PATH_X_UNIT_POSITIVE), \ ++ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ ++ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ ++ fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \ ++ } ++ ++#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func) \ ++ { PIXMAN_OP_ ## op, \ ++ PIXMAN_ ## s, \ ++ (SCALED_BILINEAR_FLAGS | \ ++ FAST_PATH_NONE_REPEAT | \ ++ FAST_PATH_X_UNIT_POSITIVE), \ ++ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ ++ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ ++ fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \ ++ } ++ ++#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func) \ ++ { PIXMAN_OP_ ## op, \ ++ PIXMAN_ ## s, \ ++ SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ ++ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ ++ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ ++ 
fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \ ++ } ++ ++#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func) \ ++ { PIXMAN_OP_ ## op, \ ++ PIXMAN_ ## s, \ ++ (SCALED_BILINEAR_FLAGS | \ ++ FAST_PATH_PAD_REPEAT | \ ++ FAST_PATH_X_UNIT_POSITIVE), \ ++ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ ++ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ ++ fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \ ++ } ++ ++#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func) \ ++ { PIXMAN_OP_ ## op, \ ++ PIXMAN_ ## s, \ ++ (SCALED_BILINEAR_FLAGS | \ ++ FAST_PATH_NONE_REPEAT | \ ++ FAST_PATH_X_UNIT_POSITIVE), \ ++ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ ++ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ ++ fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \ ++ } ++ ++#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func) \ ++ { PIXMAN_OP_ ## op, \ ++ PIXMAN_ ## s, \ ++ SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ ++ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ ++ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ ++ fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \ ++ } ++ ++/* Prefer the use of 'cover' variant, because it is faster */ ++#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func) \ ++ SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func), \ ++ SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func), \ ++ SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func) ++ ++#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func) \ ++ SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ ++ SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ ++ SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func) ++ ++#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func) \ ++ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \ ++ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \ ++ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func) ++ + #endif +-- +1.6.6.1 + diff --git 
a/recipes/xorg-lib/pixman-0.21.6/0008-test-check-correctness-of-bilinear_pad_repeat_get_sc.patch b/recipes/xorg-lib/pixman-0.21.6/0008-test-check-correctness-of-bilinear_pad_repeat_get_sc.patch new file mode 100644 index 0000000000..18dfcaa10f --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0008-test-check-correctness-of-bilinear_pad_repeat_get_sc.patch @@ -0,0 +1,136 @@ +From 0df43b8ae5031dd83775d00b57b6bed809db0e89 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Mon, 21 Feb 2011 02:07:09 +0200 +Subject: [PATCH 08/40] test: check correctness of 'bilinear_pad_repeat_get_scanline_bounds' + +Individual correctness check for the new bilinear scaling related +supplementary function. This test program uses a bit wider range +of input arguments, not covered by other tests. +--- + test/Makefile.am | 2 + + test/scaling-helpers-test.c | 93 +++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 95 insertions(+), 0 deletions(-) + create mode 100644 test/scaling-helpers-test.c + +diff --git a/test/Makefile.am b/test/Makefile.am +index 057e9ce..9dc7219 100644 +--- a/test/Makefile.am ++++ b/test/Makefile.am +@@ -13,6 +13,7 @@ TESTPROGRAMS = \ + trap-crasher \ + alpha-loop \ + scaling-crash-test \ ++ scaling-helpers-test \ + gradient-crash-test \ + alphamap \ + stress-test \ +@@ -33,6 +34,7 @@ alpha_loop_SOURCES = alpha-loop.c utils.c utils.h + composite_SOURCES = composite.c utils.c utils.h + gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h + stress_test_SOURCES = stress-test.c utils.c utils.h ++scaling_helpers_test_SOURCES = scaling-helpers-test.c utils.c utils.h + + # Benchmarks + +diff --git a/test/scaling-helpers-test.c b/test/scaling-helpers-test.c +new file mode 100644 +index 0000000..c186138 +--- /dev/null ++++ b/test/scaling-helpers-test.c +@@ -0,0 +1,93 @@ ++#include <config.h> ++#include <stdint.h> ++#include <stdlib.h> ++#include <stdio.h> ++#include <assert.h> ++#include "utils.h" ++#include 
"pixman-fast-path.h" ++ ++/* A trivial reference implementation for ++ * 'bilinear_pad_repeat_get_scanline_bounds' ++ */ ++static void ++bilinear_pad_repeat_get_scanline_bounds_ref (int32_t source_image_width, ++ pixman_fixed_t vx_, ++ pixman_fixed_t unit_x, ++ int32_t * left_pad, ++ int32_t * left_tz, ++ int32_t * width, ++ int32_t * right_tz, ++ int32_t * right_pad) ++{ ++ int w = *width; ++ *left_pad = 0; ++ *left_tz = 0; ++ *width = 0; ++ *right_tz = 0; ++ *right_pad = 0; ++ int64_t vx = vx_; ++ while (--w >= 0) ++ { ++ if (vx < 0) ++ { ++ if (vx + pixman_fixed_1 < 0) ++ *left_pad += 1; ++ else ++ *left_tz += 1; ++ } ++ else if (vx + pixman_fixed_1 >= pixman_int_to_fixed (source_image_width)) ++ { ++ if (vx >= pixman_int_to_fixed (source_image_width)) ++ *right_pad += 1; ++ else ++ *right_tz += 1; ++ } ++ else ++ { ++ *width += 1; ++ } ++ vx += unit_x; ++ } ++} ++ ++int ++main (void) ++{ ++ int i; ++ for (i = 0; i < 10000; i++) ++ { ++ int32_t left_pad1, left_tz1, width1, right_tz1, right_pad1; ++ int32_t left_pad2, left_tz2, width2, right_tz2, right_pad2; ++ pixman_fixed_t vx = lcg_rand_N(10000 << 16) - (3000 << 16); ++ int32_t width = lcg_rand_N(10000); ++ int32_t source_image_width = lcg_rand_N(10000) + 1; ++ pixman_fixed_t unit_x = lcg_rand_N(10 << 16) + 1; ++ width1 = width2 = width; ++ ++ bilinear_pad_repeat_get_scanline_bounds_ref (source_image_width, ++ vx, ++ unit_x, ++ &left_pad1, ++ &left_tz1, ++ &width1, ++ &right_tz1, ++ &right_pad1); ++ ++ bilinear_pad_repeat_get_scanline_bounds (source_image_width, ++ vx, ++ unit_x, ++ &left_pad2, ++ &left_tz2, ++ &width2, ++ &right_tz2, ++ &right_pad2); ++ ++ assert (left_pad1 == left_pad2); ++ assert (left_tz1 == left_tz2); ++ assert (width1 == width2); ++ assert (right_tz1 == right_tz2); ++ assert (right_pad1 == right_pad2); ++ } ++ ++ return 0; ++} +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch 
b/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch new file mode 100644 index 0000000000..b85f78169c --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch @@ -0,0 +1,156 @@ +From 350029396d911941591149cc82b5e68a78ad6747 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Mon, 21 Feb 2011 20:18:02 +0200 +Subject: [PATCH 09/40] SSE2 optimization for bilinear scaled 'src_8888_8888' + +A primitive naive implementation of bilinear scaling using SSE2 intrinsics, +which only handles one pixel at a time. It is approximately 2x faster than +pixman general compositing path. Single pass processing without intermediate +temporary buffer contributes to ~15% and loop unrolling contributes to ~20% +of this speedup. + +Benchmark on Intel Core i7 (x86-64): + Using cairo-perf-trace: + before: image firefox-planet-gnome 12.566 12.610 0.23% 6/6 + after: image firefox-planet-gnome 10.961 11.013 0.19% 5/6 + + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s + after: op=1, src=20028888, dst=20028888, speed=165.38 MPix/s +--- + pixman/pixman-sse2.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 112 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c +index 88287b4..696005f 100644 +--- a/pixman/pixman-sse2.c ++++ b/pixman/pixman-sse2.c +@@ -5567,6 +5567,114 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) + ++static void ++bilinear_interpolate_line_sse2 (uint32_t * out, ++ const uint32_t * top, ++ const uint32_t * bottom, ++ int wt, ++ int wb, ++ pixman_fixed_t x, ++ pixman_fixed_t ux, ++ int width) ++{ ++ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); ++ const __m128i 
xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); ++ const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff); ++ const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); ++ const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux); ++ const __m128i xmm_zero = _mm_setzero_si128 (); ++ __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x); ++ uint32_t pix1, pix2, pix3, pix4; ++ ++ #define INTERPOLATE_ONE_PIXEL(pix) \ ++ do { \ ++ __m128i xmm_wh, xmm_lo, xmm_hi, a; \ ++ /* fetch 2x2 pixel block into sse2 register */ \ ++ uint32_t tl = top [pixman_fixed_to_int (x)]; \ ++ uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \ ++ uint32_t bl = bottom [pixman_fixed_to_int (x)]; \ ++ uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \ ++ a = _mm_set_epi32 (tr, tl, br, bl); \ ++ x += ux; \ ++ /* vertical interpolation */ \ ++ a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \ ++ xmm_wt), \ ++ _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \ ++ xmm_wb)); \ ++ /* calculate horizontal weights */ \ ++ xmm_wh = _mm_add_epi16 (xmm_addc, \ ++ _mm_xor_si128 (xmm_xorc, \ ++ _mm_srli_epi16 (xmm_x, 8))); \ ++ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ ++ /* horizontal interpolation */ \ ++ xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ ++ xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ ++ a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ ++ _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ ++ /* shift and pack the result */ \ ++ a = _mm_srli_epi32 (a, 16); \ ++ a = _mm_packs_epi32 (a, a); \ ++ a = _mm_packus_epi16 (a, a); \ ++ pix = _mm_cvtsi128_si32 (a); \ ++ } while (0) ++ ++ while ((width -= 4) >= 0) ++ { ++ INTERPOLATE_ONE_PIXEL (pix1); ++ INTERPOLATE_ONE_PIXEL (pix2); ++ INTERPOLATE_ONE_PIXEL (pix3); ++ INTERPOLATE_ONE_PIXEL (pix4); ++ *out++ = pix1; ++ *out++ = pix2; ++ *out++ = pix3; ++ *out++ = pix4; ++ } ++ if (width & 2) ++ { ++ INTERPOLATE_ONE_PIXEL (pix1); ++ INTERPOLATE_ONE_PIXEL (pix2); ++ *out++ = pix1; ++ *out++ = pix2; 
++ } ++ if (width & 1) ++ { ++ INTERPOLATE_ONE_PIXEL (pix1); ++ *out = pix1; ++ } ++ ++ #undef INTERPOLATE_ONE_PIXEL ++} ++ ++static force_inline void ++scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, ++ const uint32_t * mask, ++ const uint32_t * src_top, ++ const uint32_t * src_bottom, ++ int32_t w, ++ int wt, ++ int wb, ++ pixman_fixed_t vx, ++ pixman_fixed_t unit_x, ++ pixman_fixed_t max_vx, ++ pixman_bool_t zero_src) ++{ ++ bilinear_interpolate_line_sse2 (dst, src_top, src_bottom, ++ wt, wb, vx, unit_x, w); ++} ++ ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ COVER, FALSE, FALSE) ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ PAD, FALSE, FALSE) ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ NONE, FALSE, FALSE) ++ + static const pixman_fast_path_t sse2_fast_paths[] = + { + /* PIXMAN_OP_OVER */ +@@ -5668,6 +5776,10 @@ static const pixman_fast_path_t sse2_fast_paths[] = + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + ++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), ++ + { PIXMAN_OP_NONE }, + }; + +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch b/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch new file mode 100644 index 0000000000..4d411625ae --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch @@ -0,0 
+1,288 @@ +From 17feaa9c50bb8521b0366345efe181bd99754957 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Tue, 22 Feb 2011 18:45:03 +0200 +Subject: [PATCH 10/40] ARM: NEON optimization for bilinear scaled 'src_8888_8888' + +Initial NEON optimization for bilinear scaling. Can be probably +improved more. + +Benchmark on ARM Cortex-A8: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=6.70 MPix/s + after: op=1, src=20028888, dst=20028888, speed=44.27 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 197 ++++++++++++++++++++++++++++++++++++++++++ + pixman/pixman-arm-neon.c | 45 ++++++++++ + 2 files changed, 242 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 47daf45..c168e10 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2391,3 +2391,200 @@ generate_composite_function_nearest_scanline \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++/* Supplementary macro for setting function attributes */ ++.macro pixman_asm_function fname ++ .func fname ++ .global fname ++#ifdef __ELF__ ++ .hidden fname ++ .type fname, %function ++#endif ++fname: ++.endm ++ ++.macro bilinear_interpolate_last_pixel ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d0}, [TMP1] ++ vshr.u16 d30, d24, #8 ++ vld1.32 {d1}, [TMP2] ++ vmull.u8 q1, d0, d28 ++ vmlal.u8 q1, d1, d29 ++ /* 5 cycles bubble */ ++ vshll.u16 q0, d2, #8 ++ vmlsl.u16 q0, d2, d30 ++ vmlal.u16 q0, d3, d30 ++ /* 5 cycles bubble */ ++ vshrn.u32 d0, q0, #16 ++ /* 3 cycles bubble */ ++ vmovn.u16 d0, q0 ++ /* 1 cycle bubble */ ++ vst1.32 {d0[0]}, [OUT, :32]! 
++.endm ++ ++.macro bilinear_interpolate_two_pixels ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d0}, [TMP1] ++ vld1.32 {d1}, [TMP2] ++ vmull.u8 q1, d0, d28 ++ vmlal.u8 q1, d1, d29 ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d20}, [TMP1] ++ vld1.32 {d21}, [TMP2] ++ vmull.u8 q11, d20, d28 ++ vmlal.u8 q11, d21, d29 ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ vshll.u16 q0, d2, #8 ++ vmlsl.u16 q0, d2, d30 ++ vmlal.u16 q0, d3, d30 ++ vshll.u16 q10, d22, #8 ++ vmlsl.u16 q10, d22, d31 ++ vmlal.u16 q10, d23, d31 ++ vshrn.u32 d30, q0, #16 ++ vshrn.u32 d31, q10, #16 ++ vmovn.u16 d0, q15 ++ vst1.32 {d0}, [OUT]! ++.endm ++ ++.macro bilinear_interpolate_four_pixels ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d0}, [TMP1] ++ vld1.32 {d1}, [TMP2] ++ vmull.u8 q1, d0, d28 ++ vmlal.u8 q1, d1, d29 ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d20}, [TMP1] ++ vld1.32 {d21}, [TMP2] ++ vmull.u8 q11, d20, d28 ++ vmlal.u8 q11, d21, d29 ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ vshll.u16 q0, d2, #8 ++ vmlsl.u16 q0, d2, d30 ++ vmlal.u16 q0, d3, d30 ++ vshll.u16 q10, d22, #8 ++ vmlsl.u16 q10, d22, d31 ++ vmlal.u16 q10, d23, d31 ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d4}, [TMP1] ++ vld1.32 {d5}, [TMP2] ++ vmull.u8 q3, d4, d28 ++ vmlal.u8 q3, d5, d29 ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d16}, [TMP1] ++ vld1.32 {d17}, [TMP2] ++ vmull.u8 q9, d16, d28 ++ vmlal.u8 q9, d17, d29 ++ vshr.u16 q15, 
q12, #8 ++ vadd.u16 q12, q12, q13 ++ vshll.u16 q2, d6, #8 ++ vmlsl.u16 q2, d6, d30 ++ vmlal.u16 q2, d7, d30 ++ vshll.u16 q8, d18, #8 ++ vmlsl.u16 q8, d18, d31 ++ vmlal.u16 q8, d19, d31 ++ vshrn.u32 d0, q0, #16 ++ vshrn.u32 d1, q10, #16 ++ vshrn.u32 d4, q2, #16 ++ vshrn.u32 d5, q8, #16 ++ vmovn.u16 d0, q0 ++ vmovn.u16 d1, q2 ++ vst1.32 {d0, d1}, [OUT]! ++.endm ++ ++ ++/* ++ * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out, ++ * const uint32_t * top, ++ * const uint32_t * bottom, ++ * int wt, ++ * int wb, ++ * pixman_fixed_t x, ++ * pixman_fixed_t ux, ++ * int width) ++ */ ++ ++pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon ++ OUT .req r0 ++ TOP .req r1 ++ BOTTOM .req r2 ++ WT .req r3 ++ WB .req r4 ++ X .req r5 ++ UX .req r6 ++ WIDTH .req ip ++ TMP1 .req r3 ++ TMP2 .req r4 ++ ++ mov ip, sp ++ push {r4, r5, r6, r7} ++ ldmia ip, {WB, X, UX, WIDTH} ++ ++ cmp WIDTH, #0 ++ ble 3f ++ vdup.u16 q12, X ++ vdup.u16 q13, UX ++ vdup.u8 d28, WT ++ vdup.u8 d29, WB ++ vadd.u16 d25, d25, d26 ++ vadd.u16 q13, q13, q13 ++ ++ subs WIDTH, WIDTH, #4 ++ blt 1f ++0: ++ bilinear_interpolate_four_pixels ++ subs WIDTH, WIDTH, #4 ++ bge 0b ++1: ++ tst WIDTH, #2 ++ beq 2f ++ bilinear_interpolate_two_pixels ++2: ++ tst WIDTH, #1 ++ beq 3f ++ bilinear_interpolate_last_pixel ++3: ++ pop {r4, r5, r6, r7} ++ bx lr ++ ++ .unreq OUT ++ .unreq TOP ++ .unreq BOTTOM ++ .unreq WT ++ .unreq WB ++ .unreq X ++ .unreq UX ++ .unreq WIDTH ++ .unreq TMP1 ++ .unreq TMP2 ++.endfunc +diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c +index 3e0c0d1..c7c0254 100644 +--- a/pixman/pixman-arm-neon.c ++++ b/pixman/pixman-arm-neon.c +@@ -232,6 +232,47 @@ pixman_blt_neon (uint32_t *src_bits, + } + } + ++void ++pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out, ++ const uint32_t * top, ++ const uint32_t * bottom, ++ int wt, ++ int wb, ++ pixman_fixed_t x, ++ pixman_fixed_t ux, ++ int width); ++ ++static force_inline void 
++scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst, ++ const uint32_t * mask, ++ const uint32_t * src_top, ++ const uint32_t * src_bottom, ++ int32_t w, ++ int wt, ++ int wb, ++ pixman_fixed_t vx, ++ pixman_fixed_t unit_x, ++ pixman_fixed_t max_vx, ++ pixman_bool_t zero_src) ++{ ++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, ++ src_bottom, wt, wb, ++ vx, unit_x, w); ++} ++ ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC, ++ scaled_bilinear_scanline_neon_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ COVER, FALSE, FALSE) ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC, ++ scaled_bilinear_scanline_neon_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ PAD, FALSE, FALSE) ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC, ++ scaled_bilinear_scanline_neon_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ NONE, FALSE, FALSE) ++ + static const pixman_fast_path_t arm_neon_fast_paths[] = + { + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565), +@@ -343,6 +384,10 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565), + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565), + ++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888), ++ + { PIXMAN_OP_NONE }, + }; + +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0011-test-In-image_endian_swap-use-pixman_image_get_forma.patch b/recipes/xorg-lib/pixman-0.21.6/0011-test-In-image_endian_swap-use-pixman_image_get_forma.patch new file mode 100644 index 0000000000..97618606b1 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0011-test-In-image_endian_swap-use-pixman_image_get_forma.patch @@ -0,0 +1,156 @@ +From 84f3c5a71a2de1a96dcf0c7f9ab0a8ee1b1b158f 
Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?S=C3=B8ren=20Sandmann=20Pedersen?= <ssp@redhat.com> +Date: Mon, 7 Mar 2011 13:45:54 -0500 +Subject: [PATCH 11/40] test: In image_endian_swap() use pixman_image_get_format() to get the bpp. + +There is no reason to pass in the bpp as an argument; it can be gotten +directly from the image. +--- + test/affine-test.c | 6 +++--- + test/blitters-test.c | 4 ++-- + test/composite-traps-test.c | 2 +- + test/scaling-test.c | 6 +++--- + test/utils.c | 9 +++++++-- + test/utils.h | 2 +- + 6 files changed, 17 insertions(+), 12 deletions(-) + +diff --git a/test/affine-test.c b/test/affine-test.c +index b7a1fa6..ed8000c 100644 +--- a/test/affine-test.c ++++ b/test/affine-test.c +@@ -95,8 +95,8 @@ test_composite (int testnum, + dst_img = pixman_image_create_bits ( + dst_fmt, dst_width, dst_height, dstbuf, dst_stride); + +- image_endian_swap (src_img, src_bpp * 8); +- image_endian_swap (dst_img, dst_bpp * 8); ++ image_endian_swap (src_img); ++ image_endian_swap (dst_img); + + pixman_transform_init_identity (&transform); + +@@ -251,7 +251,7 @@ test_composite (int testnum, + dstbuf[i] &= 0xFFFFFF; + } + +- image_endian_swap (dst_img, dst_bpp * 8); ++ image_endian_swap (dst_img); + + if (verbose) + { +diff --git a/test/blitters-test.c b/test/blitters-test.c +index 42181ef..63e7cb3 100644 +--- a/test/blitters-test.c ++++ b/test/blitters-test.c +@@ -61,7 +61,7 @@ create_random_image (pixman_format_code_t *allowed_formats, + pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)])); + } + +- image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt)); ++ image_endian_swap (img); + + if (used_fmt) *used_fmt = fmt; + return img; +@@ -101,7 +101,7 @@ free_random_image (uint32_t initcrc, + /* swap endiannes in order to provide identical results on both big + * and litte endian systems + */ +- image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt)); ++ image_endian_swap (img); + crc32 = compute_crc32 (initcrc, data, stride * height); + } + +diff --git 
a/test/composite-traps-test.c b/test/composite-traps-test.c +index 8f32778..298537d 100644 +--- a/test/composite-traps-test.c ++++ b/test/composite-traps-test.c +@@ -218,7 +218,7 @@ test_composite (int testnum, + dst_bits[i] &= 0xFFFFFF; + } + +- image_endian_swap (dst_img, dst_bpp * 8); ++ image_endian_swap (dst_img); + + if (verbose) + { +diff --git a/test/scaling-test.c b/test/scaling-test.c +index dbb9d39..82370f7 100644 +--- a/test/scaling-test.c ++++ b/test/scaling-test.c +@@ -140,8 +140,8 @@ test_composite (int testnum, + dst_img = pixman_image_create_bits ( + dst_fmt, dst_width, dst_height, dstbuf, dst_stride); + +- image_endian_swap (src_img, src_bpp * 8); +- image_endian_swap (dst_img, dst_bpp * 8); ++ image_endian_swap (src_img); ++ image_endian_swap (dst_img); + + if (lcg_rand_n (4) > 0) + { +@@ -330,7 +330,7 @@ test_composite (int testnum, + dstbuf[i] &= 0xFFFFFF; + } + +- image_endian_swap (dst_img, dst_bpp * 8); ++ image_endian_swap (dst_img); + + if (verbose) + { +diff --git a/test/utils.c b/test/utils.c +index 2f21398..4bf02e1 100644 +--- a/test/utils.c ++++ b/test/utils.c +@@ -133,11 +133,12 @@ compute_crc32 (uint32_t in_crc32, + /* perform endian conversion of pixel data + */ + void +-image_endian_swap (pixman_image_t *img, int bpp) ++image_endian_swap (pixman_image_t *img) + { + int stride = pixman_image_get_stride (img); + uint32_t *data = pixman_image_get_data (img); + int height = pixman_image_get_height (img); ++ int bpp = PIXMAN_FORMAT_BPP (pixman_image_get_format (img)); + int i, j; + + /* swap bytes only on big endian systems */ +@@ -145,10 +146,13 @@ image_endian_swap (pixman_image_t *img, int bpp) + if (*(volatile uint8_t *)&endian_check_var != 0x12) + return; + ++ if (bpp == 8) ++ return; ++ + for (i = 0; i < height; i++) + { + uint8_t *line_data = (uint8_t *)data + stride * i; +- /* swap bytes only for 16, 24 and 32 bpp for now */ ++ + switch (bpp) + { + case 1: +@@ -208,6 +212,7 @@ image_endian_swap (pixman_image_t *img, int bpp) + } 
+ break; + default: ++ assert (FALSE); + break; + } + } +diff --git a/test/utils.h b/test/utils.h +index 9c7bdb1..a5183f7 100644 +--- a/test/utils.h ++++ b/test/utils.h +@@ -60,7 +60,7 @@ compute_crc32 (uint32_t in_crc32, + /* perform endian conversion of pixel data + */ + void +-image_endian_swap (pixman_image_t *img, int bpp); ++image_endian_swap (pixman_image_t *img); + + /* Allocate memory that is bounded by protected pages, + * so that out-of-bounds access will cause segfaults +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0012-test-Do-endian-swapping-of-the-source-and-destinatio.patch b/recipes/xorg-lib/pixman-0.21.6/0012-test-Do-endian-swapping-of-the-source-and-destinatio.patch new file mode 100644 index 0000000000..9fc4cdde07 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0012-test-Do-endian-swapping-of-the-source-and-destinatio.patch @@ -0,0 +1,36 @@ +From 84e361c8e357e26f299213fbeefe64c73447b116 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?S=C3=B8ren=20Sandmann=20Pedersen?= <ssp@redhat.com> +Date: Fri, 4 Mar 2011 15:51:18 -0500 +Subject: [PATCH 12/40] test: Do endian swapping of the source and destination images. + +Otherwise the test fails on big endian. Fix for bug 34767, reported by +Siarhei Siamashka. 
+--- + test/composite-traps-test.c | 4 ++++ + 1 files changed, 4 insertions(+), 0 deletions(-) + +diff --git a/test/composite-traps-test.c b/test/composite-traps-test.c +index 298537d..cf30281 100644 +--- a/test/composite-traps-test.c ++++ b/test/composite-traps-test.c +@@ -139,6 +139,8 @@ test_composite (int testnum, + pixman_image_set_source_clipping (src_img, 1); + pixman_region_fini (&clip); + } ++ ++ image_endian_swap (src_img); + } + + /* Create destination image */ +@@ -157,6 +159,8 @@ test_composite (int testnum, + + dst_img = pixman_image_create_bits ( + dst_format, dst_width, dst_height, dst_bits, dst_stride); ++ ++ image_endian_swap (dst_img); + } + + /* Create traps */ +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0013-ARM-use-prefetch-in-nearest-scaled-src_0565_0565.patch b/recipes/xorg-lib/pixman-0.21.6/0013-ARM-use-prefetch-in-nearest-scaled-src_0565_0565.patch new file mode 100644 index 0000000000..9d43404898 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0013-ARM-use-prefetch-in-nearest-scaled-src_0565_0565.patch @@ -0,0 +1,77 @@ +From bb3d1b67fd0f42ae00af811c624ea1c44541034d Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Sun, 6 Mar 2011 16:17:12 +0200 +Subject: [PATCH 13/40] ARM: use prefetch in nearest scaled 'src_0565_0565' + +Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=10020565, speed=75.02 MPix/s + after: op=1, src=10020565, dst=10020565, speed=73.63 MPix/s + +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=10020565, speed=176.12 MPix/s + after: op=1, src=10020565, dst=10020565, speed=267.50 MPix/s +--- + pixman/pixman-arm-simd-asm.S | 27 +++++++++++++++++++++++++-- + 1 files changed, 25 insertions(+), 2 deletions(-) + +diff --git 
a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S +index 7567700..dd1366d 100644 +--- a/pixman/pixman-arm-simd-asm.S ++++ b/pixman/pixman-arm-simd-asm.S +@@ -348,6 +348,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 + TMP1 .req r4 + TMP2 .req r5 + VXMASK .req r6 ++ PF_OFFS .req r7 + + ldr UNIT_X, [sp] + push {r4, r5, r6, r7} +@@ -366,12 +367,33 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 + strh TMP2, [DST], #2 + .endm + ++ /* ++ * stop prefetch before reaching the end of scanline (a good behaving ++ * value selected based on some benchmarks with short scanlines) ++ */ ++ #define PREFETCH_BRAKING_DISTANCE 32 ++ + /* now do the scaling */ + and TMP1, VXMASK, VX, lsr #15 + add VX, VX, UNIT_X +- subs W, #4 ++ subs W, #(8 + PREFETCH_BRAKING_DISTANCE) ++ blt 2f ++ /* set prefetch distance to 80 pixels ahead */ ++ add PF_OFFS, VX, UNIT_X, lsl #6 ++ add PF_OFFS, PF_OFFS, UNIT_X, lsl #4 ++1: /* main loop, process 8 pixels per iteration with prefetch */ ++ subs W, W, #8 ++ add PF_OFFS, UNIT_X, lsl #3 ++ scale_2_pixels ++ scale_2_pixels ++ scale_2_pixels ++ scale_2_pixels ++ pld [SRC, PF_OFFS, lsr #15] ++ bge 1b ++2: ++ subs W, #(4 - 8 - PREFETCH_BRAKING_DISTANCE) + blt 2f +-1: /* main loop, process 4 pixels per iteration */ ++1: /* process the remaining pixels */ + scale_2_pixels + scale_2_pixels + subs W, W, #4 +@@ -394,6 +416,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 + .unreq TMP1 + .unreq TMP2 + .unreq VXMASK ++ .unreq PF_OFFS + /* return */ + pop {r4, r5, r6, r7} + bx lr +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0014-ARM-common-macro-for-nearest-scaling-fast-paths.patch b/recipes/xorg-lib/pixman-0.21.6/0014-ARM-common-macro-for-nearest-scaling-fast-paths.patch new file mode 100644 index 0000000000..115d5170c6 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0014-ARM-common-macro-for-nearest-scaling-fast-paths.patch @@ -0,0 +1,131 @@ 
+From f3e17872f5522e25da8e32de83e62bee8cc198d7 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Mon, 7 Mar 2011 03:10:43 +0200 +Subject: [PATCH 14/40] ARM: common macro for nearest scaling fast paths + +The code of nearest scaled 'src_0565_0565' function was generalized +and moved to a common macro, so that it can be reused for other +fast paths. +--- + pixman/pixman-arm-simd-asm.S | 60 +++++++++++++++++++++++++---------------- + 1 files changed, 36 insertions(+), 24 deletions(-) + +diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S +index dd1366d..a9775e2 100644 +--- a/pixman/pixman-arm-simd-asm.S ++++ b/pixman/pixman-arm-simd-asm.S +@@ -331,15 +331,29 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 + .endfunc + + /* +- * Note: This function is only using armv4t instructions (not even armv6), ++ * Note: This code is only using armv5te instructions (not even armv6), + * but is scheduled for ARM Cortex-A8 pipeline. So it might need to + * be split into a few variants, tuned for each microarchitecture. + * + * TODO: In order to get good performance on ARM9/ARM11 cores (which don't + * have efficient write combining), it needs to be changed to use 16-byte + * aligned writes using STM instruction. 
++ * ++ * Nearest scanline scaler macro template uses the following arguments: ++ * fname - name of the function to generate ++ * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes ++ * t - type suffix for LDR/STR instructions ++ * prefetch_distance - prefetch in the source image by that many ++ * pixels ahead ++ * prefetch_braking_distance - stop prefetching when that many pixels are ++ * remaining before the end of scanline + */ +-pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 ++ ++.macro generate_nearest_scanline_func fname, bpp_shift, t, \ ++ prefetch_distance, \ ++ prefetch_braking_distance ++ ++pixman_asm_function fname + W .req r0 + DST .req r1 + SRC .req r2 +@@ -352,35 +366,29 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 + + ldr UNIT_X, [sp] + push {r4, r5, r6, r7} +- mvn VXMASK, #1 ++ mvn VXMASK, #((1 << bpp_shift) - 1) + + /* define helper macro */ + .macro scale_2_pixels +- ldrh TMP1, [SRC, TMP1] +- and TMP2, VXMASK, VX, lsr #15 ++ ldr&t TMP1, [SRC, TMP1] ++ and TMP2, VXMASK, VX, lsr #(16 - bpp_shift) + add VX, VX, UNIT_X +- strh TMP1, [DST], #2 ++ str&t TMP1, [DST], #(1 << bpp_shift) + +- ldrh TMP2, [SRC, TMP2] +- and TMP1, VXMASK, VX, lsr #15 ++ ldr&t TMP2, [SRC, TMP2] ++ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) + add VX, VX, UNIT_X +- strh TMP2, [DST], #2 ++ str&t TMP2, [DST], #(1 << bpp_shift) + .endm + +- /* +- * stop prefetch before reaching the end of scanline (a good behaving +- * value selected based on some benchmarks with short scanlines) +- */ +- #define PREFETCH_BRAKING_DISTANCE 32 +- + /* now do the scaling */ +- and TMP1, VXMASK, VX, lsr #15 ++ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) + add VX, VX, UNIT_X +- subs W, #(8 + PREFETCH_BRAKING_DISTANCE) ++ subs W, W, #(8 + prefetch_braking_distance) + blt 2f +- /* set prefetch distance to 80 pixels ahead */ +- add PF_OFFS, VX, UNIT_X, lsl #6 +- add PF_OFFS, PF_OFFS, UNIT_X, lsl #4 ++ /* calculate prefetch offset */ ++ 
mov PF_OFFS, #prefetch_distance ++ mla PF_OFFS, UNIT_X, PF_OFFS, VX + 1: /* main loop, process 8 pixels per iteration with prefetch */ + subs W, W, #8 + add PF_OFFS, UNIT_X, lsl #3 +@@ -388,10 +396,10 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 + scale_2_pixels + scale_2_pixels + scale_2_pixels +- pld [SRC, PF_OFFS, lsr #15] ++ pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)] + bge 1b + 2: +- subs W, #(4 - 8 - PREFETCH_BRAKING_DISTANCE) ++ subs W, W, #(4 - 8 - prefetch_braking_distance) + blt 2f + 1: /* process the remaining pixels */ + scale_2_pixels +@@ -404,8 +412,8 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 + scale_2_pixels + 2: + tst W, #1 +- ldrneh TMP1, [SRC, TMP1] +- strneh TMP1, [DST], #2 ++ ldrne&t TMP1, [SRC, TMP1] ++ strne&t TMP1, [DST] + /* cleanup helper macro */ + .purgem scale_2_pixels + .unreq DST +@@ -421,3 +429,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 + pop {r4, r5, r6, r7} + bx lr + .endfunc ++.endm ++ ++generate_nearest_scanline_func \ ++ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0015-ARM-assembly-optimized-nearest-scaled-src_8888_8888.patch b/recipes/xorg-lib/pixman-0.21.6/0015-ARM-assembly-optimized-nearest-scaled-src_8888_8888.patch new file mode 100644 index 0000000000..cc3a68f06c --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0015-ARM-assembly-optimized-nearest-scaled-src_8888_8888.patch @@ -0,0 +1,60 @@ +From 5921c17639fe5fdc595c850e3347281c1c8746ba Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Sun, 6 Mar 2011 22:16:32 +0200 +Subject: [PATCH 15/40] ARM: assembly optimized nearest scaled 'src_8888_8888' + +Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=44.36 MPix/s + after: 
op=1, src=20028888, dst=20028888, speed=39.79 MPix/s + +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=102.36 MPix/s + after: op=1, src=20028888, dst=20028888, speed=163.12 MPix/s +--- + pixman/pixman-arm-simd-asm.S | 3 +++ + pixman/pixman-arm-simd.c | 9 +++++++++ + 2 files changed, 12 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S +index a9775e2..858c690 100644 +--- a/pixman/pixman-arm-simd-asm.S ++++ b/pixman/pixman-arm-simd-asm.S +@@ -433,3 +433,6 @@ pixman_asm_function fname + + generate_nearest_scanline_func \ + pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 ++ ++generate_nearest_scanline_func \ ++ pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32 +diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c +index 6bbc109..a66f8df 100644 +--- a/pixman/pixman-arm-simd.c ++++ b/pixman/pixman-arm-simd.c +@@ -389,6 +389,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888, + + PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC, + uint16_t, uint16_t) ++PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC, ++ uint32_t, uint32_t) + + static const pixman_fast_path_t arm_simd_fast_paths[] = + { +@@ -411,6 +413,13 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = + PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565), + PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565), + ++ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888), ++ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888), ++ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888), ++ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888), ++ 
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888), ++ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888), ++ + { PIXMAN_OP_NONE }, + }; + +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0016-ARM-new-bilinear-fast-path-template-macro-in-pixman-.patch b/recipes/xorg-lib/pixman-0.21.6/0016-ARM-new-bilinear-fast-path-template-macro-in-pixman-.patch new file mode 100644 index 0000000000..d8559b0c61 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0016-ARM-new-bilinear-fast-path-template-macro-in-pixman-.patch @@ -0,0 +1,130 @@ +From 66f4ee1b3bccf4516433d61dbf2035551a712fa2 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 9 Mar 2011 10:59:46 +0200 +Subject: [PATCH 16/40] ARM: new bilinear fast path template macro in 'pixman-arm-common.h' + +It can be reused in different ARM NEON bilinear scaling fast path functions. +--- + pixman/pixman-arm-common.h | 45 ++++++++++++++++++++++++++++++++++++++++++++ + pixman/pixman-arm-neon.c | 44 ++---------------------------------------- + 2 files changed, 48 insertions(+), 41 deletions(-) + +diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h +index 9b1322b..c3bf986 100644 +--- a/pixman/pixman-arm-common.h ++++ b/pixman/pixman-arm-common.h +@@ -361,4 +361,49 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + ++/*****************************************************************************/ ++ ++#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op, \ ++ src_type, dst_type) \ ++void \ ++pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ ++ dst_type * dst, \ ++ const src_type * top, \ ++ const src_type * bottom, \ ++ int wt, \ ++ int wb, \ ++ pixman_fixed_t x, \ ++ pixman_fixed_t ux, \ ++ int width); \ ++ \ ++static force_inline void \ 
++scaled_bilinear_scanline_##cputype##_##name##_##op ( \ ++ dst_type * dst, \ ++ const uint32_t * mask, \ ++ const src_type * src_top, \ ++ const src_type * src_bottom, \ ++ int32_t w, \ ++ int wt, \ ++ int wb, \ ++ pixman_fixed_t vx, \ ++ pixman_fixed_t unit_x, \ ++ pixman_fixed_t max_vx, \ ++ pixman_bool_t zero_src) \ ++{ \ ++ if ((flags & SKIP_ZERO_SRC) && zero_src) \ ++ return; \ ++ pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ ++ dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \ ++} \ ++ \ ++FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ ++ scaled_bilinear_scanline_##cputype##_##name##_##op, \ ++ src_type, uint32_t, dst_type, COVER, FALSE, FALSE) \ ++FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ ++ scaled_bilinear_scanline_##cputype##_##name##_##op, \ ++ src_type, uint32_t, dst_type, NONE, FALSE, FALSE) \ ++FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ ++ scaled_bilinear_scanline_##cputype##_##name##_##op, \ ++ src_type, uint32_t, dst_type, PAD, FALSE, FALSE) ++ + #endif +diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c +index c7c0254..98ad5f2 100644 +--- a/pixman/pixman-arm-neon.c ++++ b/pixman/pixman-arm-neon.c +@@ -127,6 +127,9 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565, + PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565, + OVER, uint16_t, uint16_t) + ++PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC, ++ uint32_t, uint32_t) ++ + void + pixman_composite_src_n_8_asm_neon (int32_t w, + int32_t h, +@@ -232,47 +235,6 @@ pixman_blt_neon (uint32_t *src_bits, + } + } + +-void +-pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out, +- const uint32_t * top, +- const uint32_t * bottom, +- int wt, +- int wb, +- pixman_fixed_t x, +- pixman_fixed_t ux, +- int width); +- +-static force_inline void +-scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst, +- const uint32_t * mask, +- 
const uint32_t * src_top, +- const uint32_t * src_bottom, +- int32_t w, +- int wt, +- int wb, +- pixman_fixed_t vx, +- pixman_fixed_t unit_x, +- pixman_fixed_t max_vx, +- pixman_bool_t zero_src) +-{ +- pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, +- src_bottom, wt, wb, +- vx, unit_x, w); +-} +- +-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC, +- scaled_bilinear_scanline_neon_8888_8888_SRC, +- uint32_t, uint32_t, uint32_t, +- COVER, FALSE, FALSE) +-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC, +- scaled_bilinear_scanline_neon_8888_8888_SRC, +- uint32_t, uint32_t, uint32_t, +- PAD, FALSE, FALSE) +-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC, +- scaled_bilinear_scanline_neon_8888_8888_SRC, +- uint32_t, uint32_t, uint32_t, +- NONE, FALSE, FALSE) +- + static const pixman_fast_path_t arm_neon_fast_paths[] = + { + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565), +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0017-ARM-NEON-common-macro-template-for-bilinear-scanline.patch b/recipes/xorg-lib/pixman-0.21.6/0017-ARM-NEON-common-macro-template-for-bilinear-scanline.patch new file mode 100644 index 0000000000..6efc40f6cb --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0017-ARM-NEON-common-macro-template-for-bilinear-scanline.patch @@ -0,0 +1,271 @@ +From 34098dba6763afd3636a14f9c2a079ab08f23b2d Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 9 Mar 2011 11:34:15 +0200 +Subject: [PATCH 17/40] ARM: NEON: common macro template for bilinear scanline scalers + +This allows to generate bilinear scanline scaling functions targeting +various source and destination color formats. Right now a8r8g8b8/x8r8g8b8 +and r5g6b5 color formats are supported. More formats can be added if needed. 
+--- + pixman/pixman-arm-neon-asm.S | 222 ++++++++++++++++++++++++++++++++++++++++++ + pixman/pixman-arm-neon-asm.h | 17 +++ + 2 files changed, 239 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index c168e10..f3784f5 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2588,3 +2588,225 @@ pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon + .unreq TMP1 + .unreq TMP2 + .endfunc ++ ++.purgem bilinear_interpolate_last_pixel ++.purgem bilinear_interpolate_two_pixels ++.purgem bilinear_interpolate_four_pixels ++ ++/* ++ * Bilinear scaling support code which tries to provide pixel fetching, color ++ * format conversion, and interpolation as separate macros which can be used ++ * as the basic building blocks for constructing bilinear scanline functions. ++ */ ++ ++.macro bilinear_load_8888 reg1, reg2, tmp ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP2, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {reg1}, [TMP1] ++ vld1.32 {reg2}, [TMP2] ++.endm ++ ++.macro bilinear_load_0565 reg1, reg2, tmp ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP2, asl #1 ++ add TMP2, BOTTOM, TMP2, asl #1 ++ vld1.32 {reg2[0]}, [TMP1] ++ vld1.32 {reg2[1]}, [TMP2] ++ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp ++.endm ++ ++.macro bilinear_store_8888 numpix, tmp1, tmp2 ++.if numpix == 4 ++ vst1.32 {d0, d1}, [OUT]! ++.elseif numpix == 2 ++ vst1.32 {d0}, [OUT]! ++.elseif numpix == 1 ++ vst1.32 {d0[0]}, [OUT, :32]! ++.else ++ .error bilinear_store_8888 numpix is unsupported ++.endif ++.endm ++ ++.macro bilinear_store_0565 numpix, tmp1, tmp2 ++ vuzp.u8 d0, d1 ++ vuzp.u8 d2, d3 ++ vuzp.u8 d1, d3 ++ vuzp.u8 d0, d2 ++ convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 ++.if numpix == 4 ++ vst1.16 {d2}, [OUT]! ++.elseif numpix == 2 ++ vst1.32 {d2[0]}, [OUT]! ++.elseif numpix == 1 ++ vst1.16 {d2[0]}, [OUT]! 
++.else ++ .error bilinear_store_0565 numpix is unsupported ++.endif ++.endm ++ ++.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt ++ bilinear_load_&src_fmt d0, d1, d2 ++ vmull.u8 q1, d0, d28 ++ vmlal.u8 q1, d1, d29 ++ vshr.u16 d30, d24, #8 ++ /* 4 cycles bubble */ ++ vshll.u16 q0, d2, #8 ++ vmlsl.u16 q0, d2, d30 ++ vmlal.u16 q0, d3, d30 ++ /* 5 cycles bubble */ ++ vshrn.u32 d0, q0, #16 ++ /* 3 cycles bubble */ ++ vmovn.u16 d0, q0 ++ /* 1 cycle bubble */ ++ bilinear_store_&dst_fmt 1, q2, q3 ++.endm ++ ++.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt ++ bilinear_load_&src_fmt d0, d1, d2 ++ vmull.u8 q1, d0, d28 ++ vmlal.u8 q1, d1, d29 ++ bilinear_load_&src_fmt d20, d21, d22 ++ vmull.u8 q11, d20, d28 ++ vmlal.u8 q11, d21, d29 ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ vshll.u16 q0, d2, #8 ++ vmlsl.u16 q0, d2, d30 ++ vmlal.u16 q0, d3, d30 ++ vshll.u16 q10, d22, #8 ++ vmlsl.u16 q10, d22, d31 ++ vmlal.u16 q10, d23, d31 ++ vshrn.u32 d30, q0, #16 ++ vshrn.u32 d31, q10, #16 ++ vmovn.u16 d0, q15 ++ bilinear_store_&dst_fmt 2, q2, q3 ++.endm ++ ++.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_load_&src_fmt d0, d1, d2 ++ vmull.u8 q1, d0, d28 ++ vmlal.u8 q1, d1, d29 ++ bilinear_load_&src_fmt d20, d21, d22 ++ vmull.u8 q11, d20, d28 ++ vmlal.u8 q11, d21, d29 ++ bilinear_load_&src_fmt d4, d5, d6 ++ vmull.u8 q3, d4, d28 ++ vmlal.u8 q3, d5, d29 ++ bilinear_load_&src_fmt d16, d17, d18 ++ vmull.u8 q9, d16, d28 ++ vmlal.u8 q9, d17, d29 ++ pld [TMP1, PF_OFFS] ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ vshll.u16 q0, d2, #8 ++ vmlsl.u16 q0, d2, d30 ++ vmlal.u16 q0, d3, d30 ++ vshll.u16 q10, d22, #8 ++ vmlsl.u16 q10, d22, d31 ++ vmlal.u16 q10, d23, d31 ++ vshr.u16 q15, q12, #8 ++ vshll.u16 q2, d6, #8 ++ vmlsl.u16 q2, d6, d30 ++ vmlal.u16 q2, d7, d30 ++ vshll.u16 q8, d18, #8 ++ pld [TMP2, PF_OFFS] ++ vmlsl.u16 q8, d18, d31 ++ vmlal.u16 q8, d19, d31 ++ vadd.u16 q12, q12, q13 ++ vshrn.u32 d0, q0, #16 ++ vshrn.u32 d1, q10, #16 ++ 
vshrn.u32 d4, q2, #16 ++ vshrn.u32 d5, q8, #16 ++ vmovn.u16 d0, q0 ++ vmovn.u16 d1, q2 ++ bilinear_store_&dst_fmt 4, q2, q3 ++.endm ++ ++/* ++ * Main template macro for generating NEON optimized bilinear scanline ++ * functions. ++ * ++ * TODO: use software pipelining and aligned writes to the destination buffer ++ * in order to improve performance ++ * ++ * Bilinear scanline scaler macro template uses the following arguments: ++ * fname - name of the function to generate ++ * src_fmt - source color format (8888 or 0565) ++ * dst_fmt - destination color format (8888 or 0565) ++ * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes ++ * prefetch_distance - prefetch in the source image by that many ++ * pixels ahead ++ */ ++ ++.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ ++ bpp_shift, prefetch_distance ++ ++pixman_asm_function fname ++ OUT .req r0 ++ TOP .req r1 ++ BOTTOM .req r2 ++ WT .req r3 ++ WB .req r4 ++ X .req r5 ++ UX .req r6 ++ WIDTH .req ip ++ TMP1 .req r3 ++ TMP2 .req r4 ++ PF_OFFS .req r7 ++ TMP3 .req r8 ++ TMP4 .req r9 ++ ++ mov ip, sp ++ push {r4, r5, r6, r7, r8, r9} ++ mov PF_OFFS, #prefetch_distance ++ ldmia ip, {WB, X, UX, WIDTH} ++ mul PF_OFFS, PF_OFFS, UX ++ ++ cmp WIDTH, #0 ++ ble 3f ++ ++ vdup.u16 q12, X ++ vdup.u16 q13, UX ++ vdup.u8 d28, WT ++ vdup.u8 d29, WB ++ vadd.u16 d25, d25, d26 ++ vadd.u16 q13, q13, q13 ++ ++ subs WIDTH, WIDTH, #4 ++ blt 1f ++ mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift) ++0: ++ bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ subs WIDTH, WIDTH, #4 ++ bge 0b ++1: ++ tst WIDTH, #2 ++ beq 2f ++ bilinear_interpolate_two_pixels src_fmt, dst_fmt ++2: ++ tst WIDTH, #1 ++ beq 3f ++ bilinear_interpolate_last_pixel src_fmt, dst_fmt ++3: ++ pop {r4, r5, r6, r7, r8, r9} ++ bx lr ++ ++ .unreq OUT ++ .unreq TOP ++ .unreq BOTTOM ++ .unreq WT ++ .unreq WB ++ .unreq X ++ .unreq UX ++ .unreq WIDTH ++ .unreq TMP1 ++ .unreq TMP2 ++ .unreq PF_OFFS ++ .unreq TMP3 ++ .unreq TMP4 ++.endfunc ++ ++.endm 
+diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h +index 24fa361..97adc6a 100644 +--- a/pixman/pixman-arm-neon-asm.h ++++ b/pixman/pixman-arm-neon-asm.h +@@ -1158,3 +1158,20 @@ fname: + vsri.u16 out, tmp1, #5 + vsri.u16 out, tmp2, #11 + .endm ++ ++/* ++ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels ++ * returned in (out0, out1) registers pair. Requires one temporary ++ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original ++ * value from 'in' is lost ++ */ ++.macro convert_four_0565_to_x888_packed in, out0, out1, tmp ++ vshl.u16 out0, in, #5 /* G top 6 bits */ ++ vshl.u16 tmp, in, #11 /* B top 5 bits */ ++ vsri.u16 in, in, #5 /* R is ready in top bits */ ++ vsri.u16 out0, out0, #6 /* G is ready in top bits */ ++ vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ ++ vshr.u16 out1, in, #8 /* R is in place */ ++ vsri.u16 out0, tmp, #8 /* G & B is in place */ ++ vzip.u16 out0, out1 /* everything is in place */ ++.endm +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch b/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch new file mode 100644 index 0000000000..245e536716 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch @@ -0,0 +1,226 @@ +From 11a0c5badbc59ce967707ef836313cc98f8aec4e Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 9 Mar 2011 11:46:48 +0200 +Subject: [PATCH 18/40] ARM: use common macro template for bilinear scaled 'src_8888_8888' + +This is a cleanup for old and now duplicated code. The performance improvement +is mostly coming from the enabled use of software prefetch, but instructions +scheduling is also slightly better. 
+ +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=53.24 MPix/s + after: op=1, src=20028888, dst=20028888, speed=74.36 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 191 +----------------------------------------- + 1 files changed, 3 insertions(+), 188 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index f3784f5..52dc444 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2405,194 +2405,6 @@ generate_composite_function_nearest_scanline \ + fname: + .endm + +-.macro bilinear_interpolate_last_pixel +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d0}, [TMP1] +- vshr.u16 d30, d24, #8 +- vld1.32 {d1}, [TMP2] +- vmull.u8 q1, d0, d28 +- vmlal.u8 q1, d1, d29 +- /* 5 cycles bubble */ +- vshll.u16 q0, d2, #8 +- vmlsl.u16 q0, d2, d30 +- vmlal.u16 q0, d3, d30 +- /* 5 cycles bubble */ +- vshrn.u32 d0, q0, #16 +- /* 3 cycles bubble */ +- vmovn.u16 d0, q0 +- /* 1 cycle bubble */ +- vst1.32 {d0[0]}, [OUT, :32]! 
+-.endm +- +-.macro bilinear_interpolate_two_pixels +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d0}, [TMP1] +- vld1.32 {d1}, [TMP2] +- vmull.u8 q1, d0, d28 +- vmlal.u8 q1, d1, d29 +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d20}, [TMP1] +- vld1.32 {d21}, [TMP2] +- vmull.u8 q11, d20, d28 +- vmlal.u8 q11, d21, d29 +- vshr.u16 q15, q12, #8 +- vadd.u16 q12, q12, q13 +- vshll.u16 q0, d2, #8 +- vmlsl.u16 q0, d2, d30 +- vmlal.u16 q0, d3, d30 +- vshll.u16 q10, d22, #8 +- vmlsl.u16 q10, d22, d31 +- vmlal.u16 q10, d23, d31 +- vshrn.u32 d30, q0, #16 +- vshrn.u32 d31, q10, #16 +- vmovn.u16 d0, q15 +- vst1.32 {d0}, [OUT]! +-.endm +- +-.macro bilinear_interpolate_four_pixels +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d0}, [TMP1] +- vld1.32 {d1}, [TMP2] +- vmull.u8 q1, d0, d28 +- vmlal.u8 q1, d1, d29 +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d20}, [TMP1] +- vld1.32 {d21}, [TMP2] +- vmull.u8 q11, d20, d28 +- vmlal.u8 q11, d21, d29 +- vshr.u16 q15, q12, #8 +- vadd.u16 q12, q12, q13 +- vshll.u16 q0, d2, #8 +- vmlsl.u16 q0, d2, d30 +- vmlal.u16 q0, d3, d30 +- vshll.u16 q10, d22, #8 +- vmlsl.u16 q10, d22, d31 +- vmlal.u16 q10, d23, d31 +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d4}, [TMP1] +- vld1.32 {d5}, [TMP2] +- vmull.u8 q3, d4, d28 +- vmlal.u8 q3, d5, d29 +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d16}, [TMP1] +- vld1.32 {d17}, [TMP2] +- vmull.u8 q9, d16, d28 +- vmlal.u8 q9, d17, d29 +- vshr.u16 q15, 
q12, #8 +- vadd.u16 q12, q12, q13 +- vshll.u16 q2, d6, #8 +- vmlsl.u16 q2, d6, d30 +- vmlal.u16 q2, d7, d30 +- vshll.u16 q8, d18, #8 +- vmlsl.u16 q8, d18, d31 +- vmlal.u16 q8, d19, d31 +- vshrn.u32 d0, q0, #16 +- vshrn.u32 d1, q10, #16 +- vshrn.u32 d4, q2, #16 +- vshrn.u32 d5, q8, #16 +- vmovn.u16 d0, q0 +- vmovn.u16 d1, q2 +- vst1.32 {d0, d1}, [OUT]! +-.endm +- +- +-/* +- * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out, +- * const uint32_t * top, +- * const uint32_t * bottom, +- * int wt, +- * int wb, +- * pixman_fixed_t x, +- * pixman_fixed_t ux, +- * int width) +- */ +- +-pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon +- OUT .req r0 +- TOP .req r1 +- BOTTOM .req r2 +- WT .req r3 +- WB .req r4 +- X .req r5 +- UX .req r6 +- WIDTH .req ip +- TMP1 .req r3 +- TMP2 .req r4 +- +- mov ip, sp +- push {r4, r5, r6, r7} +- ldmia ip, {WB, X, UX, WIDTH} +- +- cmp WIDTH, #0 +- ble 3f +- vdup.u16 q12, X +- vdup.u16 q13, UX +- vdup.u8 d28, WT +- vdup.u8 d29, WB +- vadd.u16 d25, d25, d26 +- vadd.u16 q13, q13, q13 +- +- subs WIDTH, WIDTH, #4 +- blt 1f +-0: +- bilinear_interpolate_four_pixels +- subs WIDTH, WIDTH, #4 +- bge 0b +-1: +- tst WIDTH, #2 +- beq 2f +- bilinear_interpolate_two_pixels +-2: +- tst WIDTH, #1 +- beq 3f +- bilinear_interpolate_last_pixel +-3: +- pop {r4, r5, r6, r7} +- bx lr +- +- .unreq OUT +- .unreq TOP +- .unreq BOTTOM +- .unreq WT +- .unreq WB +- .unreq X +- .unreq UX +- .unreq WIDTH +- .unreq TMP1 +- .unreq TMP2 +-.endfunc +- +-.purgem bilinear_interpolate_last_pixel +-.purgem bilinear_interpolate_two_pixels +-.purgem bilinear_interpolate_four_pixels +- + /* + * Bilinear scaling support code which tries to provide pixel fetching, color + * format conversion, and interpolation as separate macros which can be used +@@ -2810,3 +2622,6 @@ pixman_asm_function fname + .endfunc + + .endm ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28 +-- +1.6.6.1 + 
diff --git a/recipes/xorg-lib/pixman-0.21.6/0019-ARM-NEON-optimization-for-bilinear-scaled-src_8888_0.patch b/recipes/xorg-lib/pixman-0.21.6/0019-ARM-NEON-optimization-for-bilinear-scaled-src_8888_0.patch new file mode 100644 index 0000000000..cc1769404f --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0019-ARM-NEON-optimization-for-bilinear-scaled-src_8888_0.patch @@ -0,0 +1,51 @@ +From 2ee27e7d79637da9173ee1bf3423e5a81534ccb4 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 9 Mar 2011 11:53:04 +0200 +Subject: [PATCH 19/40] ARM: NEON optimization for bilinear scaled 'src_8888_0565' + +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=10020565, speed=6.56 MPix/s + after: op=1, src=20028888, dst=10020565, speed=61.65 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 3 +++ + pixman/pixman-arm-neon.c | 5 +++++ + 2 files changed, 8 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 52dc444..f0b42ca 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2625,3 +2625,6 @@ pixman_asm_function fname + + generate_bilinear_scanline_func \ + pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28 ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28 +diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c +index 98ad5f2..ba6de66 100644 +--- a/pixman/pixman-arm-neon.c ++++ b/pixman/pixman-arm-neon.c +@@ -129,6 +129,8 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565, + + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC, + uint32_t, uint32_t) ++PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC, ++ uint32_t, uint16_t) + + void + pixman_composite_src_n_8_asm_neon (int32_t w, +@@ -350,6 
+352,9 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888), + ++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565), ++ + { PIXMAN_OP_NONE }, + }; + +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0020-ARM-NEON-optimization-for-bilinear-scaled-src_0565_x.patch b/recipes/xorg-lib/pixman-0.21.6/0020-ARM-NEON-optimization-for-bilinear-scaled-src_0565_x.patch new file mode 100644 index 0000000000..1924b3ae02 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0020-ARM-NEON-optimization-for-bilinear-scaled-src_0565_x.patch @@ -0,0 +1,50 @@ +From 29003c3befe2159396d181ef9ac1caaadcabf382 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 9 Mar 2011 13:21:53 +0200 +Subject: [PATCH 20/40] ARM: NEON optimization for bilinear scaled 'src_0565_x888' + +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=20020888, speed=3.39 MPix/s + after: op=1, src=10020565, dst=20020888, speed=36.82 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 3 +++ + pixman/pixman-arm-neon.c | 4 ++++ + 2 files changed, 7 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index f0b42ca..9245db9 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2628,3 +2628,6 @@ generate_bilinear_scanline_func \ + + generate_bilinear_scanline_func \ + pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28 ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28 +diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c +index ba6de66..18e26eb 100644 +--- 
a/pixman/pixman-arm-neon.c ++++ b/pixman/pixman-arm-neon.c +@@ -131,6 +131,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC, + uint32_t, uint32_t) + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC, + uint32_t, uint16_t) ++PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC, ++ uint16_t, uint32_t) + + void + pixman_composite_src_n_8_asm_neon (int32_t w, +@@ -355,6 +357,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565), + ++ SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888), ++ + { PIXMAN_OP_NONE }, + }; + +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0021-ARM-NEON-optimization-for-bilinear-scaled-src_0565_0.patch b/recipes/xorg-lib/pixman-0.21.6/0021-ARM-NEON-optimization-for-bilinear-scaled-src_0565_0.patch new file mode 100644 index 0000000000..a0193d1fd6 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0021-ARM-NEON-optimization-for-bilinear-scaled-src_0565_0.patch @@ -0,0 +1,49 @@ +From fe99673719091d4a880d031add1369332a75731b Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 9 Mar 2011 13:27:41 +0200 +Subject: [PATCH 21/40] ARM: NEON optimization for bilinear scaled 'src_0565_0565' + +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=10020565, speed=3.30 MPix/s + after: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 3 +++ + pixman/pixman-arm-neon.c | 3 +++ + 2 files changed, 6 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 9245db9..2b6875b 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2631,3 +2631,6 @@ 
generate_bilinear_scanline_func \ + + generate_bilinear_scanline_func \ + pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28 ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28 +diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c +index 18e26eb..0a10ca1 100644 +--- a/pixman/pixman-arm-neon.c ++++ b/pixman/pixman-arm-neon.c +@@ -133,6 +133,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC, + uint32_t, uint16_t) + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC, + uint16_t, uint32_t) ++PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC, ++ uint16_t, uint16_t) + + void + pixman_composite_src_n_8_asm_neon (int32_t w, +@@ -358,6 +360,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565), + + SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565), + + { PIXMAN_OP_NONE }, + }; +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch b/recipes/xorg-lib/pixman-0.21.6/0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch new file mode 100644 index 0000000000..20019f45f1 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch @@ -0,0 +1,166 @@ +From 70a923882ca24664344ba91a649e7aa12c3063f7 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 9 Mar 2011 13:55:48 +0200 +Subject: [PATCH 22/40] ARM: a bit faster NEON bilinear scaling for r5g6b5 source images + +Instructions scheduling improved in the code responsible for fetching r5g6b5 +pixels and converting them to the intermediate x8r8g8b8 color format used in +the interpolation part of code. 
Still a lot of NEON stalls are remaining, +which can be resolved later by the use of pipelining. + +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s + op=1, src=10020565, dst=20020888, speed=36.82 MPix/s + after: op=1, src=10020565, dst=10020565, speed=41.35 MPix/s + op=1, src=10020565, dst=20020888, speed=49.16 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 118 +++++++++++++++++++++++++++++++++++------ + 1 files changed, 100 insertions(+), 18 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 2b6875b..71b30ac 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2430,6 +2430,101 @@ fname: + convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp + .endm + ++.macro bilinear_load_and_vertical_interpolate_two_8888 \ ++ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 ++ ++ bilinear_load_8888 reg1, reg2, tmp1 ++ vmull.u8 acc1, reg1, d28 ++ vmlal.u8 acc1, reg2, d29 ++ bilinear_load_8888 reg3, reg4, tmp2 ++ vmull.u8 acc2, reg3, d28 ++ vmlal.u8 acc2, reg4, d29 ++.endm ++ ++.macro bilinear_load_and_vertical_interpolate_four_8888 \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ ++ bilinear_load_and_vertical_interpolate_two_8888 \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi ++ bilinear_load_and_vertical_interpolate_two_8888 \ ++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++.endm ++ ++.macro bilinear_load_and_vertical_interpolate_two_0565 \ ++ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi ++ ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ mov TMP4, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP2, asl #1 ++ add TMP2, BOTTOM, TMP2, asl #1 ++ add TMP3, TOP, TMP4, asl #1 ++ add TMP4, BOTTOM, TMP4, asl #1 ++ vld1.32 {acc2lo[0]}, [TMP1] ++ vld1.32 
{acc2hi[0]}, [TMP3] ++ vld1.32 {acc2lo[1]}, [TMP2] ++ vld1.32 {acc2hi[1]}, [TMP4] ++ convert_0565_to_x888 acc2, reg3, reg2, reg1 ++ vzip.u8 reg1, reg3 ++ vzip.u8 reg2, reg4 ++ vzip.u8 reg3, reg4 ++ vzip.u8 reg1, reg2 ++ vmull.u8 acc1, reg1, d28 ++ vmlal.u8 acc1, reg2, d29 ++ vmull.u8 acc2, reg3, d28 ++ vmlal.u8 acc2, reg4, d29 ++.endm ++ ++.macro bilinear_load_and_vertical_interpolate_four_0565 \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ mov TMP4, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP2, asl #1 ++ add TMP2, BOTTOM, TMP2, asl #1 ++ add TMP3, TOP, TMP4, asl #1 ++ add TMP4, BOTTOM, TMP4, asl #1 ++ vld1.32 {xacc2lo[0]}, [TMP1] ++ vld1.32 {xacc2hi[0]}, [TMP3] ++ vld1.32 {xacc2lo[1]}, [TMP2] ++ vld1.32 {xacc2hi[1]}, [TMP4] ++ convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ mov TMP4, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP2, asl #1 ++ add TMP2, BOTTOM, TMP2, asl #1 ++ add TMP3, TOP, TMP4, asl #1 ++ add TMP4, BOTTOM, TMP4, asl #1 ++ vld1.32 {yacc2lo[0]}, [TMP1] ++ vzip.u8 xreg1, xreg3 ++ vld1.32 {yacc2hi[0]}, [TMP3] ++ vzip.u8 xreg2, xreg4 ++ vld1.32 {yacc2lo[1]}, [TMP2] ++ vzip.u8 xreg3, xreg4 ++ vld1.32 {yacc2hi[1]}, [TMP4] ++ vzip.u8 xreg1, xreg2 ++ convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 ++ vmull.u8 xacc1, xreg1, d28 ++ vzip.u8 yreg1, yreg3 ++ vmlal.u8 xacc1, xreg2, d29 ++ vzip.u8 yreg2, yreg4 ++ vmull.u8 xacc2, xreg3, d28 ++ vzip.u8 yreg3, yreg4 ++ vmlal.u8 xacc2, xreg4, d29 ++ vzip.u8 yreg1, yreg2 ++ vmull.u8 yacc1, yreg1, d28 ++ vmlal.u8 yacc1, yreg2, d29 ++ vmull.u8 yacc2, yreg3, d28 ++ vmlal.u8 yacc2, yreg4, d29 ++.endm ++ + .macro bilinear_store_8888 numpix, tmp1, tmp2 + .if numpix == 4 + vst1.32 {d0, d1}, [OUT]! 
+@@ -2477,12 +2572,8 @@ fname: + .endm + + .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt +- bilinear_load_&src_fmt d0, d1, d2 +- vmull.u8 q1, d0, d28 +- vmlal.u8 q1, d1, d29 +- bilinear_load_&src_fmt d20, d21, d22 +- vmull.u8 q11, d20, d28 +- vmlal.u8 q11, d21, d29 ++ bilinear_load_and_vertical_interpolate_two_&src_fmt \ ++ q1, q11, d0, d1, d20, d21, d22, d23 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 + vshll.u16 q0, d2, #8 +@@ -2498,18 +2589,9 @@ fname: + .endm + + .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt +- bilinear_load_&src_fmt d0, d1, d2 +- vmull.u8 q1, d0, d28 +- vmlal.u8 q1, d1, d29 +- bilinear_load_&src_fmt d20, d21, d22 +- vmull.u8 q11, d20, d28 +- vmlal.u8 q11, d21, d29 +- bilinear_load_&src_fmt d4, d5, d6 +- vmull.u8 q3, d4, d28 +- vmlal.u8 q3, d5, d29 +- bilinear_load_&src_fmt d16, d17, d18 +- vmull.u8 q9, d16, d28 +- vmlal.u8 q9, d17, d29 ++ bilinear_load_and_vertical_interpolate_four_&src_fmt \ ++ q1, q11, d0, d1, d20, d21, d22, d23 \ ++ q3, q9, d4, d5, d16, d17, d18, d19 + pld [TMP1, PF_OFFS] + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0023-In-delegate_-src-dest-_iter_init-call-delegate-direc.patch b/recipes/xorg-lib/pixman-0.21.6/0023-In-delegate_-src-dest-_iter_init-call-delegate-direc.patch new file mode 100644 index 0000000000..96343f185b --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0023-In-delegate_-src-dest-_iter_init-call-delegate-direc.patch @@ -0,0 +1,54 @@ +From be4eaa0e4f79af38b7b89c5b09ca88d3a88d9396 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?S=C3=B8ren=20Sandmann=20Pedersen?= <ssp@redhat.com> +Date: Sat, 12 Mar 2011 19:06:02 -0500 +Subject: [PATCH 23/40] In delegate_{src,dest}_iter_init() call delegate directly. + +There is no reason to go through +_pixman_implementation_{src,dest}_iter_init(), especially since +_pixman_implementation_src_iter_init() is doing various other checks +that only need to be done once. 
+ +Also call delegate->src_iter_init() directly in pixman-sse2.c +--- + pixman/pixman-implementation.c | 4 ++-- + pixman/pixman-sse2.c | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c +index adaf9c6..892767e 100644 +--- a/pixman/pixman-implementation.c ++++ b/pixman/pixman-implementation.c +@@ -122,7 +122,7 @@ delegate_src_iter_init (pixman_implementation_t *imp, + uint8_t * buffer, + iter_flags_t flags) + { +- _pixman_implementation_src_iter_init ( ++ imp->delegate->src_iter_init ( + imp->delegate, iter, image, x, y, width, height, buffer, flags); + } + +@@ -137,7 +137,7 @@ delegate_dest_iter_init (pixman_implementation_t *imp, + uint8_t * buffer, + iter_flags_t flags) + { +- _pixman_implementation_dest_iter_init ( ++ imp->delegate->dest_iter_init ( + imp->delegate, iter, image, x, y, width, height, buffer, flags); + } + +diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c +index 696005f..d4a34e9 100644 +--- a/pixman/pixman-sse2.c ++++ b/pixman/pixman-sse2.c +@@ -6013,7 +6013,7 @@ sse2_src_iter_init (pixman_implementation_t *imp, + } + } + +- _pixman_implementation_src_iter_init ( ++ imp->delegate->src_iter_init ( + imp->delegate, iter, image, x, y, width, height, buffer, flags); + } + +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0024-Fill-out-parts-of-iters-in-_pixman_implementation_-s.patch b/recipes/xorg-lib/pixman-0.21.6/0024-Fill-out-parts-of-iters-in-_pixman_implementation_-s.patch new file mode 100644 index 0000000000..44fd38a8da --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0024-Fill-out-parts-of-iters-in-_pixman_implementation_-s.patch @@ -0,0 +1,111 @@ +From 74d0f44b6d6d613d24541b849835da0464cc6fd0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?S=C3=B8ren=20Sandmann=20Pedersen?= <ssp@redhat.com> +Date: Sat, 12 Mar 2011 19:12:35 -0500 +Subject: [PATCH 24/40] Fill out parts of iters in _pixman_implementation_{src,dest}_iter_init() + +This makes 
_pixman_implementation_{src,dest}_iter_init() responsible +for filling parts of the information in the iterators. Specifically, +the information passed as arguments is stored in the iterator. + +Also add a height field to pixman_iter_t(). +--- + pixman/pixman-general.c | 6 ------ + pixman/pixman-implementation.c | 16 ++++++++++++++++ + pixman/pixman-private.h | 11 ++++++++--- + pixman/pixman-sse2.c | 2 -- + 4 files changed, 24 insertions(+), 11 deletions(-) + +diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c +index 872fb7e..1a0fa7c 100644 +--- a/pixman/pixman-general.c ++++ b/pixman/pixman-general.c +@@ -44,12 +44,6 @@ general_src_iter_init (pixman_implementation_t *imp, + int x, int y, int width, int height, + uint8_t *buffer, iter_flags_t flags) + { +- iter->image = image; +- iter->x = x; +- iter->y = y; +- iter->width = width; +- iter->buffer = (uint32_t *)buffer; +- + if (image->type == SOLID) + { + _pixman_solid_fill_iter_init ( +diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c +index 892767e..bdd4543 100644 +--- a/pixman/pixman-implementation.c ++++ b/pixman/pixman-implementation.c +@@ -274,6 +274,14 @@ _pixman_implementation_src_iter_init (pixman_implementation_t *imp, + uint8_t *buffer, + iter_flags_t flags) + { ++ iter->image = image; ++ iter->buffer = (uint32_t *)buffer; ++ iter->x = x; ++ iter->y = y; ++ iter->width = width; ++ iter->height = height; ++ iter->flags = flags; ++ + if (!image) + { + iter->get_scanline = get_scanline_null; +@@ -301,6 +309,14 @@ _pixman_implementation_dest_iter_init (pixman_implementation_t *imp, + uint8_t *buffer, + iter_flags_t flags) + { ++ iter->image = image; ++ iter->buffer = (uint32_t *)buffer; ++ iter->x = x; ++ iter->y = y; ++ iter->width = width; ++ iter->height = height; ++ iter->flags = flags; ++ + (*imp->dest_iter_init) ( + imp, iter, image, x, y, width, height, buffer, flags); + } +diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h +index 1473dc4..ea9545f 
100644 +--- a/pixman/pixman-private.h ++++ b/pixman/pixman-private.h +@@ -212,14 +212,19 @@ typedef enum + + struct pixman_iter_t + { +- pixman_iter_get_scanline_t get_scanline; +- pixman_iter_write_back_t write_back; +- ++ /* These are initialized by _pixman_implementation_{src,dest}_init */ + pixman_image_t * image; + uint32_t * buffer; + int x, y; + int width; ++ int height; ++ iter_flags_t flags; ++ ++ /* These function pointers are initialized by the implementation */ ++ pixman_iter_get_scanline_t get_scanline; ++ pixman_iter_write_back_t write_back; + ++ /* These fields are scratch data that implementations can use */ + uint8_t * bits; + int stride; + }; +diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c +index d4a34e9..43a6bf2 100644 +--- a/pixman/pixman-sse2.c ++++ b/pixman/pixman-sse2.c +@@ -6004,8 +6004,6 @@ sse2_src_iter_init (pixman_implementation_t *imp, + + iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; +- iter->width = width; +- iter->buffer = (uint32_t *)buffer; + + iter->get_scanline = f->get_scanline; + return; +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0025-Simplify-the-prototype-for-iterator-initializers.patch b/recipes/xorg-lib/pixman-0.21.6/0025-Simplify-the-prototype-for-iterator-initializers.patch new file mode 100644 index 0000000000..1bfd6b4e92 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0025-Simplify-the-prototype-for-iterator-initializers.patch @@ -0,0 +1,442 @@ +From 6b27768d81c254a4f1d05473157328d5a5d99b9c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?S=C3=B8ren=20Sandmann=20Pedersen?= <ssp@redhat.com> +Date: Sat, 12 Mar 2011 19:42:58 -0500 +Subject: [PATCH 25/40] Simplify the prototype for iterator initializers. + +All of the information previously passed to the iterator initializers +is now available in the iterator itself, so there is no need to pass +it as arguments anymore. 
+--- + pixman/pixman-bits-image.c | 20 +++++--------- + pixman/pixman-conical-gradient.c | 7 +--- + pixman/pixman-general.c | 52 ++++++++------------------------------ + pixman/pixman-implementation.c | 30 ++++----------------- + pixman/pixman-linear-gradient.c | 16 +++-------- + pixman/pixman-private.h | 40 ++++++----------------------- + pixman/pixman-radial-gradient.c | 7 +--- + pixman/pixman-solid-fill.c | 17 +++++------- + pixman/pixman-sse2.c | 25 +++++++++-------- + 9 files changed, 61 insertions(+), 153 deletions(-) + +diff --git a/pixman/pixman-bits-image.c b/pixman/pixman-bits-image.c +index a865d71..835ecfb 100644 +--- a/pixman/pixman-bits-image.c ++++ b/pixman/pixman-bits-image.c +@@ -1362,12 +1362,9 @@ src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) + } + + void +-_pixman_bits_image_src_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags) ++_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter) + { +- if (flags & ITER_NARROW) ++ if (iter->flags & ITER_NARROW) + iter->get_scanline = src_get_scanline_narrow; + else + iter->get_scanline = src_get_scanline_wide; +@@ -1472,28 +1469,25 @@ dest_write_back_direct (pixman_iter_t *iter) + } + + void +-_pixman_bits_image_dest_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags) ++_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter) + { +- if (flags & ITER_NARROW) ++ if (iter->flags & ITER_NARROW) + { + if (((image->common.flags & + (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) == + (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_NO_ACCESSORS)) && + (image->bits.format == PIXMAN_a8r8g8b8 || + (image->bits.format == PIXMAN_x8r8g8b8 && +- (flags & ITER_LOCALIZED_ALPHA)))) ++ (iter->flags & ITER_LOCALIZED_ALPHA)))) + { +- iter->buffer = image->bits.bits + y * image->bits.rowstride + x; ++ 
iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x; + + iter->get_scanline = _pixman_iter_get_scanline_noop; + iter->write_back = dest_write_back_direct; + } + else + { +- if ((flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) == ++ if ((iter->flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) == + (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) + { + iter->get_scanline = _pixman_iter_get_scanline_noop; +diff --git a/pixman/pixman-conical-gradient.c b/pixman/pixman-conical-gradient.c +index 9d7d2e8..791d4f3 100644 +--- a/pixman/pixman-conical-gradient.c ++++ b/pixman/pixman-conical-gradient.c +@@ -171,12 +171,9 @@ conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) + } + + void +-_pixman_conical_gradient_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags) ++_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) + { +- if (flags & ITER_NARROW) ++ if (iter->flags & ITER_NARROW) + iter->get_scanline = conical_get_scanline_narrow; + else + iter->get_scanline = conical_get_scanline_wide; +diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c +index 1a0fa7c..727affc 100644 +--- a/pixman/pixman-general.c ++++ b/pixman/pixman-general.c +@@ -38,60 +38,30 @@ + #include "pixman-private.h" + + static void +-general_src_iter_init (pixman_implementation_t *imp, +- pixman_iter_t *iter, +- pixman_image_t *image, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags) ++general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) + { ++ pixman_image_t *image = iter->image; ++ + if (image->type == SOLID) +- { +- _pixman_solid_fill_iter_init ( +- image, iter, x, y, width, height, buffer, flags); +- } ++ _pixman_solid_fill_iter_init (image, iter); + else if (image->type == LINEAR) +- { +- _pixman_linear_gradient_iter_init ( +- image, iter, x, y, width, height, buffer, flags); +- } ++ 
_pixman_linear_gradient_iter_init (image, iter); + else if (image->type == RADIAL) +- { +- _pixman_radial_gradient_iter_init ( +- image, iter, x, y, width, height, buffer, flags); +- } ++ _pixman_radial_gradient_iter_init (image, iter); + else if (image->type == CONICAL) +- { +- _pixman_conical_gradient_iter_init ( +- image, iter, x, y, width, height, buffer, flags); +- } ++ _pixman_conical_gradient_iter_init (image, iter); + else if (image->type == BITS) +- { +- _pixman_bits_image_src_iter_init ( +- image, iter, x, y, width, height, buffer, flags); +- } ++ _pixman_bits_image_src_iter_init (image, iter); + else +- { + _pixman_log_error (FUNC, "Pixman bug: unknown image type\n"); +- } + } + + static void +-general_dest_iter_init (pixman_implementation_t *imp, +- pixman_iter_t *iter, +- pixman_image_t *image, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags) ++general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) + { +- iter->image = image; +- iter->x = x; +- iter->y = y; +- iter->width = width; +- iter->buffer = (uint32_t *)buffer; +- +- if (image->type == BITS) ++ if (iter->image->type == BITS) + { +- _pixman_bits_image_dest_iter_init ( +- image, iter, x, y, width, height, buffer, flags); ++ _pixman_bits_image_dest_iter_init (iter->image, iter); + } + else + { +diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c +index bdd4543..f1d3f99 100644 +--- a/pixman/pixman-implementation.c ++++ b/pixman/pixman-implementation.c +@@ -113,32 +113,16 @@ delegate_fill (pixman_implementation_t *imp, + + static void + delegate_src_iter_init (pixman_implementation_t *imp, +- pixman_iter_t * iter, +- pixman_image_t * image, +- int x, +- int y, +- int width, +- int height, +- uint8_t * buffer, +- iter_flags_t flags) ++ pixman_iter_t * iter) + { +- imp->delegate->src_iter_init ( +- imp->delegate, iter, image, x, y, width, height, buffer, flags); ++ imp->delegate->src_iter_init (imp->delegate, iter); + } + + 
static void + delegate_dest_iter_init (pixman_implementation_t *imp, +- pixman_iter_t * iter, +- pixman_image_t * image, +- int x, +- int y, +- int width, +- int height, +- uint8_t * buffer, +- iter_flags_t flags) ++ pixman_iter_t * iter) + { +- imp->delegate->dest_iter_init ( +- imp->delegate, iter, image, x, y, width, height, buffer, flags); ++ imp->delegate->dest_iter_init (imp->delegate, iter); + } + + pixman_implementation_t * +@@ -293,8 +277,7 @@ _pixman_implementation_src_iter_init (pixman_implementation_t *imp, + } + else + { +- (*imp->src_iter_init) ( +- imp, iter, image, x, y, width, height, buffer, flags); ++ (*imp->src_iter_init) (imp, iter); + } + } + +@@ -317,6 +300,5 @@ _pixman_implementation_dest_iter_init (pixman_implementation_t *imp, + iter->height = height; + iter->flags = flags; + +- (*imp->dest_iter_init) ( +- imp, iter, image, x, y, width, height, buffer, flags); ++ (*imp->dest_iter_init) (imp, iter); + } +diff --git a/pixman/pixman-linear-gradient.c b/pixman/pixman-linear-gradient.c +index 07303fc..6e1ea24 100644 +--- a/pixman/pixman-linear-gradient.c ++++ b/pixman/pixman-linear-gradient.c +@@ -233,18 +233,12 @@ linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) + } + + void +-_pixman_linear_gradient_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, +- int y, +- int width, +- int height, +- uint8_t *buffer, +- iter_flags_t flags) ++_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) + { +- if (linear_gradient_is_horizontal (image, x, y, width, height)) ++ if (linear_gradient_is_horizontal ( ++ iter->image, iter->x, iter->y, iter->width, iter->height)) + { +- if (flags & ITER_NARROW) ++ if (iter->flags & ITER_NARROW) + linear_get_scanline_narrow (iter, NULL); + else + linear_get_scanline_wide (iter, NULL); +@@ -253,7 +247,7 @@ _pixman_linear_gradient_iter_init (pixman_image_t *image, + } + else + { +- if (flags & ITER_NARROW) ++ if (iter->flags & ITER_NARROW) + iter->get_scanline 
= linear_get_scanline_narrow; + else + iter->get_scanline = linear_get_scanline_wide; +diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h +index ea9545f..60060a9 100644 +--- a/pixman/pixman-private.h ++++ b/pixman/pixman-private.h +@@ -233,39 +233,22 @@ void + _pixman_bits_image_setup_accessors (bits_image_t *image); + + void +-_pixman_bits_image_src_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags); ++_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter); ++ + void +-_pixman_bits_image_dest_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags); ++_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter); + + void +-_pixman_solid_fill_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags); ++_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t *iter); + + void +-_pixman_linear_gradient_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags); ++_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); + + void +-_pixman_radial_gradient_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags); ++_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); + + void +-_pixman_conical_gradient_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags); ++_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); + + pixman_image_t * + _pixman_image_allocate (void); +@@ -413,14 +396,7 @@ typedef pixman_bool_t (*pixman_fill_func_t) 
(pixman_implementation_t *imp, + int height, + uint32_t xor); + typedef void (*pixman_iter_init_func_t) (pixman_implementation_t *imp, +- pixman_iter_t *iter, +- pixman_image_t *image, +- int x, +- int y, +- int width, +- int height, +- uint8_t *buffer, +- iter_flags_t flags); ++ pixman_iter_t *iter); + + void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp); + void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp); +diff --git a/pixman/pixman-radial-gradient.c b/pixman/pixman-radial-gradient.c +index 6523b82..5e9fd73 100644 +--- a/pixman/pixman-radial-gradient.c ++++ b/pixman/pixman-radial-gradient.c +@@ -400,12 +400,9 @@ radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) + } + + void +-_pixman_radial_gradient_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags) ++_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) + { +- if (flags & ITER_NARROW) ++ if (iter->flags & ITER_NARROW) + iter->get_scanline = radial_get_scanline_narrow; + else + iter->get_scanline = radial_get_scanline_wide; +diff --git a/pixman/pixman-solid-fill.c b/pixman/pixman-solid-fill.c +index 67681f2..852e135 100644 +--- a/pixman/pixman-solid-fill.c ++++ b/pixman/pixman-solid-fill.c +@@ -27,24 +27,21 @@ + #include "pixman-private.h" + + void +-_pixman_solid_fill_iter_init (pixman_image_t *image, +- pixman_iter_t *iter, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags) ++_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t *iter) + { +- if (flags & ITER_NARROW) ++ if (iter->flags & ITER_NARROW) + { +- uint32_t *b = (uint32_t *)buffer; +- uint32_t *e = b + width; +- uint32_t color = image->solid.color_32; ++ uint32_t *b = (uint32_t *)iter->buffer; ++ uint32_t *e = b + iter->width; ++ uint32_t color = iter->image->solid.color_32; + + while (b < e) + *(b++) = color; + } + else + { +- uint64_t *b 
= (uint64_t *)buffer; +- uint64_t *e = b + width; ++ uint64_t *b = (uint64_t *)iter->buffer; ++ uint64_t *e = b + iter->width; + uint64_t color = image->solid.color_64; + + while (b < e) +diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c +index 43a6bf2..533b858 100644 +--- a/pixman/pixman-sse2.c ++++ b/pixman/pixman-sse2.c +@@ -5978,19 +5978,21 @@ static const fetcher_info_t fetchers[] = + }; + + static void +-sse2_src_iter_init (pixman_implementation_t *imp, +- pixman_iter_t *iter, +- pixman_image_t *image, +- int x, int y, int width, int height, +- uint8_t *buffer, iter_flags_t flags) ++sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) + { ++ pixman_image_t *image = iter->image; ++ int x = iter->x; ++ int y = iter->y; ++ int width = iter->width; ++ int height = iter->height; ++ + #define FLAGS \ + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM) + +- if ((flags & ITER_NARROW) && +- (image->common.flags & FLAGS) == FLAGS && +- x >= 0 && y >= 0 && +- x + width <= image->bits.width && ++ if ((iter->flags & ITER_NARROW) && ++ (image->common.flags & FLAGS) == FLAGS && ++ x >= 0 && y >= 0 && ++ x + width <= image->bits.width && + y + height <= image->bits.height) + { + const fetcher_info_t *f; +@@ -6002,7 +6004,7 @@ sse2_src_iter_init (pixman_implementation_t *imp, + uint8_t *b = (uint8_t *)image->bits.bits; + int s = image->bits.rowstride * 4; + +- iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8; ++ iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; + + iter->get_scanline = f->get_scanline; +@@ -6011,8 +6013,7 @@ sse2_src_iter_init (pixman_implementation_t *imp, + } + } + +- imp->delegate->src_iter_init ( +- imp->delegate, iter, image, x, y, width, height, buffer, flags); ++ imp->delegate->src_iter_init (imp->delegate, iter); + } + + #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +-- +1.6.6.1 + diff --git 
a/recipes/xorg-lib/pixman-0.21.6/0026-test-Randomize-some-tests-if-PIXMAN_RANDOMIZE_TESTS-.patch b/recipes/xorg-lib/pixman-0.21.6/0026-test-Randomize-some-tests-if-PIXMAN_RANDOMIZE_TESTS-.patch new file mode 100644 index 0000000000..8fc5b7706f --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0026-test-Randomize-some-tests-if-PIXMAN_RANDOMIZE_TESTS-.patch @@ -0,0 +1,187 @@ +From 7eb0abb5e819046537b9f809c7ec332c6679c557 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?S=C3=B8ren=20Sandmann=20Pedersen?= <ssp@redhat.com> +Date: Mon, 14 Mar 2011 14:56:22 -0400 +Subject: [PATCH 26/40] test: Randomize some tests if PIXMAN_RANDOMIZE_TESTS is set + +This patch makes it so that composite and stress-test will start from a +random seed if the PIXMAN_RANDOMIZE_TESTS environment variable is +set. Running the test suite in this mode is useful to get more test +coverage. + +Also, in stress-test.c make it so that setting the initial seed causes +threads to be turned off. This makes it much easier to see when +something fails.
+--- + test/composite.c | 17 ++++++++++++----- + test/stress-test.c | 37 ++++++++++++++++++++++++++----------- + test/utils.c | 10 ++++++++++ + test/utils.h | 3 +++ + 4 files changed, 51 insertions(+), 16 deletions(-) + +diff --git a/test/composite.c b/test/composite.c +index a86e5ed..e6d52b9 100644 +--- a/test/composite.c ++++ b/test/composite.c +@@ -868,7 +868,7 @@ main (int argc, char **argv) + { + #define N_TESTS (8 * 1024 * 1024) + int result = 0; +- int i; ++ uint32_t i; + + if (argc > 1) + { +@@ -890,15 +890,22 @@ main (int argc, char **argv) + } + } + ++ uint32_t seed; ++ ++ if (getenv ("PIXMAN_RANDOMIZE_TESTS")) ++ seed = get_random_seed(); ++ else ++ seed = 1; ++ + #ifdef USE_OPENMP +-# pragma omp parallel for default(none) shared(result) shared(argv) ++# pragma omp parallel for default(none) shared(result, argv, seed) + #endif +- for (i = 1; i <= N_TESTS; ++i) ++ for (i = seed; i <= N_TESTS; ++i) + { + if (!result && !run_test (i)) + { +- printf ("Test %d failed.\n", i); +- ++ printf ("Test 0x%08X failed.\n", i); ++ + result = i; + } + } +diff --git a/test/stress-test.c b/test/stress-test.c +index 166dc6d..d496f93 100644 +--- a/test/stress-test.c ++++ b/test/stress-test.c +@@ -1,4 +1,6 @@ ++#include <stdio.h> + #include "utils.h" ++#include <sys/types.h> + + #if 0 + #define fence_malloc malloc +@@ -730,11 +732,17 @@ static const pixman_op_t op_list[] = + }; + + static void +-run_test (uint32_t seed) ++run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod) + { + pixman_image_t *source, *mask, *dest; + pixman_op_t op; + ++ if (verbose) ++ { ++ if (mod == 0 || (seed % mod) == 0) ++ printf ("Seed 0x%08x\n", seed); ++ } ++ + lcg_srand (seed); + + source = create_random_image (); +@@ -787,6 +795,7 @@ main (int argc, char **argv) + uint32_t seed = 1; + uint32_t n_tests = 0xffffffff; + uint32_t mod = 0; ++ pixman_bool_t use_threads = TRUE; + uint32_t i; + + pixman_disable_out_of_bounds_workaround (); +@@ -811,6 +820,7 @@ main (int argc, char **argv) + 
else if (strcmp (argv[i], "-s") == 0 && i + 1 < argc) + { + get_int (argv[i + 1], &seed); ++ use_threads = FALSE; + i++; + } + else if (strcmp (argv[i], "-n") == 0 && i + 1 < argc) +@@ -825,7 +835,7 @@ main (int argc, char **argv) + + printf ("Options:\n\n" + "-n <number> Number of tests to run\n" +- "-s <seed> Seed of first test\n" ++ "-s <seed> Seed of first test (ignored if PIXMAN_RANDOMIZE_TESTS is set)\n" + "-v Print out seeds\n" + "-v <n> Print out every n'th seed\n\n"); + +@@ -836,19 +846,24 @@ main (int argc, char **argv) + if (n_tests == 0xffffffff) + n_tests = 8000; + +- /* FIXME: seed 2005763 fails in set_lum() with divide by zero */ ++ if (getenv ("PIXMAN_RANDOMIZE_TESTS")) ++ { ++ seed = get_random_seed(); ++ printf ("First seed: 0x%08x\n", seed); ++ } ++ ++ if (use_threads) ++ { + #ifdef USE_OPENMP + # pragma omp parallel for default(none) shared(verbose, n_tests, mod, seed) + #endif +- for (i = seed; i < seed + n_tests; ++i) ++ for (i = seed; i < seed + n_tests; ++i) ++ run_test (i, verbose, mod); ++ } ++ else + { +- if (verbose) +- { +- if (mod == 0 || (i % mod) == 0) +- printf ("Seed %d\n", i); +- } +- +- run_test (i); ++ for (i = seed; i < seed + n_tests; ++i) ++ run_test (i, verbose, mod); + } + + return 0; +diff --git a/test/utils.c b/test/utils.c +index 4bf02e1..56701c4 100644 +--- a/test/utils.c ++++ b/test/utils.c +@@ -455,6 +455,16 @@ gettime (void) + #endif + } + ++uint32_t ++get_random_seed (void) ++{ ++ double d = gettime(); ++ ++ lcg_srand (*(uint32_t *)&d); ++ ++ return lcg_rand_u32 (); ++} ++ + static const char *global_msg; + + static void +diff --git a/test/utils.h b/test/utils.h +index a5183f7..615ad78 100644 +--- a/test/utils.h ++++ b/test/utils.h +@@ -79,6 +79,9 @@ make_random_bytes (int n_bytes); + double + gettime (void); + ++uint32_t ++get_random_seed (void); ++ + /* main body of the fuzzer test */ + int + fuzzer_test_main (const char *test_name, +-- +1.6.6.1 + diff --git 
a/recipes/xorg-lib/pixman-0.21.6/0027-Add-simple-support-for-the-r8g8b8a8-and-r8g8b8x8-for.patch b/recipes/xorg-lib/pixman-0.21.6/0027-Add-simple-support-for-the-r8g8b8a8-and-r8g8b8x8-for.patch new file mode 100644 index 0000000000..1dbac60586 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0027-Add-simple-support-for-the-r8g8b8a8-and-r8g8b8x8-for.patch @@ -0,0 +1,206 @@ +From f05a90e5f8d1d0af60e2c684cbe9f1327c33135a Mon Sep 17 00:00:00 2001 +From: Alexandros Frantzis <alexandros.frantzis@linaro.org> +Date: Fri, 18 Mar 2011 14:36:15 +0200 +Subject: [PATCH 27/40] Add simple support for the r8g8b8a8 and r8g8b8x8 formats. + +This format is particularly useful on big-endian architectures, where RGBA in +memory/file order corresponds to r8g8b8a8 as an uint32_t. This is important +because RGBA is in some cases the only available choice (for example as a pixel +format in OpenGL ES 2.0). +--- + pixman/pixman-access.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++ + pixman/pixman.c | 6 +++ + pixman/pixman.h | 6 ++- + 3 files changed, 108 insertions(+), 1 deletions(-) + +diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c +index f1ce0ba..32c4d8b 100644 +--- a/pixman/pixman-access.c ++++ b/pixman/pixman-access.c +@@ -211,6 +211,46 @@ fetch_scanline_b8g8r8x8 (pixman_image_t *image, + } + + static void ++fetch_scanline_r8g8b8a8 (pixman_image_t *image, ++ int x, ++ int y, ++ int width, ++ uint32_t * buffer, ++ const uint32_t *mask) ++{ ++ const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; ++ const uint32_t *pixel = (uint32_t *)bits + x; ++ const uint32_t *end = pixel + width; ++ ++ while (pixel < end) ++ { ++ uint32_t p = READ (image, pixel++); ++ ++ *buffer++ = (((p & 0x000000ff) << 24) | (p >> 8)); ++ } ++} ++ ++static void ++fetch_scanline_r8g8b8x8 (pixman_image_t *image, ++ int x, ++ int y, ++ int width, ++ uint32_t * buffer, ++ const uint32_t *mask) ++{ ++ const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; ++ const 
uint32_t *pixel = (uint32_t *)bits + x; ++ const uint32_t *end = pixel + width; ++ ++ while (pixel < end) ++ { ++ uint32_t p = READ (image, pixel++); ++ ++ *buffer++ = (0xff000000 | (p >> 8)); ++ } ++} ++ ++static void + fetch_scanline_x14r6g6b6 (pixman_image_t *image, + int x, + int y, +@@ -1292,6 +1332,28 @@ fetch_pixel_b8g8r8x8 (bits_image_t *image, + } + + static uint32_t ++fetch_pixel_r8g8b8a8 (bits_image_t *image, ++ int offset, ++ int line) ++{ ++ uint32_t *bits = image->bits + line * image->rowstride; ++ uint32_t pixel = READ (image, (uint32_t *)bits + offset); ++ ++ return (((pixel & 0x000000ff) << 24) | (pixel >> 8)); ++} ++ ++static uint32_t ++fetch_pixel_r8g8b8x8 (bits_image_t *image, ++ int offset, ++ int line) ++{ ++ uint32_t *bits = image->bits + line * image->rowstride; ++ uint32_t pixel = READ (image, (uint32_t *)bits + offset); ++ ++ return (0xff000000 | (pixel >> 8)); ++} ++ ++static uint32_t + fetch_pixel_x14r6g6b6 (bits_image_t *image, + int offset, + int line) +@@ -2028,6 +2090,39 @@ store_scanline_b8g8r8x8 (bits_image_t * image, + } + + static void ++store_scanline_r8g8b8a8 (bits_image_t * image, ++ int x, ++ int y, ++ int width, ++ const uint32_t *values) ++{ ++ uint32_t *bits = image->bits + image->rowstride * y; ++ uint32_t *pixel = (uint32_t *)bits + x; ++ int i; ++ ++ for (i = 0; i < width; ++i) ++ { ++ WRITE (image, pixel++, ++ ((values[i] >> 24) & 0x000000ff) | (values[i] << 8)); ++ } ++} ++ ++static void ++store_scanline_r8g8b8x8 (bits_image_t * image, ++ int x, ++ int y, ++ int width, ++ const uint32_t *values) ++{ ++ uint32_t *bits = image->bits + image->rowstride * y; ++ uint32_t *pixel = (uint32_t *)bits + x; ++ int i; ++ ++ for (i = 0; i < width; ++i) ++ WRITE (image, pixel++, (values[i] << 8)); ++} ++ ++static void + store_scanline_x14r6g6b6 (bits_image_t * image, + int x, + int y, +@@ -2845,6 +2940,8 @@ static const format_info_t accessors[] = + FORMAT_INFO (x8b8g8r8), + FORMAT_INFO (b8g8r8a8), + FORMAT_INFO (b8g8r8x8), ++ 
FORMAT_INFO (r8g8b8a8), ++ FORMAT_INFO (r8g8b8x8), + FORMAT_INFO (x14r6g6b6), + + /* 24bpp formats */ +diff --git a/pixman/pixman.c b/pixman/pixman.c +index ec565f9..f21af2f 100644 +--- a/pixman/pixman.c ++++ b/pixman/pixman.c +@@ -873,6 +873,8 @@ color_to_pixel (pixman_color_t * color, + format == PIXMAN_x8b8g8r8 || + format == PIXMAN_b8g8r8a8 || + format == PIXMAN_b8g8r8x8 || ++ format == PIXMAN_r8g8b8a8 || ++ format == PIXMAN_r8g8b8x8 || + format == PIXMAN_r5g6b5 || + format == PIXMAN_b5g6r5 || + format == PIXMAN_a8 || +@@ -895,6 +897,8 @@ color_to_pixel (pixman_color_t * color, + ((c & 0x0000ff00) << 8) | + ((c & 0x000000ff) << 24); + } ++ if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_RGBA) ++ c = ((c & 0xff000000) >> 24) | (c << 8); + + if (format == PIXMAN_a1) + c = c >> 31; +@@ -1105,6 +1109,8 @@ pixman_format_supported_source (pixman_format_code_t format) + case PIXMAN_x8b8g8r8: + case PIXMAN_b8g8r8a8: + case PIXMAN_b8g8r8x8: ++ case PIXMAN_r8g8b8a8: ++ case PIXMAN_r8g8b8x8: + case PIXMAN_r8g8b8: + case PIXMAN_b8g8r8: + case PIXMAN_r5g6b5: +diff --git a/pixman/pixman.h b/pixman/pixman.h +index 1305bc1..59d0760 100644 +--- a/pixman/pixman.h ++++ b/pixman/pixman.h +@@ -650,11 +650,13 @@ struct pixman_indexed + #define PIXMAN_TYPE_YUY2 6 + #define PIXMAN_TYPE_YV12 7 + #define PIXMAN_TYPE_BGRA 8 ++#define PIXMAN_TYPE_RGBA 9 + + #define PIXMAN_FORMAT_COLOR(f) \ + (PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB || \ + PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR || \ +- PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA) ++ PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA || \ ++ PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA) + + /* 32bpp formats */ + typedef enum { +@@ -664,6 +666,8 @@ typedef enum { + PIXMAN_x8b8g8r8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8), + PIXMAN_b8g8r8a8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8), + PIXMAN_b8g8r8x8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8), ++ PIXMAN_r8g8b8a8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,8,8,8,8), ++ PIXMAN_r8g8b8x8 = 
PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,0,8,8,8), + PIXMAN_x14r6g6b6 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,6,6,6), + PIXMAN_x2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,10,10,10), + PIXMAN_a2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,2,10,10,10), +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0028-Add-support-for-the-r8g8b8a8-and-r8g8b8x8-formats-to.patch b/recipes/xorg-lib/pixman-0.21.6/0028-Add-support-for-the-r8g8b8a8-and-r8g8b8x8-formats-to.patch new file mode 100644 index 0000000000..7809e2ae4a --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0028-Add-support-for-the-r8g8b8a8-and-r8g8b8x8-formats-to.patch @@ -0,0 +1,110 @@ +From b514e63cfc58af21f7097db5a1b04292a758782a Mon Sep 17 00:00:00 2001 +From: Alexandros Frantzis <alexandros.frantzis@linaro.org> +Date: Fri, 18 Mar 2011 14:37:27 +0200 +Subject: [PATCH 28/40] Add support for the r8g8b8a8 and r8g8b8x8 formats to the tests. + +--- + test/blitters-test.c | 9 +++++++-- + test/composite.c | 9 +++++++++ + test/fetch-test.c | 10 ++++++++++ + test/stress-test.c | 2 ++ + 4 files changed, 28 insertions(+), 2 deletions(-) + +diff --git a/test/blitters-test.c b/test/blitters-test.c +index 63e7cb3..3ecfb09 100644 +--- a/test/blitters-test.c ++++ b/test/blitters-test.c +@@ -88,8 +88,11 @@ free_random_image (uint32_t initcrc, + uint32_t *data = pixman_image_get_data (img); + uint32_t mask = (1 << PIXMAN_FORMAT_DEPTH (fmt)) - 1; + +- if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA) ++ if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA || ++ PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_RGBA) ++ { + mask <<= (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt)); ++ } + + for (i = 0; i < 32; i++) + mask |= mask << (i * PIXMAN_FORMAT_BPP (fmt)); +@@ -182,6 +185,8 @@ static pixman_format_code_t img_fmt_list[] = { + PIXMAN_x8b8g8r8, + PIXMAN_b8g8r8a8, + PIXMAN_b8g8r8x8, ++ PIXMAN_r8g8b8a8, ++ PIXMAN_r8g8b8x8, + PIXMAN_x14r6g6b6, + PIXMAN_r8g8b8, + PIXMAN_b8g8r8, +@@ -412,6 +417,6 @@ main (int argc, const char 
*argv[]) + } + + return fuzzer_test_main("blitters", 2000000, +- 0x1DB8BDF8, ++ 0x265CDFEB, + test_composite, argc, argv); + } +diff --git a/test/composite.c b/test/composite.c +index e6d52b9..b0e0ba4 100644 +--- a/test/composite.c ++++ b/test/composite.c +@@ -102,6 +102,8 @@ static const format_t formats[] = + P(x8b8g8r8), + P(b8g8r8a8), + P(b8g8r8x8), ++ P(r8g8b8a8), ++ P(r8g8b8x8), + P(x2r10g10b10), + P(x2b10g10r10), + P(a2r10g10b10), +@@ -556,6 +558,13 @@ get_pixel (pixman_image_t *image, + bs = g + gs; + break; + ++ case PIXMAN_TYPE_RGBA: ++ as = 0; ++ bs = PIXMAN_FORMAT_BPP (format) - (b + g + r); ++ gs = b + bs; ++ rs = g + gs; ++ break; ++ + case PIXMAN_TYPE_A: + as = 0; + rs = 0; +diff --git a/test/fetch-test.c b/test/fetch-test.c +index 60bc765..feb98d9 100644 +--- a/test/fetch-test.c ++++ b/test/fetch-test.c +@@ -34,6 +34,16 @@ static testcase_t testcases[] = + NULL, + }, + { ++ PIXMAN_r8g8b8a8, ++ 2, 2, ++ 8, ++ { 0x11223300, 0x55667744, ++ 0x99aabb88, 0xddeeffcc }, ++ { 0x00112233, 0x44556677, ++ 0x8899aabb, 0xccddeeff }, ++ NULL, ++ }, ++ { + PIXMAN_g1, + 8, 2, + 4, +diff --git a/test/stress-test.c b/test/stress-test.c +index d496f93..571420a 100644 +--- a/test/stress-test.c ++++ b/test/stress-test.c +@@ -19,6 +19,8 @@ static const pixman_format_code_t image_formats[] = + PIXMAN_x8b8g8r8, + PIXMAN_b8g8r8a8, + PIXMAN_b8g8r8x8, ++ PIXMAN_r8g8b8a8, ++ PIXMAN_r8g8b8x8, + PIXMAN_x14r6g6b6, + PIXMAN_r8g8b8, + PIXMAN_b8g8r8, +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0029-test-Fix-infinite-loop-in-composite.patch b/recipes/xorg-lib/pixman-0.21.6/0029-test-Fix-infinite-loop-in-composite.patch new file mode 100644 index 0000000000..d9e4a380ee --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0029-test-Fix-infinite-loop-in-composite.patch @@ -0,0 +1,37 @@ +From ad3cbfb073fc325e1b3152898ca71b8255675957 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?S=C3=B8ren=20Sandmann=20Pedersen?= <ssp@redhat.com> +Date: Tue, 22 Mar 2011 13:42:05 -0400 +Subject: 
[PATCH 29/40] test: Fix infinite loop in composite + +When run in PIXMAN_RANDOMIZE_TESTS mode, this test would go into an +infinite loop because the loop started at 'seed' but the stop +condition was still N_TESTS. +--- + test/composite.c | 8 ++++---- + 1 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/test/composite.c b/test/composite.c +index b0e0ba4..9a001e5 100644 +--- a/test/composite.c ++++ b/test/composite.c +@@ -909,13 +909,13 @@ main (int argc, char **argv) + #ifdef USE_OPENMP + # pragma omp parallel for default(none) shared(result, argv, seed) + #endif +- for (i = seed; i <= N_TESTS; ++i) ++ for (i = 0; i <= N_TESTS; ++i) + { +- if (!result && !run_test (i)) ++ if (!result && !run_test (i + seed)) + { +- printf ("Test 0x%08X failed.\n", i); ++ printf ("Test 0x%08X failed.\n", seed + i); + +- result = i; ++ result = seed + i; + } + } + +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0030-ARM-tweaked-horizontal-weights-update-in-NEON-biline.patch b/recipes/xorg-lib/pixman-0.21.6/0030-ARM-tweaked-horizontal-weights-update-in-NEON-biline.patch new file mode 100644 index 0000000000..831065cb3e --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0030-ARM-tweaked-horizontal-weights-update-in-NEON-biline.patch @@ -0,0 +1,82 @@ +From 4a0ade2a1e96fe3f1bca8953be221af0b2908925 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Thu, 10 Mar 2011 15:34:10 +0200 +Subject: [PATCH 30/40] ARM: tweaked horizontal weights update in NEON bilinear scaling code + +Moving horizontal interpolation weights update instructions from the +beginning of the loop to its end makes it possible to hide some pipeline stalls and +improve performance. 
+--- + pixman/pixman-arm-neon-asm.S | 20 +++++++++++--------- + 1 files changed, 11 insertions(+), 9 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 71b30ac..8788e95 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2558,8 +2558,7 @@ fname: + bilinear_load_&src_fmt d0, d1, d2 + vmull.u8 q1, d0, d28 + vmlal.u8 q1, d1, d29 +- vshr.u16 d30, d24, #8 +- /* 4 cycles bubble */ ++ /* 5 cycles bubble */ + vshll.u16 q0, d2, #8 + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 +@@ -2574,17 +2573,17 @@ fname: + .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt + bilinear_load_and_vertical_interpolate_two_&src_fmt \ + q1, q11, d0, d1, d20, d21, d22, d23 +- vshr.u16 q15, q12, #8 +- vadd.u16 q12, q12, q13 + vshll.u16 q0, d2, #8 + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 + vshll.u16 q10, d22, #8 + vmlsl.u16 q10, d22, d31 + vmlal.u16 q10, d23, d31 +- vshrn.u32 d30, q0, #16 +- vshrn.u32 d31, q10, #16 +- vmovn.u16 d0, q15 ++ vshrn.u32 d0, q0, #16 ++ vshrn.u32 d1, q10, #16 ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ vmovn.u16 d0, q0 + bilinear_store_&dst_fmt 2, q2, q3 + .endm + +@@ -2593,8 +2592,6 @@ fname: + q1, q11, d0, d1, d20, d21, d22, d23 \ + q3, q9, d4, d5, d16, d17, d18, d19 + pld [TMP1, PF_OFFS] +- vshr.u16 q15, q12, #8 +- vadd.u16 q12, q12, q13 + vshll.u16 q0, d2, #8 + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 +@@ -2614,8 +2611,10 @@ fname: + vshrn.u32 d1, q10, #16 + vshrn.u32 d4, q2, #16 + vshrn.u32 d5, q8, #16 ++ vshr.u16 q15, q12, #8 + vmovn.u16 d0, q0 + vmovn.u16 d1, q2 ++ vadd.u16 q12, q12, q13 + bilinear_store_&dst_fmt 4, q2, q3 + .endm + +@@ -2669,6 +2668,9 @@ pixman_asm_function fname + vadd.u16 d25, d25, d26 + vadd.u16 q13, q13, q13 + ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ + subs WIDTH, WIDTH, #4 + blt 1f + mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift) +-- +1.6.6.1 + diff --git 
a/recipes/xorg-lib/pixman-0.21.6/0031-ARM-use-aligned-memory-writes-in-NEON-bilinear-scali.patch b/recipes/xorg-lib/pixman-0.21.6/0031-ARM-use-aligned-memory-writes-in-NEON-bilinear-scali.patch new file mode 100644 index 0000000000..3c8394b983 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0031-ARM-use-aligned-memory-writes-in-NEON-bilinear-scali.patch @@ -0,0 +1,124 @@ +From f36c189475951276766b2653ae9628c4d02dc0c9 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Thu, 10 Mar 2011 16:12:23 +0200 +Subject: [PATCH 31/40] ARM: use aligned memory writes in NEON bilinear scaling code + +--- + pixman/pixman-arm-neon-asm.S | 49 ++++++++++++++++++++++++++++++------------ + 1 files changed, 35 insertions(+), 14 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 8788e95..a4d6a9a 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2527,9 +2527,9 @@ fname: + + .macro bilinear_store_8888 numpix, tmp1, tmp2 + .if numpix == 4 +- vst1.32 {d0, d1}, [OUT]! ++ vst1.32 {d0, d1}, [OUT, :128]! + .elseif numpix == 2 +- vst1.32 {d0}, [OUT]! ++ vst1.32 {d0}, [OUT, :64]! + .elseif numpix == 1 + vst1.32 {d0[0]}, [OUT, :32]! + .else +@@ -2544,11 +2544,11 @@ fname: + vuzp.u8 d0, d2 + convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 + .if numpix == 4 +- vst1.16 {d2}, [OUT]! ++ vst1.16 {d2}, [OUT, :64]! + .elseif numpix == 2 +- vst1.32 {d2[0]}, [OUT]! ++ vst1.32 {d2[0]}, [OUT, :32]! + .elseif numpix == 1 +- vst1.16 {d2[0]}, [OUT]! ++ vst1.16 {d2[0]}, [OUT, :16]! + .else + .error bilinear_store_0565 numpix is unsupported + .endif +@@ -2622,8 +2622,7 @@ fname: + * Main template macro for generating NEON optimized bilinear scanline + * functions. 
+ * +- * TODO: use software pipelining and aligned writes to the destination buffer +- * in order to improve performance ++ * TODO: use software pipelining in order to improve performance + * + * Bilinear scanline scaler macro template uses the following arguments: + * fname - name of the function to generate +@@ -2635,7 +2634,8 @@ fname: + */ + + .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ +- bpp_shift, prefetch_distance ++ src_bpp_shift, dst_bpp_shift, \ ++ prefetch_distance + + pixman_asm_function fname + OUT .req r0 +@@ -2666,19 +2666,40 @@ pixman_asm_function fname + vdup.u8 d28, WT + vdup.u8 d29, WB + vadd.u16 d25, d25, d26 +- vadd.u16 q13, q13, q13 + ++ /* ensure good destination alignment */ ++ cmp WIDTH, #1 ++ blt 0f ++ tst OUT, #(1 << dst_bpp_shift) ++ beq 0f ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ bilinear_interpolate_last_pixel src_fmt, dst_fmt ++ sub WIDTH, WIDTH, #1 ++0: ++ vadd.u16 q13, q13, q13 + vshr.u16 q15, q12, #8 + vadd.u16 q12, q12, q13 + ++ cmp WIDTH, #2 ++ blt 0f ++ tst OUT, #(1 << (dst_bpp_shift + 1)) ++ beq 0f ++ bilinear_interpolate_two_pixels src_fmt, dst_fmt ++ sub WIDTH, WIDTH, #2 ++0: ++ ++ /* start the main loop */ + subs WIDTH, WIDTH, #4 + blt 1f +- mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift) ++ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) + 0: + bilinear_interpolate_four_pixels src_fmt, dst_fmt + subs WIDTH, WIDTH, #4 + bge 0b + 1: ++ ++ /* handle the remaining trailing pixels */ + tst WIDTH, #2 + beq 2f + bilinear_interpolate_two_pixels src_fmt, dst_fmt +@@ -2708,13 +2729,13 @@ pixman_asm_function fname + .endm + + generate_bilinear_scanline_func \ +- pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28 ++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 2, 28 + + generate_bilinear_scanline_func \ +- pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28 ++ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 1, 
28 + + generate_bilinear_scanline_func \ +- pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28 ++ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 2, 28 + + generate_bilinear_scanline_func \ +- pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28 ++ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 1, 28 +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0032-ARM-support-for-software-pipelining-in-bilinear-macr.patch b/recipes/xorg-lib/pixman-0.21.6/0032-ARM-support-for-software-pipelining-in-bilinear-macr.patch new file mode 100644 index 0000000000..c67f9c638f --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0032-ARM-support-for-software-pipelining-in-bilinear-macr.patch @@ -0,0 +1,70 @@ +From 6d296598575b8307262fac2cf438d7cc832d09d3 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 16 Mar 2011 16:33:41 +0200 +Subject: [PATCH 32/40] ARM: support for software pipelining in bilinear macros + +Now it's possible to override the main loop of the bilinear scaling code +with an optimized pipelined implementation. 
+--- + pixman/pixman-arm-neon-asm.S | 31 ++++++++++++++++++++++++++++--- + 1 files changed, 28 insertions(+), 3 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index a4d6a9a..d84f2cc 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2618,12 +2618,32 @@ fname: + bilinear_store_&dst_fmt 4, q2, q3 + .endm + ++.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt ++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt ++ bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head ++.else ++ bilinear_interpolate_four_pixels src_fmt, dst_fmt ++.endif ++.endm ++ ++.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt ++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt ++ bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail ++.endif ++.endm ++ ++.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt ++ bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head ++.else ++ bilinear_interpolate_four_pixels src_fmt, dst_fmt ++.endif ++.endm ++ + /* + * Main template macro for generating NEON optimized bilinear scanline + * functions. 
+ * +- * TODO: use software pipelining in order to improve performance +- * + * Bilinear scanline scaler macro template uses the following arguments: + * fname - name of the function to generate + * src_fmt - source color format (8888 or 0565) +@@ -2693,10 +2713,15 @@ pixman_asm_function fname + subs WIDTH, WIDTH, #4 + blt 1f + mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) ++ bilinear_interpolate_four_pixels_head src_fmt, dst_fmt ++ subs WIDTH, WIDTH, #4 ++ blt 5f + 0: +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt + subs WIDTH, WIDTH, #4 + bge 0b ++5: ++ bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt + 1: + + /* handle the remaining trailing pixels */ +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0033-ARM-use-less-ARM-instructions-in-NEON-bilinear-scali.patch b/recipes/xorg-lib/pixman-0.21.6/0033-ARM-use-less-ARM-instructions-in-NEON-bilinear-scali.patch new file mode 100644 index 0000000000..1d66979f99 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0033-ARM-use-less-ARM-instructions-in-NEON-bilinear-scali.patch @@ -0,0 +1,168 @@ +From ec2da8e651767421a8403bf0810445fdec1315ba Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Mon, 21 Mar 2011 18:41:53 +0200 +Subject: [PATCH 33/40] ARM: use less ARM instructions in NEON bilinear scaling code + +This reduces code size and also puts less pressure on the +instruction decoder. 
+--- + pixman/pixman-arm-neon-asm.S | 79 ++++++++++++++++++++---------------------- + 1 files changed, 38 insertions(+), 41 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index d84f2cc..9878bf7 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2412,21 +2412,19 @@ fname: + */ + + .macro bilinear_load_8888 reg1, reg2, tmp +- mov TMP2, X, asr #16 ++ mov TMP1, X, asr #16 + add X, X, UX +- add TMP1, TOP, TMP2, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {reg1}, [TMP1] +- vld1.32 {reg2}, [TMP2] ++ add TMP1, TOP, TMP1, asl #2 ++ vld1.32 {reg1}, [TMP1], STRIDE ++ vld1.32 {reg2}, [TMP1] + .endm + + .macro bilinear_load_0565 reg1, reg2, tmp +- mov TMP2, X, asr #16 ++ mov TMP1, X, asr #16 + add X, X, UX +- add TMP1, TOP, TMP2, asl #1 +- add TMP2, BOTTOM, TMP2, asl #1 +- vld1.32 {reg2[0]}, [TMP1] +- vld1.32 {reg2[1]}, [TMP2] ++ add TMP1, TOP, TMP1, asl #1 ++ vld1.32 {reg2[0]}, [TMP1], STRIDE ++ vld1.32 {reg2[1]}, [TMP1] + convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp + .endm + +@@ -2454,18 +2452,16 @@ fname: + .macro bilinear_load_and_vertical_interpolate_two_0565 \ + acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + +- mov TMP2, X, asr #16 ++ mov TMP1, X, asr #16 + add X, X, UX +- mov TMP4, X, asr #16 ++ add TMP1, TOP, TMP1, asl #1 ++ mov TMP2, X, asr #16 + add X, X, UX +- add TMP1, TOP, TMP2, asl #1 +- add TMP2, BOTTOM, TMP2, asl #1 +- add TMP3, TOP, TMP4, asl #1 +- add TMP4, BOTTOM, TMP4, asl #1 +- vld1.32 {acc2lo[0]}, [TMP1] +- vld1.32 {acc2hi[0]}, [TMP3] +- vld1.32 {acc2lo[1]}, [TMP2] +- vld1.32 {acc2hi[1]}, [TMP4] ++ add TMP2, TOP, TMP2, asl #1 ++ vld1.32 {acc2lo[0]}, [TMP1], STRIDE ++ vld1.32 {acc2hi[0]}, [TMP2], STRIDE ++ vld1.32 {acc2lo[1]}, [TMP1] ++ vld1.32 {acc2hi[1]}, [TMP2] + convert_0565_to_x888 acc2, reg3, reg2, reg1 + vzip.u8 reg1, reg3 + vzip.u8 reg2, reg4 +@@ -2481,34 +2477,30 @@ fname: + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + yacc1, 
yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + +- mov TMP2, X, asr #16 ++ mov TMP1, X, asr #16 + add X, X, UX +- mov TMP4, X, asr #16 ++ add TMP1, TOP, TMP1, asl #1 ++ mov TMP2, X, asr #16 + add X, X, UX +- add TMP1, TOP, TMP2, asl #1 +- add TMP2, BOTTOM, TMP2, asl #1 +- add TMP3, TOP, TMP4, asl #1 +- add TMP4, BOTTOM, TMP4, asl #1 +- vld1.32 {xacc2lo[0]}, [TMP1] +- vld1.32 {xacc2hi[0]}, [TMP3] +- vld1.32 {xacc2lo[1]}, [TMP2] +- vld1.32 {xacc2hi[1]}, [TMP4] ++ add TMP2, TOP, TMP2, asl #1 ++ vld1.32 {xacc2lo[0]}, [TMP1], STRIDE ++ vld1.32 {xacc2hi[0]}, [TMP2], STRIDE ++ vld1.32 {xacc2lo[1]}, [TMP1] ++ vld1.32 {xacc2hi[1]}, [TMP2] + convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 +- mov TMP2, X, asr #16 ++ mov TMP1, X, asr #16 + add X, X, UX +- mov TMP4, X, asr #16 ++ add TMP1, TOP, TMP1, asl #1 ++ mov TMP2, X, asr #16 + add X, X, UX +- add TMP1, TOP, TMP2, asl #1 +- add TMP2, BOTTOM, TMP2, asl #1 +- add TMP3, TOP, TMP4, asl #1 +- add TMP4, BOTTOM, TMP4, asl #1 +- vld1.32 {yacc2lo[0]}, [TMP1] ++ add TMP2, TOP, TMP2, asl #1 ++ vld1.32 {yacc2lo[0]}, [TMP1], STRIDE + vzip.u8 xreg1, xreg3 +- vld1.32 {yacc2hi[0]}, [TMP3] ++ vld1.32 {yacc2hi[0]}, [TMP2], STRIDE + vzip.u8 xreg2, xreg4 +- vld1.32 {yacc2lo[1]}, [TMP2] ++ vld1.32 {yacc2lo[1]}, [TMP1] + vzip.u8 xreg3, xreg4 +- vld1.32 {yacc2hi[1]}, [TMP4] ++ vld1.32 {yacc2hi[1]}, [TMP2] + vzip.u8 xreg1, xreg2 + convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 + vmull.u8 xacc1, xreg1, d28 +@@ -2592,6 +2584,7 @@ fname: + q1, q11, d0, d1, d20, d21, d22, d23 \ + q3, q9, d4, d5, d16, d17, d18, d19 + pld [TMP1, PF_OFFS] ++ sub TMP1, TMP1, STRIDE + vshll.u16 q0, d2, #8 + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 +@@ -2671,6 +2664,7 @@ pixman_asm_function fname + PF_OFFS .req r7 + TMP3 .req r8 + TMP4 .req r9 ++ STRIDE .req r2 + + mov ip, sp + push {r4, r5, r6, r7, r8, r9} +@@ -2678,6 +2672,9 @@ pixman_asm_function fname + ldmia ip, {WB, X, UX, WIDTH} + mul PF_OFFS, PF_OFFS, UX + ++ sub STRIDE, BOTTOM, TOP ++ .unreq BOTTOM ++ 
+ cmp WIDTH, #0 + ble 3f + +@@ -2738,7 +2735,6 @@ pixman_asm_function fname + + .unreq OUT + .unreq TOP +- .unreq BOTTOM + .unreq WT + .unreq WB + .unreq X +@@ -2749,6 +2745,7 @@ pixman_asm_function fname + .unreq PF_OFFS + .unreq TMP3 + .unreq TMP4 ++ .unreq STRIDE + .endfunc + + .endm +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0034-ARM-support-different-levels-of-loop-unrolling-in-bi.patch b/recipes/xorg-lib/pixman-0.21.6/0034-ARM-support-different-levels-of-loop-unrolling-in-bi.patch new file mode 100644 index 0000000000..82661f0869 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0034-ARM-support-different-levels-of-loop-unrolling-in-bi.patch @@ -0,0 +1,156 @@ +From cd20ceb7602348ecbfa0db1756dc548a0bad3c9d Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Thu, 17 Mar 2011 19:42:01 +0200 +Subject: [PATCH 34/40] ARM: support different levels of loop unrolling in bilinear scaler + +Now an extra 'flags' parameter is supported in bilinear scanline scaling +function generation macro. It can be used to enable 4 or 8 pixels per +loop iteration unrolling and provide save/restore code for d8-d15 +registers. 
+--- + pixman/pixman-arm-neon-asm.S | 84 ++++++++++++++++++++++++++++++++++++++---- + 1 files changed, 76 insertions(+), 8 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 9878bf7..6141770 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2633,6 +2633,36 @@ fname: + .endif + .endm + ++.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt ++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt ++ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head ++.else ++ bilinear_interpolate_four_pixels_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++.endif ++.endm ++ ++.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt ++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt ++ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail ++.else ++ bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt ++.endif ++.endm ++ ++.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt ++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt ++ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head ++.else ++ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++.endif ++.endm ++ ++.set BILINEAR_FLAG_UNROLL_4, 0 ++.set BILINEAR_FLAG_UNROLL_8, 1 ++.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 ++ + /* + * Main template macro for generating NEON optimized bilinear scanline + * functions. 
+@@ -2648,7 +2678,7 @@ fname: + + .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ + src_bpp_shift, dst_bpp_shift, \ +- prefetch_distance ++ prefetch_distance, flags + + pixman_asm_function fname + OUT .req r0 +@@ -2672,6 +2702,10 @@ pixman_asm_function fname + ldmia ip, {WB, X, UX, WIDTH} + mul PF_OFFS, PF_OFFS, UX + ++.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 ++ vpush {d8-d15} ++.endif ++ + sub STRIDE, BOTTOM, TOP + .unreq BOTTOM + +@@ -2705,8 +2739,34 @@ pixman_asm_function fname + bilinear_interpolate_two_pixels src_fmt, dst_fmt + sub WIDTH, WIDTH, #2 + 0: +- +- /* start the main loop */ ++.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 ++/*********** 8 pixels per iteration *****************/ ++ cmp WIDTH, #4 ++ blt 0f ++ tst OUT, #(1 << (dst_bpp_shift + 2)) ++ beq 0f ++ bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ sub WIDTH, WIDTH, #4 ++0: ++ subs WIDTH, WIDTH, #8 ++ blt 1f ++ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) ++ bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt ++ subs WIDTH, WIDTH, #8 ++ blt 5f ++0: ++ bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt ++ subs WIDTH, WIDTH, #8 ++ bge 0b ++5: ++ bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt ++1: ++ tst WIDTH, #4 ++ beq 2f ++ bilinear_interpolate_four_pixels src_fmt, dst_fmt ++2: ++.else ++/*********** 4 pixels per iteration *****************/ + subs WIDTH, WIDTH, #4 + blt 1f + mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) +@@ -2720,7 +2780,8 @@ pixman_asm_function fname + 5: + bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt + 1: +- ++/****************************************************/ ++.endif + /* handle the remaining trailing pixels */ + tst WIDTH, #2 + beq 2f +@@ -2730,6 +2791,9 @@ pixman_asm_function fname + beq 3f + bilinear_interpolate_last_pixel src_fmt, dst_fmt + 3: ++.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 ++ vpop {d8-d15} ++.endif + pop {r4, r5, r6, r7, r8, r9} + bx lr + +@@ -2751,13 +2815,17 @@ 
pixman_asm_function fname + .endm + + generate_bilinear_scanline_func \ +- pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 2, 28 ++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ ++ 2, 2, 28, BILINEAR_FLAG_UNROLL_4 + + generate_bilinear_scanline_func \ +- pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 1, 28 ++ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \ ++ 2, 1, 28, BILINEAR_FLAG_UNROLL_4 + + generate_bilinear_scanline_func \ +- pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 2, 28 ++ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \ ++ 1, 2, 28, BILINEAR_FLAG_UNROLL_4 + + generate_bilinear_scanline_func \ +- pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 1, 28 ++ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \ ++ 1, 1, 28, BILINEAR_FLAG_UNROLL_4 +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0035-ARM-pipelined-NEON-implementation-of-bilinear-scaled.patch b/recipes/xorg-lib/pixman-0.21.6/0035-ARM-pipelined-NEON-implementation-of-bilinear-scaled.patch new file mode 100644 index 0000000000..c0d485cae4 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0035-ARM-pipelined-NEON-implementation-of-bilinear-scaled.patch @@ -0,0 +1,166 @@ +From d3b1ca20fe8af20ca097dcc8799ef25cee03dd6b Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 16 Mar 2011 17:24:49 +0200 +Subject: [PATCH 35/40] ARM: pipelined NEON implementation of bilinear scaled 'src_8888_8888' + +Performance of the inner loop when working with the data in L1 cache: + ARM Cortex-A8: 41 cycles per 4 pixels (no stalls and partial dual issue) + ARM Cortex-A9: 48 cycles per 4 pixels (no stalls) + +It might be still possible to improve performance even more on ARM Cortex-A8 +with a better use of dual issue. 
+ +Benchmark on ARM Cortex-A8 r1p3 @600MHz, 32-bit LPDDR @166MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=40.38 MPix/s + after: op=1, src=20028888, dst=20028888, speed=48.47 MPix/s + +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=79.68 MPix/s + after: op=1, src=20028888, dst=20028888, speed=93.11 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 127 ++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 127 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 6141770..326e085 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2814,6 +2814,133 @@ pixman_asm_function fname + + .endm + ++/*****************************************************************************/ ++ ++.set have_bilinear_interpolate_four_pixels_8888_8888, 1 ++ ++.macro bilinear_interpolate_four_pixels_8888_8888_head ++ mov TMP1, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, asl #2 ++ ++ vld1.32 {d22}, [TMP1], STRIDE ++ vld1.32 {d23}, [TMP1] ++ mov TMP3, X, asr #16 ++ add X, X, UX ++ add TMP3, TOP, TMP3, asl #2 ++ vmull.u8 q8, d22, d28 ++ vmlal.u8 q8, d23, d29 ++ ++ vld1.32 {d22}, [TMP2], STRIDE ++ vld1.32 {d23}, [TMP2] ++ mov TMP4, X, asr #16 ++ add X, X, UX ++ add TMP4, TOP, TMP4, asl #2 ++ vmull.u8 q9, d22, d28 ++ vmlal.u8 q9, d23, d29 ++ ++ vld1.32 {d22}, [TMP3], STRIDE ++ vld1.32 {d23}, [TMP3] ++ vmull.u8 q10, d22, d28 ++ vmlal.u8 q10, d23, d29 ++ ++ vshll.u16 q0, d16, #8 ++ vmlsl.u16 q0, d16, d30 ++ vmlal.u16 q0, d17, d30 ++ ++ pld [TMP4, PF_OFFS] ++ vld1.32 {d16}, [TMP4], STRIDE ++ vld1.32 {d17}, [TMP4] ++ pld [TMP4, PF_OFFS] ++ vmull.u8 q11, d16, d28 ++ vmlal.u8 q11, d17, d29 ++ ++ vshll.u16 q1, d18, #8 ++ 
vmlsl.u16 q1, d18, d31 ++.endm ++ ++.macro bilinear_interpolate_four_pixels_8888_8888_tail ++ vmlal.u16 q1, d19, d31 ++ vshr.u16 q15, q12, #8 ++ vshll.u16 q2, d20, #8 ++ vmlsl.u16 q2, d20, d30 ++ vmlal.u16 q2, d21, d30 ++ vshll.u16 q3, d22, #8 ++ vmlsl.u16 q3, d22, d31 ++ vmlal.u16 q3, d23, d31 ++ vadd.u16 q12, q12, q13 ++ vshrn.u32 d0, q0, #16 ++ vshrn.u32 d1, q1, #16 ++ vshrn.u32 d4, q2, #16 ++ vshr.u16 q15, q12, #8 ++ vshrn.u32 d5, q3, #16 ++ vmovn.u16 d6, q0 ++ vmovn.u16 d7, q2 ++ vadd.u16 q12, q12, q13 ++ vst1.32 {d6, d7}, [OUT, :128]! ++.endm ++ ++.macro bilinear_interpolate_four_pixels_8888_8888_tail_head ++ mov TMP1, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, asl #2 ++ vmlal.u16 q1, d19, d31 ++ vshr.u16 q15, q12, #8 ++ vshll.u16 q2, d20, #8 ++ vmlsl.u16 q2, d20, d30 ++ vmlal.u16 q2, d21, d30 ++ vshll.u16 q3, d22, #8 ++ vld1.32 {d20}, [TMP1], STRIDE ++ vmlsl.u16 q3, d22, d31 ++ vmlal.u16 q3, d23, d31 ++ vld1.32 {d21}, [TMP1] ++ vmull.u8 q8, d20, d28 ++ vmlal.u8 q8, d21, d29 ++ vshrn.u32 d0, q0, #16 ++ vshrn.u32 d1, q1, #16 ++ vshrn.u32 d4, q2, #16 ++ vld1.32 {d22}, [TMP2], STRIDE ++ vshrn.u32 d5, q3, #16 ++ vadd.u16 q12, q12, q13 ++ vld1.32 {d23}, [TMP2] ++ vmull.u8 q9, d22, d28 ++ mov TMP3, X, asr #16 ++ add X, X, UX ++ add TMP3, TOP, TMP3, asl #2 ++ mov TMP4, X, asr #16 ++ add X, X, UX ++ add TMP4, TOP, TMP4, asl #2 ++ vmlal.u8 q9, d23, d29 ++ vld1.32 {d22}, [TMP3], STRIDE ++ vshr.u16 q15, q12, #8 ++ vld1.32 {d23}, [TMP3] ++ vmull.u8 q10, d22, d28 ++ vmlal.u8 q10, d23, d29 ++ vmovn.u16 d6, q0 ++ vshll.u16 q0, d16, #8 ++ vmovn.u16 d7, q2 ++ vmlsl.u16 q0, d16, d30 ++ vmlal.u16 q0, d17, d30 ++ pld [TMP4, PF_OFFS] ++ vld1.32 {d16}, [TMP4], STRIDE ++ vadd.u16 q12, q12, q13 ++ vld1.32 {d17}, [TMP4] ++ pld [TMP4, PF_OFFS] ++ vmull.u8 q11, d16, d28 ++ vmlal.u8 q11, d17, d29 ++ vst1.32 {d6, d7}, [OUT, :128]! 
++ vshll.u16 q1, d18, #8 ++ vmlsl.u16 q1, d18, d31 ++.endm ++ ++/*****************************************************************************/ ++ + generate_bilinear_scanline_func \ + pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ + 2, 2, 28, BILINEAR_FLAG_UNROLL_4 +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0036-ARM-pipelined-NEON-implementation-of-bilinear-scaled.patch b/recipes/xorg-lib/pixman-0.21.6/0036-ARM-pipelined-NEON-implementation-of-bilinear-scaled.patch new file mode 100644 index 0000000000..4fca16fb9e --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0036-ARM-pipelined-NEON-implementation-of-bilinear-scaled.patch @@ -0,0 +1,283 @@ +From dfccf9b97acbff6e847e4e52c5dec0a4297d30a0 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Mon, 21 Mar 2011 20:25:27 +0200 +Subject: [PATCH 36/40] ARM: pipelined NEON implementation of bilinear scaled 'src_8888_0565' + +Benchmark on ARM Cortex-A8 r1p3 @600MHz, 32-bit LPDDR @166MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=10020565, speed=33.59 MPix/s + after: op=1, src=20028888, dst=10020565, speed=46.25 MPix/s + +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=10020565, speed=63.86 MPix/s + after: op=1, src=20028888, dst=10020565, speed=84.22 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 245 +++++++++++++++++++++++++++++++++++++++++- + 1 files changed, 244 insertions(+), 1 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 326e085..e560bdf 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2941,13 +2941,256 @@ pixman_asm_function fname + + /*****************************************************************************/ + ++.set have_bilinear_interpolate_eight_pixels_8888_0565, 1 ++ 
++.macro bilinear_interpolate_eight_pixels_8888_0565_head ++ mov TMP1, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, asl #2 ++ vld1.32 {d20}, [TMP1], STRIDE ++ vld1.32 {d21}, [TMP1] ++ vmull.u8 q8, d20, d28 ++ vmlal.u8 q8, d21, d29 ++ vld1.32 {d22}, [TMP2], STRIDE ++ vld1.32 {d23}, [TMP2] ++ vmull.u8 q9, d22, d28 ++ mov TMP3, X, asr #16 ++ add X, X, UX ++ add TMP3, TOP, TMP3, asl #2 ++ mov TMP4, X, asr #16 ++ add X, X, UX ++ add TMP4, TOP, TMP4, asl #2 ++ vmlal.u8 q9, d23, d29 ++ vld1.32 {d22}, [TMP3], STRIDE ++ vld1.32 {d23}, [TMP3] ++ vmull.u8 q10, d22, d28 ++ vmlal.u8 q10, d23, d29 ++ vshll.u16 q0, d16, #8 ++ vmlsl.u16 q0, d16, d30 ++ vmlal.u16 q0, d17, d30 ++ pld [TMP4, PF_OFFS] ++ vld1.32 {d16}, [TMP4], STRIDE ++ vld1.32 {d17}, [TMP4] ++ pld [TMP4, PF_OFFS] ++ vmull.u8 q11, d16, d28 ++ vmlal.u8 q11, d17, d29 ++ vshll.u16 q1, d18, #8 ++ vmlsl.u16 q1, d18, d31 ++ ++ mov TMP1, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, asl #2 ++ vmlal.u16 q1, d19, d31 ++ vshr.u16 q15, q12, #8 ++ vshll.u16 q2, d20, #8 ++ vmlsl.u16 q2, d20, d30 ++ vmlal.u16 q2, d21, d30 ++ vshll.u16 q3, d22, #8 ++ vld1.32 {d20}, [TMP1], STRIDE ++ vmlsl.u16 q3, d22, d31 ++ vmlal.u16 q3, d23, d31 ++ vld1.32 {d21}, [TMP1] ++ vmull.u8 q8, d20, d28 ++ vmlal.u8 q8, d21, d29 ++ vshrn.u32 d0, q0, #16 ++ vshrn.u32 d1, q1, #16 ++ vshrn.u32 d4, q2, #16 ++ vld1.32 {d22}, [TMP2], STRIDE ++ vshrn.u32 d5, q3, #16 ++ vadd.u16 q12, q12, q13 ++ vld1.32 {d23}, [TMP2] ++ vmull.u8 q9, d22, d28 ++ mov TMP3, X, asr #16 ++ add X, X, UX ++ add TMP3, TOP, TMP3, asl #2 ++ mov TMP4, X, asr #16 ++ add X, X, UX ++ add TMP4, TOP, TMP4, asl #2 ++ vmlal.u8 q9, d23, d29 ++ vld1.32 {d22}, [TMP3], STRIDE ++ vshr.u16 q15, q12, #8 ++ vld1.32 {d23}, [TMP3] ++ vmull.u8 q10, d22, d28 ++ vmlal.u8 q10, d23, d29 ++ vmovn.u16 d8, q0 ++ vshll.u16 q0, d16, #8 ++ vmovn.u16 d9, q2 ++ vmlsl.u16 
q0, d16, d30 ++ vmlal.u16 q0, d17, d30 ++ pld [TMP4, PF_OFFS] ++ vld1.32 {d16}, [TMP4], STRIDE ++ vadd.u16 q12, q12, q13 ++ vld1.32 {d17}, [TMP4] ++ pld [TMP4, PF_OFFS] ++ vmull.u8 q11, d16, d28 ++ vmlal.u8 q11, d17, d29 ++ vshll.u16 q1, d18, #8 ++ vmlsl.u16 q1, d18, d31 ++.endm ++ ++.macro bilinear_interpolate_eight_pixels_8888_0565_tail ++ vmlal.u16 q1, d19, d31 ++ vshr.u16 q15, q12, #8 ++ vshll.u16 q2, d20, #8 ++ vmlsl.u16 q2, d20, d30 ++ vmlal.u16 q2, d21, d30 ++ vshll.u16 q3, d22, #8 ++ vmlsl.u16 q3, d22, d31 ++ vmlal.u16 q3, d23, d31 ++ vadd.u16 q12, q12, q13 ++ vshrn.u32 d0, q0, #16 ++ vshrn.u32 d1, q1, #16 ++ vshrn.u32 d4, q2, #16 ++ vshr.u16 q15, q12, #8 ++ vshrn.u32 d5, q3, #16 ++ vmovn.u16 d10, q0 ++ vmovn.u16 d11, q2 ++ vadd.u16 q12, q12, q13 ++ ++ vuzp.u8 d8, d9 ++ vuzp.u8 d10, d11 ++ vuzp.u8 d9, d11 ++ vuzp.u8 d8, d10 ++ vshll.u8 q6, d9, #8 ++ vshll.u8 q5, d10, #8 ++ vshll.u8 q7, d8, #8 ++ vsri.u16 q5, q6, #5 ++ vsri.u16 q5, q7, #11 ++ vst1.32 {d10, d11}, [OUT, :128]! ++.endm ++ ++.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head ++ mov TMP1, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, asl #2 ++ vmlal.u16 q1, d19, d31 ++ vshr.u16 q15, q12, #8 ++ vuzp.u8 d8, d9 ++ vshll.u16 q2, d20, #8 ++ vmlsl.u16 q2, d20, d30 ++ vmlal.u16 q2, d21, d30 ++ vshll.u16 q3, d22, #8 ++ vld1.32 {d20}, [TMP1], STRIDE ++ vmlsl.u16 q3, d22, d31 ++ vmlal.u16 q3, d23, d31 ++ vld1.32 {d21}, [TMP1] ++ vmull.u8 q8, d20, d28 ++ vmlal.u8 q8, d21, d29 ++ vshrn.u32 d0, q0, #16 ++ vshrn.u32 d1, q1, #16 ++ vshrn.u32 d4, q2, #16 ++ vld1.32 {d22}, [TMP2], STRIDE ++ vshrn.u32 d5, q3, #16 ++ vadd.u16 q12, q12, q13 ++ vld1.32 {d23}, [TMP2] ++ vmull.u8 q9, d22, d28 ++ mov TMP3, X, asr #16 ++ add X, X, UX ++ add TMP3, TOP, TMP3, asl #2 ++ mov TMP4, X, asr #16 ++ add X, X, UX ++ add TMP4, TOP, TMP4, asl #2 ++ vmlal.u8 q9, d23, d29 ++ vld1.32 {d22}, [TMP3], STRIDE ++ vshr.u16 q15, q12, #8 ++ vld1.32 {d23}, 
[TMP3] ++ vmull.u8 q10, d22, d28 ++ vmlal.u8 q10, d23, d29 ++ vmovn.u16 d10, q0 ++ vshll.u16 q0, d16, #8 ++ vmovn.u16 d11, q2 ++ vmlsl.u16 q0, d16, d30 ++ vmlal.u16 q0, d17, d30 ++ pld [TMP4, PF_OFFS] ++ vld1.32 {d16}, [TMP4], STRIDE ++ vadd.u16 q12, q12, q13 ++ vld1.32 {d17}, [TMP4] ++ pld [TMP4, PF_OFFS] ++ vmull.u8 q11, d16, d28 ++ vmlal.u8 q11, d17, d29 ++ vuzp.u8 d10, d11 ++ vshll.u16 q1, d18, #8 ++ vmlsl.u16 q1, d18, d31 ++ ++ mov TMP1, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, asl #2 ++ vmlal.u16 q1, d19, d31 ++ vuzp.u8 d9, d11 ++ vshr.u16 q15, q12, #8 ++ vshll.u16 q2, d20, #8 ++ vuzp.u8 d8, d10 ++ vmlsl.u16 q2, d20, d30 ++ vmlal.u16 q2, d21, d30 ++ vshll.u16 q3, d22, #8 ++ vld1.32 {d20}, [TMP1], STRIDE ++ vmlsl.u16 q3, d22, d31 ++ vmlal.u16 q3, d23, d31 ++ vld1.32 {d21}, [TMP1] ++ vmull.u8 q8, d20, d28 ++ vmlal.u8 q8, d21, d29 ++ vshll.u8 q6, d9, #8 ++ vshll.u8 q5, d10, #8 ++ vshll.u8 q7, d8, #8 ++ vshrn.u32 d0, q0, #16 ++ vsri.u16 q5, q6, #5 ++ vshrn.u32 d1, q1, #16 ++ vsri.u16 q5, q7, #11 ++ vshrn.u32 d4, q2, #16 ++ vld1.32 {d22}, [TMP2], STRIDE ++ vshrn.u32 d5, q3, #16 ++ vadd.u16 q12, q12, q13 ++ vld1.32 {d23}, [TMP2] ++ vmull.u8 q9, d22, d28 ++ mov TMP3, X, asr #16 ++ add X, X, UX ++ add TMP3, TOP, TMP3, asl #2 ++ mov TMP4, X, asr #16 ++ add X, X, UX ++ add TMP4, TOP, TMP4, asl #2 ++ vmlal.u8 q9, d23, d29 ++ vld1.32 {d22}, [TMP3], STRIDE ++ vshr.u16 q15, q12, #8 ++ vld1.32 {d23}, [TMP3] ++ vmull.u8 q10, d22, d28 ++ vmlal.u8 q10, d23, d29 ++ vmovn.u16 d8, q0 ++ vshll.u16 q0, d16, #8 ++ vmovn.u16 d9, q2 ++ vmlsl.u16 q0, d16, d30 ++ vmlal.u16 q0, d17, d30 ++ pld [TMP4, PF_OFFS] ++ vld1.32 {d16}, [TMP4], STRIDE ++ vadd.u16 q12, q12, q13 ++ vld1.32 {d17}, [TMP4] ++ pld [TMP4, PF_OFFS] ++ vmull.u8 q11, d16, d28 ++ vmlal.u8 q11, d17, d29 ++ vshll.u16 q1, d18, #8 ++ vst1.32 {d10, d11}, [OUT, :128]! 
++ vmlsl.u16 q1, d18, d31 ++.endm ++/*****************************************************************************/ ++ + generate_bilinear_scanline_func \ + pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ + 2, 2, 28, BILINEAR_FLAG_UNROLL_4 + + generate_bilinear_scanline_func \ + pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \ +- 2, 1, 28, BILINEAR_FLAG_UNROLL_4 ++ 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS + + generate_bilinear_scanline_func \ + pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \ +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0037-Generic-C-implementation-of-pixman_blt-with-overlapp.patch b/recipes/xorg-lib/pixman-0.21.6/0037-Generic-C-implementation-of-pixman_blt-with-overlapp.patch new file mode 100644 index 0000000000..e03823b185 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0037-Generic-C-implementation-of-pixman_blt-with-overlapp.patch @@ -0,0 +1,114 @@ +From ab52f97fa306f73b51f797a33614280d31ccb978 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Tue, 16 Mar 2010 16:55:28 +0100 +Subject: [PATCH 37/40] Generic C implementation of pixman_blt with overlapping support + +Uses memcpy/memmove functions to copy pixels, can handle the +case when both source and destination areas are in the same +image (this is useful for scrolling). + +It is assumed that copying direction is only important when +using the same image for both source and destination (and +src_stride == dst_stride). Copying direction is undefined +for the images with different source and destination stride +which happen to be in the overlapped areas (but this is an +unrealistic case anyway). 
+--- + pixman/pixman-general.c | 21 ++++++++++++++++++--- + pixman/pixman-private.h | 43 +++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 61 insertions(+), 3 deletions(-) + +diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c +index 727affc..fa448f7 100644 +--- a/pixman/pixman-general.c ++++ b/pixman/pixman-general.c +@@ -238,9 +238,24 @@ general_blt (pixman_implementation_t *imp, + int width, + int height) + { +- /* We can't blit unless we have sse2 or mmx */ +- +- return FALSE; ++ uint8_t *dst_bytes = (uint8_t *)dst_bits; ++ uint8_t *src_bytes = (uint8_t *)src_bits; ++ int bpp; ++ ++ if (src_bpp != dst_bpp || src_bpp & 7) ++ return FALSE; ++ ++ bpp = src_bpp >> 3; ++ width *= bpp; ++ src_stride *= 4; ++ dst_stride *= 4; ++ pixman_blt_helper (src_bytes + src_y * src_stride + src_x * bpp, ++ dst_bytes + dst_y * dst_stride + dst_x * bpp, ++ src_stride, ++ dst_stride, ++ width, ++ height); ++ return TRUE; + } + + static pixman_bool_t +diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h +index 60060a9..5369ad9 100644 +--- a/pixman/pixman-private.h ++++ b/pixman/pixman-private.h +@@ -10,6 +10,7 @@ + + #include "pixman.h" + #include <time.h> ++#include <string.h> + #include <assert.h> + #include <stdio.h> + #include <string.h> +@@ -899,4 +900,46 @@ void pixman_timer_register (pixman_timer_t *timer); + + #endif /* PIXMAN_TIMERS */ + ++/* a helper function, can blit 8-bit images with src/dst overlapping support */ ++static inline void ++pixman_blt_helper (uint8_t *src_bytes, ++ uint8_t *dst_bytes, ++ int src_stride, ++ int dst_stride, ++ int width, ++ int height) ++{ ++ /* ++ * The second part of this check is not strictly needed, but it prevents ++ * unnecessary upside-down processing of areas which belong to different ++ * images. Upside-down processing can be slower with fixed-distance-ahead ++ * prefetch and perceived as having more tearing. 
++ */ ++ if (src_bytes < dst_bytes + width && ++ src_bytes + src_stride * height > dst_bytes) ++ { ++ src_bytes += src_stride * height - src_stride; ++ dst_bytes += dst_stride * height - dst_stride; ++ dst_stride = -dst_stride; ++ src_stride = -src_stride; ++ /* Horizontal scrolling to the left needs memmove */ ++ if (src_bytes + width > dst_bytes) ++ { ++ while (--height >= 0) ++ { ++ memmove (dst_bytes, src_bytes, width); ++ dst_bytes += dst_stride; ++ src_bytes += src_stride; ++ } ++ return; ++ } ++ } ++ while (--height >= 0) ++ { ++ memcpy (dst_bytes, src_bytes, width); ++ dst_bytes += dst_stride; ++ src_bytes += src_stride; ++ } ++} ++ + #endif /* PIXMAN_PRIVATE_H */ +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0038-Support-of-overlapping-src-dst-for-pixman_blt_mmx.patch b/recipes/xorg-lib/pixman-0.21.6/0038-Support-of-overlapping-src-dst-for-pixman_blt_mmx.patch new file mode 100644 index 0000000000..7c0f7ad5bd --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0038-Support-of-overlapping-src-dst-for-pixman_blt_mmx.patch @@ -0,0 +1,91 @@ +From 2cde9110695c2b595eaf885eee40b118286652f9 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Thu, 22 Oct 2009 05:45:47 +0300 +Subject: [PATCH 38/40] Support of overlapping src/dst for pixman_blt_mmx + +--- + pixman/pixman-mmx.c | 55 +++++++++++++++++++++++++++++--------------------- + 1 files changed, 32 insertions(+), 23 deletions(-) + +diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c +index 0272347..5bcbd0e 100644 +--- a/pixman/pixman-mmx.c ++++ b/pixman/pixman-mmx.c +@@ -2996,34 +2996,43 @@ pixman_blt_mmx (uint32_t *src_bits, + { + uint8_t * src_bytes; + uint8_t * dst_bytes; +- int byte_width; ++ int bpp; + +- if (src_bpp != dst_bpp) ++ if (src_bpp != dst_bpp || src_bpp & 7) + return FALSE; + +- if (src_bpp == 16) +- { +- src_stride = src_stride * (int) sizeof (uint32_t) / 2; +- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; +- src_bytes = (uint8_t 
*)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); +- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); +- byte_width = 2 * width; +- src_stride *= 2; +- dst_stride *= 2; +- } +- else if (src_bpp == 32) ++ bpp = src_bpp >> 3; ++ width *= bpp; ++ src_stride *= 4; ++ dst_stride *= 4; ++ src_bytes = (uint8_t *)src_bits + src_y * src_stride + src_x * bpp; ++ dst_bytes = (uint8_t *)dst_bits + dst_y * dst_stride + dst_x * bpp; ++ ++ if (src_bpp != 16 && src_bpp != 32) + { +- src_stride = src_stride * (int) sizeof (uint32_t) / 4; +- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; +- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); +- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); +- byte_width = 4 * width; +- src_stride *= 4; +- dst_stride *= 4; ++ pixman_blt_helper (src_bytes, dst_bytes, src_stride, dst_stride, ++ width, height); ++ return TRUE; + } +- else ++ ++ if (src_bytes < dst_bytes && src_bytes + src_stride * height > dst_bytes) + { +- return FALSE; ++ src_bytes += src_stride * height - src_stride; ++ dst_bytes += dst_stride * height - dst_stride; ++ dst_stride = -dst_stride; ++ src_stride = -src_stride; ++ ++ if (src_bytes + width > dst_bytes) ++ { ++ /* TODO: reverse scanline copy using MMX */ ++ while (--height >= 0) ++ { ++ memmove (dst_bytes, src_bytes, width); ++ dst_bytes += dst_stride; ++ src_bytes += src_stride; ++ } ++ return TRUE; ++ } + } + + while (height--) +@@ -3033,7 +3042,7 @@ pixman_blt_mmx (uint32_t *src_bits, + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; +- w = byte_width; ++ w = width; + + while (w >= 2 && ((unsigned long)d & 3)) + { +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0039-Support-of-overlapping-src-dst-for-pixman_blt_sse2.patch b/recipes/xorg-lib/pixman-0.21.6/0039-Support-of-overlapping-src-dst-for-pixman_blt_sse2.patch new file mode 100644 index 0000000000..8e89ffeabb 
--- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0039-Support-of-overlapping-src-dst-for-pixman_blt_sse2.patch @@ -0,0 +1,91 @@ +From b4064e256d293d32035494a6afff1bc9456b84e1 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Thu, 22 Oct 2009 05:45:54 +0300 +Subject: [PATCH 39/40] Support of overlapping src/dst for pixman_blt_sse2 + +--- + pixman/pixman-sse2.c | 55 +++++++++++++++++++++++++++++-------------------- + 1 files changed, 32 insertions(+), 23 deletions(-) + +diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c +index 533b858..9fa7191 100644 +--- a/pixman/pixman-sse2.c ++++ b/pixman/pixman-sse2.c +@@ -4691,34 +4691,43 @@ pixman_blt_sse2 (uint32_t *src_bits, + { + uint8_t * src_bytes; + uint8_t * dst_bytes; +- int byte_width; ++ int bpp; + +- if (src_bpp != dst_bpp) ++ if (src_bpp != dst_bpp || src_bpp & 7) + return FALSE; + +- if (src_bpp == 16) +- { +- src_stride = src_stride * (int) sizeof (uint32_t) / 2; +- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; +- src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); +- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); +- byte_width = 2 * width; +- src_stride *= 2; +- dst_stride *= 2; +- } +- else if (src_bpp == 32) ++ bpp = src_bpp >> 3; ++ width *= bpp; ++ src_stride *= 4; ++ dst_stride *= 4; ++ src_bytes = (uint8_t *)src_bits + src_y * src_stride + src_x * bpp; ++ dst_bytes = (uint8_t *)dst_bits + dst_y * dst_stride + dst_x * bpp; ++ ++ if (src_bpp != 16 && src_bpp != 32) + { +- src_stride = src_stride * (int) sizeof (uint32_t) / 4; +- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; +- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); +- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); +- byte_width = 4 * width; +- src_stride *= 4; +- dst_stride *= 4; ++ pixman_blt_helper (src_bytes, dst_bytes, src_stride, dst_stride, ++ width, 
height); ++ return TRUE; + } +- else ++ ++ if (src_bytes < dst_bytes && src_bytes + src_stride * height > dst_bytes) + { +- return FALSE; ++ src_bytes += src_stride * height - src_stride; ++ dst_bytes += dst_stride * height - dst_stride; ++ dst_stride = -dst_stride; ++ src_stride = -src_stride; ++ ++ if (src_bytes + width > dst_bytes) ++ { ++ /* TODO: reverse scanline copy using SSE2 */ ++ while (--height >= 0) ++ { ++ memmove (dst_bytes, src_bytes, width); ++ dst_bytes += dst_stride; ++ src_bytes += src_stride; ++ } ++ return TRUE; ++ } + } + + while (height--) +@@ -4728,7 +4737,7 @@ pixman_blt_sse2 (uint32_t *src_bits, + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; +- w = byte_width; ++ w = width; + + while (w >= 2 && ((unsigned long)d & 3)) + { +-- +1.6.6.1 + diff --git a/recipes/xorg-lib/pixman-0.21.6/0040-Support-of-overlapping-src-dst-for-pixman_blt_neon.patch b/recipes/xorg-lib/pixman-0.21.6/0040-Support-of-overlapping-src-dst-for-pixman_blt_neon.patch new file mode 100644 index 0000000000..38aeadb2dc --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0040-Support-of-overlapping-src-dst-for-pixman_blt_neon.patch @@ -0,0 +1,94 @@ +From ed32d593a0e8aa56f8a27f976f188d14a79343a0 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 18 Nov 2009 06:08:48 +0200 +Subject: [PATCH 40/40] Support of overlapping src/dst for pixman_blt_neon + +--- + pixman/pixman-arm-neon.c | 62 +++++++++++++++++++++++++++++++++++++-------- + 1 files changed, 51 insertions(+), 11 deletions(-) + +diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c +index 0a10ca1..f015eee 100644 +--- a/pixman/pixman-arm-neon.c ++++ b/pixman/pixman-arm-neon.c +@@ -215,26 +215,66 @@ pixman_blt_neon (uint32_t *src_bits, + int width, + int height) + { +- if (src_bpp != dst_bpp) ++ uint8_t * src_bytes; ++ uint8_t * dst_bytes; ++ int bpp; ++ ++ if (src_bpp != dst_bpp || src_bpp & 7) + return FALSE; + ++ bpp = src_bpp >> 3; ++ 
width *= bpp; ++ src_stride *= 4; ++ dst_stride *= 4; ++ src_bytes = (uint8_t *)src_bits + src_y * src_stride + src_x * bpp; ++ dst_bytes = (uint8_t *)dst_bits + dst_y * dst_stride + dst_x * bpp; ++ ++ if (src_bpp != 16 && src_bpp != 32) ++ { ++ pixman_blt_helper (src_bytes, dst_bytes, src_stride, dst_stride, ++ width, height); ++ return TRUE; ++ } ++ ++ if (src_bytes < dst_bytes && src_bytes + src_stride * height > dst_bytes) ++ { ++ src_bytes += src_stride * height - src_stride; ++ dst_bytes += dst_stride * height - dst_stride; ++ dst_stride = -dst_stride; ++ src_stride = -src_stride; ++ ++ if (src_bytes + width > dst_bytes) ++ { ++ /* TODO: reverse scanline copy using NEON */ ++ while (--height >= 0) ++ { ++ memmove (dst_bytes, src_bytes, width); ++ dst_bytes += dst_stride; ++ src_bytes += src_stride; ++ } ++ return TRUE; ++ } ++ } ++ + switch (src_bpp) + { + case 16: + pixman_composite_src_0565_0565_asm_neon ( +- width, height, +- (uint16_t *)(((char *) dst_bits) + +- dst_y * dst_stride * 4 + dst_x * 2), dst_stride * 2, +- (uint16_t *)(((char *) src_bits) + +- src_y * src_stride * 4 + src_x * 2), src_stride * 2); ++ width >> 1, ++ height, ++ (uint16_t *) dst_bytes, ++ dst_stride >> 1, ++ (uint16_t *) src_bytes, ++ src_stride >> 1); + return TRUE; + case 32: + pixman_composite_src_8888_8888_asm_neon ( +- width, height, +- (uint32_t *)(((char *) dst_bits) + +- dst_y * dst_stride * 4 + dst_x * 4), dst_stride, +- (uint32_t *)(((char *) src_bits) + +- src_y * src_stride * 4 + src_x * 4), src_stride); ++ width >> 2, ++ height, ++ (uint32_t *) dst_bytes, ++ dst_stride >> 2, ++ (uint32_t *) src_bytes, ++ src_stride >> 2); + return TRUE; + default: + return FALSE; +-- +1.6.6.1 + |