Diffstat (limited to 'recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch')
-rw-r--r--  recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch  324
1 file changed, 324 insertions, 0 deletions
diff --git a/recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch b/recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch
new file mode 100644
index 0000000000..c544225f65
--- /dev/null
+++ b/recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch
@@ -0,0 +1,324 @@
+From 6494f9ae8820078d0e6109bf8f294156f7a5da4c Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Fri, 05 Mar 2010 00:40:34 +0000
+Subject: ARM: added 'armv6_composite_src_8888_0565' fast path
+
+Provides a ~3x performance improvement when working with
+data in the L1 cache, and a ~80% performance improvement when
+working with memory. This fast path is important for 32bpp -> 16bpp
+color format conversion and is commonly used with a 16bpp desktop.
+
+Microbenchmark from N800 (ARM11 @ 400MHz), measured in MPix/s:
+
+before:
+
+ src_8888_0565 = L1: 21.54 M: 15.62
+
+after (armv4):
+
+ src_8888_0565 = L1: 45.26 M: 23.29
+
+after (armv6):
+
+ src_8888_0565 = L1: 60.62 M: 28.37
+---
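As a reader's aid (an annotation, not part of the patch): a minimal plain-C model of the conversion these fast paths perform. The 0x1F001F mask handles red and blue with a single AND, because after a 3-bit right shift the top five bits of each channel occupy two disjoint 5-bit fields of the word:

#include <stdint.h>

/* Reference model of the x8r8g8b8 -> r5g6b5 conversion used below. */
static inline uint16_t convert_8888_to_0565 (uint32_t x)
{
    uint32_t a;

    a  = (x >> 3) & 0x1F001F; /* red5 at bits 16-20, blue5 at bits 0-4 */
    x &= 0xFC00;              /* green6 at bits 10-15 */
    a |= a >> 5;              /* fold red5 down to bits 11-15 */
    a |= x >> 5;              /* fold green6 down to bits 5-10 */

    return (uint16_t) a;      /* rrrrrggggggbbbbb */
}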
+diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
+index c375c01..69243c1 100644
+--- a/pixman/pixman-arm-simd.c
++++ b/pixman/pixman-arm-simd.c
+@@ -604,6 +604,282 @@ armv6_composite_over_n_8_0565 (pixman_implementation_t * impl,
+ dst_stride - width, mask_stride - width, height);
+ }
+
++static inline void
++armv4_composite_src_8888_0565_asm (
++ uint16_t *dst, uint32_t *src, int w, int dst_stride,
++ int src_stride, int h)
++{
++ uint32_t a, x, y, c1F001F = 0x1F001F, cFFFF = 0xFFFF;
++ int backup_w = w;
++ while (h--)
++ {
++ w = backup_w;
++ if (w > 0 && (uintptr_t)dst & 2)
++ {
++ x = *src++;
++
++ a = (x >> 3) & c1F001F;
++ x &= 0xFC00;
++ a |= a >> 5;
++ a |= x >> 5;
++
++ *dst++ = a;
++ w--;
++ }
++
++ asm volatile(
++ "subs %[w], %[w], #2\n"
++ "blt 2f\n"
++ "1:\n"
++ "ldr %[x], [%[src]], #4\n"
++ "ldr %[y], [%[src]], #4\n"
++ "subs %[w], %[w], #2\n"
++
++ "and %[a], %[c1F001F], %[x], lsr #3\n"
++ "and %[x], %[x], #0xFC00\n\n"
++ "orr %[a], %[a], %[a], lsr #5\n"
++ "orr %[x], %[a], %[x], lsr #5\n"
++
++ "and %[a], %[c1F001F], %[y], lsr #3\n"
++ "and %[y], %[y], #0xFC00\n\n"
++ "orr %[a], %[a], %[a], lsr #5\n"
++ "orr %[y], %[a], %[y], lsr #5\n"
++ /*
++ * Writing a single 32-bit value is much faster than two
++ * separate 16-bit stores on older CPUs without (efficient)
++ * write combining, even though it costs an extra instruction.
++ */
++ "and %[x], %[x], %[cFFFF]\n"
++ "orr %[x], %[x], %[y], lsl #16\n"
++ "str %[x], [%[dst]], #4\n"
++ "bge 1b\n"
++ "2:\n"
++ : [c1F001F] "+&r" (c1F001F), [cFFFF] "+&r" (cFFFF),
++ [src] "+&r" (src), [dst] "+&r" (dst), [a] "=&r" (a),
++ [x] "=&r" (x), [y] "=&r" (y), [w] "+&r" (w)
++ );
++
++ if (w & 1)
++ {
++ x = *src++;
++
++ a = (x >> 3) & c1F001F;
++ x = x & 0xFC00;
++ a |= a >> 5;
++ a |= x >> 5;
++
++ *dst++ = a;
++ }
++
++ src += src_stride - backup_w;
++ dst += dst_stride - backup_w;
++ }
++}
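A rough C equivalent of the paired store in the asm loop above (a sketch only: it assumes a 4-byte-aligned, little-endian destination and ignores the register constraints):

#include <stdint.h>
#include <string.h>

/* Combine two r5g6b5 pixels and issue them as one 32-bit store. */
static inline void store_0565_pair (uint16_t *dst, uint32_t px0, uint32_t px1)
{
    uint32_t pair = (px0 & 0xFFFF) | (px1 << 16);
    memcpy (dst, &pair, sizeof pair); /* memcpy sidesteps strict aliasing */
}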
++
++/*
++ * Conversion x8r8g8b8 -> r5g6b5
++ *
++ * Note: 'w' must be >= 7 here; the destination alignment code
++ * below may consume up to 1 + 2 + 4 = 7 pixels before the inner loop.
++ */
++static void __attribute__((naked))
++armv6_composite_src_8888_0565_asm (
++ uint16_t *dst, uint32_t *src, int w, int dst_stride,
++ int src_stride, int h)
++{
++ asm volatile(
++ /* define supplementary macros */
++ ".macro cvt8888to565 PIX\n"
++ "and A, C1F001F, \\PIX, lsr #3\n"
++ "and \\PIX, \\PIX, #0xFC00\n\n"
++ "orr A, A, A, lsr #5\n"
++ "orr \\PIX, A, \\PIX, lsr #5\n"
++ ".endm\n"
++
++ ".macro combine_pixels_pair PIX1, PIX2\n"
++ /* Note: assumes little-endian byte order */
++ "pkhbt \\PIX1, \\PIX1, \\PIX2, lsl #16\n"
++ ".endm\n"
++
++ /* function entry, save all registers (10 words) to stack */
++ "stmdb sp!, {r4-r11, ip, lr}\n"
++
++ /* define some aliases */
++ "DST .req r0\n"
++ "SRC .req r1\n"
++ "W .req r2\n"
++ "H .req r3\n"
++
++ "TMP1 .req r4\n"
++ "TMP2 .req r5\n"
++ "TMP3 .req r6\n"
++ "TMP4 .req r7\n"
++ "TMP5 .req r8\n"
++ "TMP6 .req r9\n"
++ "TMP7 .req r10\n"
++ "TMP8 .req r11\n"
++
++ "C1F001F .req ip\n"
++ "A .req lr\n"
++
++ "ldr TMP1, [sp, #(10*4+0)]\n" /* load src_stride */
++ "ldr C1F001F, =0x1F001F\n"
++ "sub r3, r3, W\n"
++ "str r3, [sp, #(10*4+0)]\n" /* store (dst_stride-w) */
++ "ldr r3, [sp, #(10*4+4)]\n" /* load h */
++ "sub TMP1, TMP1, W\n"
++ "str TMP1, [sp, #(10*4+4)]\n" /* store (src_stride-w) */
++
++ "str W, [sp, #(8*4)]\n" /* saved ip = W */
++
++ "0:\n"
++ "subs H, H, #1\n"
++ "blt 6f\n"
++ "1:\n"
++ /* align DST at 4 byte boundary */
++ "tst DST, #2\n"
++ "beq 2f\n"
++ "ldr TMP1, [SRC], #4\n"
++ "sub W, W, #1\n"
++ "cvt8888to565 TMP1\n"
++ "strh TMP1, [DST], #2\n"
++ "2:"
++ /* align DST at 8 byte boundary */
++ "tst DST, #4\n"
++ "beq 2f\n"
++ "ldmia SRC!, {TMP1, TMP2}\n"
++ "sub W, W, #2\n"
++ "cvt8888to565 TMP1\n"
++ "cvt8888to565 TMP2\n"
++ "combine_pixels_pair TMP1, TMP2\n"
++ "str TMP1, [DST], #4\n"
++ "2:"
++ /* align DST at 16 byte boundary */
++ "tst DST, #8\n"
++ "beq 2f\n"
++ "ldmia SRC!, {TMP1, TMP2, TMP3, TMP4}\n"
++ "sub W, W, #4\n"
++ "cvt8888to565 TMP1\n"
++ "cvt8888to565 TMP2\n"
++ "cvt8888to565 TMP3\n"
++ "cvt8888to565 TMP4\n"
++ "combine_pixels_pair TMP1, TMP2\n"
++ "combine_pixels_pair TMP3, TMP4\n"
++ "stmia DST!, {TMP1, TMP3}\n"
++ "2:"
++ /* inner loop, process 8 pixels per iteration */
++ "subs W, W, #8\n"
++ "blt 4f\n"
++ "3:\n"
++ "ldmia SRC!, {TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8}\n"
++ "subs W, W, #8\n"
++ "cvt8888to565 TMP1\n"
++ "cvt8888to565 TMP2\n"
++ "cvt8888to565 TMP3\n"
++ "cvt8888to565 TMP4\n"
++ "cvt8888to565 TMP5\n"
++ "cvt8888to565 TMP6\n"
++ "cvt8888to565 TMP7\n"
++ "cvt8888to565 TMP8\n"
++ "combine_pixels_pair TMP1, TMP2\n"
++ "combine_pixels_pair TMP3, TMP4\n"
++ "combine_pixels_pair TMP5, TMP6\n"
++ "combine_pixels_pair TMP7, TMP8\n"
++ "stmia DST!, {TMP1, TMP3, TMP5, TMP7}\n"
++ "bge 3b\n"
++ "4:\n"
++
++ /* process the remaining pixels */
++ "tst W, #4\n"
++ "beq 4f\n"
++ "ldmia SRC!, {TMP1, TMP2, TMP3, TMP4}\n"
++ "cvt8888to565 TMP1\n"
++ "cvt8888to565 TMP2\n"
++ "cvt8888to565 TMP3\n"
++ "cvt8888to565 TMP4\n"
++ "combine_pixels_pair TMP1, TMP2\n"
++ "combine_pixels_pair TMP3, TMP4\n"
++ "stmia DST!, {TMP1, TMP3}\n"
++ "4:\n"
++ "tst W, #2\n"
++ "beq 4f\n"
++ "ldmia SRC!, {TMP1, TMP2}\n"
++ "cvt8888to565 TMP1\n"
++ "cvt8888to565 TMP2\n"
++ "combine_pixels_pair TMP1, TMP2\n"
++ "str TMP1, [DST], #4\n"
++ "4:\n"
++ "tst W, #1\n"
++ "beq 4f\n"
++ "ldr TMP1, [SRC], #4\n"
++ "cvt8888to565 TMP1\n"
++ "strh TMP1, [DST], #2\n"
++ "4:\n"
++ "ldr TMP1, [sp, #(10*4+0)]\n" /* (dst_stride-w) */
++ "ldr TMP2, [sp, #(10*4+4)]\n" /* (src_stride-w) */
++ "ldr W, [sp, #(8*4)]\n"
++ "subs H, H, #1\n"
++ "add DST, DST, TMP1, lsl #1\n"
++ "add SRC, SRC, TMP2, lsl #2\n"
++ "bge 1b\n"
++ "6:\n"
++ /* restore all registers and return */
++ "ldmia sp!, {r4-r11, ip, pc}\n"
++ ".ltorg\n"
++
++ ".unreq DST\n"
++ ".unreq SRC\n"
++ ".unreq W\n"
++ ".unreq H\n"
++
++ ".unreq TMP1\n"
++ ".unreq TMP2\n"
++ ".unreq TMP3\n"
++ ".unreq TMP4\n"
++ ".unreq TMP5\n"
++ ".unreq TMP6\n"
++ ".unreq TMP7\n"
++ ".unreq TMP8\n"
++
++ ".unreq C1F001F\n"
++ ".unreq A\n"
++
++ ".purgem cvt8888to565\n"
++ ".purgem combine_pixels_pair\n"
++ );
++}
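For readers tracing the stack offsets above: under the AAPCS the first four arguments arrive in r0-r3 and the fifth and sixth (src_stride, h) arrive on the stack, so after the stmdb pushes ten words they sit at sp+40 and sp+44. A sketch of the resulting layout (an annotation, not part of the patch):

/*
 * sp +  0 .. sp + 28 : saved r4-r11
 * sp + 32            : saved ip, reused to hold the original w
 * sp + 36            : saved lr
 * sp + 40            : incoming src_stride, overwritten with (dst_stride - w)
 * sp + 44            : incoming h, overwritten with (src_stride - w)
 */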
++
++static void
++armv6_composite_src_8888_0565 (pixman_implementation_t * impl,
++ pixman_op_t op,
++ pixman_image_t * src_image,
++ pixman_image_t * mask_image,
++ pixman_image_t * dst_image,
++ int32_t src_x,
++ int32_t src_y,
++ int32_t mask_x,
++ int32_t mask_y,
++ int32_t dest_x,
++ int32_t dest_y,
++ int32_t width,
++ int32_t height)
++{
++ uint32_t *src;
++ uint16_t *dst;
++ int src_stride, dst_stride;
++
++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
++ dst_stride, dst, 1);
++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t,
++ src_stride, src, 1);
++
++ if (width < 7)
++ armv4_composite_src_8888_0565_asm (dst, src, width,
++ dst_stride, src_stride, height);
++ else
++ armv6_composite_src_8888_0565_asm (dst, src, width,
++ dst_stride, src_stride, height);
++}
++
+ #endif
+
+ static const pixman_fast_path_t arm_simd_fast_paths[] =
+@@ -624,6 +900,10 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
+ #if defined(__ARM_EABI__) && defined(__linux__)
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, armv6_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, armv6_composite_over_n_8_0565),
++ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, armv6_composite_src_8888_0565),
++ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, armv6_composite_src_8888_0565),
++ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, armv6_composite_src_8888_0565),
++ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, armv6_composite_src_8888_0565),
+ #endif
+ { PIXMAN_OP_NONE },
+ };
+--
+cgit v0.8.3-6-g21f6
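To see these fast paths exercised end to end, a caller goes through the normal pixman API. A hypothetical usage sketch (the helper name and buffers are illustrative; the pixman calls themselves are the library's public API):

#include <pixman.h>

/* Convert an x8r8g8b8 buffer to r5g6b5 via a SRC composite; on ARM this
 * can dispatch to armv6_composite_src_8888_0565 from the patch above.
 * Note: pixman requires row strides to be a multiple of 4 bytes, so the
 * 16bpp stride (w * 2) assumes an even width here. */
static void convert_argb_to_rgb565 (uint32_t *argb, uint16_t *rgb565,
                                    int w, int h)
{
    pixman_image_t *src = pixman_image_create_bits (PIXMAN_x8r8g8b8, w, h,
                                                    argb, w * 4);
    pixman_image_t *dst = pixman_image_create_bits (PIXMAN_r5g6b5, w, h,
                                                    (uint32_t *) rgb565, w * 2);

    pixman_image_composite (PIXMAN_OP_SRC, src, NULL, dst,
                            0, 0, 0, 0, 0, 0, w, h);

    pixman_image_unref (src);
    pixman_image_unref (dst);
}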