1 files changed, 324 insertions, 0 deletions
diff --git a/recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch b/recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch
new file mode 100644
index 0000000000..c544225f65
--- /dev/null
+++ b/recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch
@@ -0,0 +1,324 @@
+From 6494f9ae8820078d0e6109bf8f294156f7a5da4c Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Fri, 05 Mar 2010 00:40:34 +0000
+Subject: ARM: added 'armv6_composite_src_8888_0565' fast path
+
+Provides ~3x performance improvement when working with
+data in L1 cache, and ~80% performance improvement when working
+with memory. This fast path is important for 32bpp -> 16bpp
+color format conversion and is commonly used with 16bpp desktop.
+
+Microbenchmark from N800 (ARM11 @ 400MHz), measured in MPix/s:
+
+before:
+
+ src_8888_0565 = L1:  21.54 M: 15.62
+
+after (armv4):
+
+ src_8888_0565 = L1:  45.26 M: 23.29
+
+after (armv6):
+
+ src_8888_0565 = L1:  60.62 M: 28.37
+---
+diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
+index c375c01..69243c1 100644
+--- a/pixman/pixman-arm-simd.c
++++ b/pixman/pixman-arm-simd.c
+@@ -604,6 +604,282 @@ armv6_composite_over_n_8_0565 (pixman_implementation_t * impl,
+ 	dst_stride - width, mask_stride - width, height);
+ }
+ 
++static inline void
++armv4_composite_src_8888_0565_asm (
++    uint16_t *dst, uint32_t *src, int w, int dst_stride,
++    int src_stride, int h)
++{   /* converts 'h' rows of 'w' 32-bit pixels to 16-bit 565; strides are counted in pixels */
++    uint32_t a, x, y, c1F001F = 0x1F001F, cFFFF = 0xFFFF;
++    int backup_w = w; /* 'w' is consumed per row, so keep the row width */
++    while (h--)
++    {
++        w = backup_w;
++        if (w > 0 && (uintptr_t)dst & 2) /* peel one pixel so the 32-bit stores below hit a 4-byte aligned dst */
++        {
++            x = *src++;
++
++            a = (x >> 3) & c1F001F; /* 5 MSBs of the channels in bits 16-23 and 0-7 */
++            x &= 0xFC00;            /* 6 MSBs of the channel in bits 8-15 */
++            a |= a >> 5;            /* move the upper channel down to bits 11-15 */
++            a |= x >> 5;            /* middle channel into bits 5-10 */
++
++            *dst++ = a; /* the store to uint16_t drops the leftover bits above bit 15 */
++            w--;
++        }
++
++        asm volatile(
++            "subs  %[w], %[w], #2\n"
++            "blt   2f\n"
++        "1:\n"
++            "ldr   %[x], [%[src]], #4\n"
++            "ldr   %[y], [%[src]], #4\n"
++            "subs  %[w], %[w], #2\n"
++
++            "and   %[a], %[c1F001F], %[x], lsr #3\n"
++            "and   %[x], %[x], #0xFC00\n\n"
++            "orr   %[a], %[a], %[a], lsr #5\n"
++            "orr   %[x], %[a], %[x], lsr #5\n"
++
++            "and   %[a], %[c1F001F], %[y], lsr #3\n"
++            "and   %[y], %[y], #0xFC00\n\n"
++            "orr   %[a], %[a], %[a], lsr #5\n"
++            "orr   %[y], %[a], %[y], lsr #5\n"
++            /*
++             * Writing single 32-bit value is much faster than two
++             * separate 16-bit values for older CPUs without (efficient)
++             * write combining, even though it costs an extra instruction.
++             */
++            "and   %[x], %[x], %[cFFFF]\n"
++            "orr   %[x], %[x], %[y], lsl #16\n"
++            "str   %[x], [%[dst]], #4\n"
++            "bge   1b\n"
++        "2:\n"
++        : [c1F001F] "+&r" (c1F001F), [cFFFF] "+&r" (cFFFF),
++          [src] "+&r" (src), [dst] "+&r" (dst), [a] "=&r" (a),
++          [x] "=&r" (x), [y] "=&r" (y), [w] "+&r" (w)
++        : : "cc", "memory"); /* the asm sets the flags (subs) and loads/stores through src/dst */
++
++        if (w & 1) /* w is negative after the pair loop, but bit 0 still tells whether one pixel is left */
++        {
++            x = *src++;
++
++            a = (x >> 3) & c1F001F;
++            x = x & 0xFC00;
++            a |= a >> 5;
++            a |= x >> 5;
++
++            *dst++ = a;
++        }
++
++        src += src_stride - backup_w; /* both pointers advanced by backup_w pixels; step to the next row */
++        dst += dst_stride - backup_w;
++    }
++}
++
++/*
++ * Conversion x8r8g8b8 -> r5g6b5 for 'h' rows of 'w' pixels (strides are scaled as pixel counts).
++ *
++ * Note: 'w' must be >= 7 here: the DST alignment steps may consume 1+2+4 pixels without checking 'w'.
++ */
++static void __attribute__((naked))
++armv6_composite_src_8888_0565_asm (
++    uint16_t *dst, uint32_t *src, int w, int dst_stride,
++    int src_stride, int h)
++{
++    asm volatile(
++        /* define supplementary macros */
++        ".macro cvt8888to565 PIX\n" /* converts the pixel in register PIX in place; uses A as scratch */
++            "and   A, C1F001F, \\PIX, lsr #3\n"
++            "and   \\PIX, \\PIX, #0xFC00\n\n"
++            "orr   A, A, A, lsr #5\n"
++            "orr   \\PIX, A, \\PIX, lsr #5\n"
++        ".endm\n"
++
++        ".macro combine_pixels_pair PIX1, PIX2\n" /* packs two converted pixels into PIX1: PIX1 low half, PIX2 high half */
++            /* Note: assume little endian byte order */
++            "pkhbt \\PIX1, \\PIX1, \\PIX2, lsl #16\n"
++        ".endm\n"
++
++        /* function entry, save all registers (10 words) to stack */
++        "stmdb   sp!, {r4-r11, ip, lr}\n"
++
++        /* define some aliases */
++        "DST     .req  r0\n"
++        "SRC     .req  r1\n"
++        "W       .req  r2\n"
++        "H       .req  r3\n" /* r3 is reused: it is used as dst_stride first, then reloaded with h below */
++
++        "TMP1    .req  r4\n"
++        "TMP2    .req  r5\n"
++        "TMP3    .req  r6\n"
++        "TMP4    .req  r7\n"
++        "TMP5    .req  r8\n"
++        "TMP6    .req  r9\n"
++        "TMP7    .req  r10\n"
++        "TMP8    .req  r11\n"
++
++        "C1F001F .req  ip\n"
++        "A       .req  lr\n"
++
++        "ldr     TMP1, [sp, #(10*4+0)]\n" /* load src_stride (first word above the 10 saved registers) */
++        "ldr     C1F001F, =0x1F001F\n"
++        "sub     r3, r3, W\n"
++        "str     r3, [sp, #(10*4+0)]\n" /* store (dst_stride-w), overwriting the src_stride slot */
++        "ldr     r3, [sp, #(10*4+4)]\n" /* load h */
++        "sub     TMP1, TMP1, W\n"
++        "str     TMP1, [sp, #(10*4+4)]\n" /* store (src_stride-w), overwriting the h slot */
++
++        "str     W, [sp, #(8*4)]\n" /* keep the row width in the stack slot of the saved ip; reloaded for every row */
++
++    "0:\n"
++        "subs    H, H, #1\n"
++        "blt     6f\n" /* nothing to do when h <= 0 */
++    "1:\n"
++        /* align DST at 4 byte boundary */
++        "tst     DST, #2\n"
++        "beq     2f\n"
++        "ldr     TMP1, [SRC], #4\n"
++        "sub     W, W, #1\n"
++        "cvt8888to565 TMP1\n"
++        "strh    TMP1, [DST], #2\n"
++    "2:"
++        /* align DST at 8 byte boundary */
++        "tst     DST, #4\n"
++        "beq     2f\n"
++        "ldmia   SRC!, {TMP1, TMP2}\n"
++        "sub     W, W, #2\n"
++        "cvt8888to565 TMP1\n"
++        "cvt8888to565 TMP2\n"
++        "combine_pixels_pair TMP1, TMP2\n"
++        "str     TMP1, [DST], #4\n"
++    "2:"
++        /* align DST at 16 byte boundary */
++        "tst     DST, #8\n"
++        "beq     2f\n"
++        "ldmia   SRC!, {TMP1, TMP2, TMP3, TMP4}\n"
++        "sub     W, W, #4\n"
++        "cvt8888to565 TMP1\n"
++        "cvt8888to565 TMP2\n"
++        "cvt8888to565 TMP3\n"
++        "cvt8888to565 TMP4\n"
++        "combine_pixels_pair TMP1, TMP2\n"
++        "combine_pixels_pair TMP3, TMP4\n"
++        "stmia DST!, {TMP1, TMP3}\n"
++    "2:"
++        /* inner loop, process 8 pixels per iteration */
++        "subs    W, W, #8\n"
++        "blt     4f\n"
++    "3:\n"
++        "ldmia   SRC!, {TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8}\n"
++        "subs    W, W, #8\n" /* sets the flags tested by 'bge 3b'; the instructions in between leave them untouched */
++        "cvt8888to565 TMP1\n"
++        "cvt8888to565 TMP2\n"
++        "cvt8888to565 TMP3\n"
++        "cvt8888to565 TMP4\n"
++        "cvt8888to565 TMP5\n"
++        "cvt8888to565 TMP6\n"
++        "cvt8888to565 TMP7\n"
++        "cvt8888to565 TMP8\n"
++        "combine_pixels_pair TMP1, TMP2\n"
++        "combine_pixels_pair TMP3, TMP4\n"
++        "combine_pixels_pair TMP5, TMP6\n"
++        "combine_pixels_pair TMP7, TMP8\n"
++        "stmia   DST!, {TMP1, TMP3, TMP5, TMP7}\n"
++        "bge     3b\n"
++    "4:\n"
++
++        /* process the remaining pixels: W is (remaining - 8) here, so its low 3 bits still equal the remaining count */
++        "tst     W, #4\n"
++        "beq     4f\n"
++        "ldmia   SRC!, {TMP1, TMP2, TMP3, TMP4}\n"
++        "cvt8888to565 TMP1\n"
++        "cvt8888to565 TMP2\n"
++        "cvt8888to565 TMP3\n"
++        "cvt8888to565 TMP4\n"
++        "combine_pixels_pair TMP1, TMP2\n"
++        "combine_pixels_pair TMP3, TMP4\n"
++        "stmia   DST!, {TMP1, TMP3}\n"
++    "4:\n"
++        "tst     W, #2\n"
++        "beq     4f\n"
++        "ldmia   SRC!, {TMP1, TMP2}\n"
++        "cvt8888to565 TMP1\n"
++        "cvt8888to565 TMP2\n"
++        "combine_pixels_pair TMP1, TMP2\n"
++        "str     TMP1, [DST], #4\n"
++    "4:\n"
++        "tst     W, #1\n"
++        "beq     4f\n"
++        "ldr     TMP1, [SRC], #4\n"
++        "cvt8888to565 TMP1\n"
++        "strh    TMP1, [DST], #2\n"
++    "4:\n"
++        "ldr     TMP1, [sp, #(10*4+0)]\n" /* (dst_stride-w) */
++        "ldr     TMP2, [sp, #(10*4+4)]\n" /* (src_stride-w) */
++        "ldr     W, [sp, #(8*4)]\n" /* restore the full row width for the next row */
++        "subs    H, H, #1\n"
++        "add     DST, DST, TMP1, lsl #1\n" /* skip to the next row: 2 bytes per destination pixel */
++        "add     SRC, SRC, TMP2, lsl #2\n" /* skip to the next row: 4 bytes per source pixel */
++        "bge     1b\n"
++    "6:\n"
++        /* restore all registers and return (the saved lr is popped straight into pc) */
++        "ldmia   sp!, {r4-r11, ip, pc}\n"
++        ".ltorg\n" /* emit the literal pool needed by 'ldr C1F001F, =0x1F001F' */
++
++        ".unreq   DST\n"
++        ".unreq   SRC\n"
++        ".unreq   W\n"
++        ".unreq   H\n"
++
++        ".unreq   TMP1\n"
++        ".unreq   TMP2\n"
++        ".unreq   TMP3\n"
++        ".unreq   TMP4\n"
++        ".unreq   TMP5\n"
++        ".unreq   TMP6\n"
++        ".unreq   TMP7\n"
++        ".unreq   TMP8\n"
++
++        ".unreq   C1F001F\n"
++        ".unreq   A\n"
++
++        ".purgem  cvt8888to565\n"
++        ".purgem  combine_pixels_pair\n"
++    );
++}
++
++static void
++armv6_composite_src_8888_0565 (pixman_implementation_t * impl,
++			       pixman_op_t               op,
++			       pixman_image_t *          src_image,
++			       pixman_image_t *          mask_image,
++			       pixman_image_t *          dst_image,
++			       int32_t                   src_x,
++			       int32_t                   src_y,
++			       int32_t                   mask_x,
++			       int32_t                   mask_y,
++			       int32_t                   dest_x,
++			       int32_t                   dest_y,
++			       int32_t                   width,
++			       int32_t                   height)
++{   /* 32bpp -> 16bpp copy; impl, op, mask_image, mask_x and mask_y are not used in this body */
++    uint32_t *src;
++    uint16_t *dst;
++    int src_stride, dst_stride;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
++			   dst_stride, dst, 1); /* macro defined elsewhere; presumably sets dst and dst_stride (in uint16_t units) - confirm */
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t,
++			   src_stride, src, 1); /* likewise for src and src_stride (in uint32_t units) - confirm */
++
++    if (width < 7) /* the armv6 routine requires w >= 7, so narrower spans take the armv4 routine */
++	armv4_composite_src_8888_0565_asm (dst, src, width,
++					   dst_stride, src_stride, height);
++    else
++	armv6_composite_src_8888_0565_asm (dst, src, width,
++					   dst_stride, src_stride, height);
++}
++
+ #endif
+ 
+ static const pixman_fast_path_t arm_simd_fast_paths[] =
+@@ -624,6 +900,10 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
+ #if defined(__ARM_EABI__) && defined(__linux__)
+     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, armv6_composite_over_n_8_0565),
+     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, armv6_composite_over_n_8_0565),
++    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, armv6_composite_src_8888_0565),
++    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, armv6_composite_src_8888_0565),
++    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, armv6_composite_src_8888_0565),
++    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, armv6_composite_src_8888_0565),
+ #endif
+     { PIXMAN_OP_NONE },
+ };
+--
+cgit v0.8.3-6-g21f6