diff options
Diffstat (limited to 'recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch')
-rw-r--r-- | recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch | 324 |
1 files changed, 324 insertions, 0 deletions
diff --git a/recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch b/recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch new file mode 100644 index 0000000000..c544225f65 --- /dev/null +++ b/recipes/obsolete/xorg/xorg-lib/pixman/src-8888-0565.patch @@ -0,0 +1,324 @@ +From 6494f9ae8820078d0e6109bf8f294156f7a5da4c Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Fri, 05 Mar 2010 00:40:34 +0000 +Subject: ARM: added 'armv6_composite_src_8888_0565' fast path + +Provides ~3x performance improvement when working with +data in L1 cache, and ~80% performace improvement when working +with memory. This fast path is important for 32bpp -> 16bpp +color format conversion and is commonly used with 16bpp desktop. + +Microbenchmark from N800 (ARM11 @ 400MHz), measured in MPix/s: + +before: + + src_8888_0565 = L1: 21.54 M: 15.62 + +after (armv4): + + src_8888_0565 = L1: 45.26 M: 23.29 + +after (armv6): + + src_8888_0565 = L1: 60.62 M: 28.37 +--- +diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c +index c375c01..69243c1 100644 +--- a/pixman/pixman-arm-simd.c ++++ b/pixman/pixman-arm-simd.c +@@ -604,6 +604,282 @@ armv6_composite_over_n_8_0565 (pixman_implementation_t * impl, + dst_stride - width, mask_stride - width, height); + } + ++static inline void ++armv4_composite_src_8888_0565_asm ( ++ uint16_t *dst, uint32_t *src, int w, int dst_stride, ++ int src_stride, int h) ++{ ++ uint32_t a, x, y, c1F001F = 0x1F001F, cFFFF = 0xFFFF; ++ int backup_w = w; ++ while (h--) ++ { ++ w = backup_w; ++ if (w > 0 && (uintptr_t)dst & 2) ++ { ++ x = *src++; ++ ++ a = (x >> 3) & c1F001F; ++ x &= 0xFC00; ++ a |= a >> 5; ++ a |= x >> 5; ++ ++ *dst++ = a; ++ w--; ++ } ++ ++ asm volatile( ++ "subs %[w], %[w], #2\n" ++ "blt 2f\n" ++ "1:\n" ++ "ldr %[x], [%[src]], #4\n" ++ "ldr %[y], [%[src]], #4\n" ++ "subs %[w], %[w], #2\n" ++ ++ "and %[a], %[c1F001F], %[x], lsr #3\n" ++ "and %[x], %[x], #0xFC00\n\n" ++ "orr %[a], %[a], %[a], lsr #5\n" ++ "orr %[x], %[a], %[x], lsr #5\n" ++ ++ "and %[a], %[c1F001F], %[y], lsr #3\n" ++ "and %[y], %[y], #0xFC00\n\n" ++ "orr %[a], %[a], %[a], lsr #5\n" ++ "orr %[y], %[a], %[y], lsr #5\n" ++ /* ++ * Writing single 32-bit value is much faster than two ++ * separate 16-bit values for older CPUs without (efficient) ++ * write combining, even though it costs an extra instruction. ++ */ ++ "and %[x], %[x], %[cFFFF]\n" ++ "orr %[x], %[x], %[y], lsl #16\n" ++ "str %[x], [%[dst]], #4\n" ++ "bge 1b\n" ++ "2:\n" ++ : [c1F001F] "+&r" (c1F001F), [cFFFF] "+&r" (cFFFF), ++ [src] "+&r" (src), [dst] "+&r" (dst), [a] "=&r" (a), ++ [x] "=&r" (x), [y] "=&r" (y), [w] "+&r" (w) ++ ); ++ ++ if (w & 1) ++ { ++ x = *src++; ++ ++ a = (x >> 3) & c1F001F; ++ x = x & 0xFC00; ++ a |= a >> 5; ++ a |= x >> 5; ++ ++ *dst++ = a; ++ } ++ ++ src += src_stride - backup_w; ++ dst += dst_stride - backup_w; ++ } ++} ++ ++/* ++ * Conversion x8r8g8b8 -> r5g6b5 ++ * ++ * Note: 'w' must be >= 7 here ++ */ ++static void __attribute__((naked)) ++armv6_composite_src_8888_0565_asm ( ++ uint16_t *dst, uint32_t *src, int w, int dst_stride, ++ int src_stride, int h) ++{ ++ asm volatile( ++ /* define supplementary macros */ ++ ".macro cvt8888to565 PIX\n" ++ "and A, C1F001F, \\PIX, lsr #3\n" ++ "and \\PIX, \\PIX, #0xFC00\n\n" ++ "orr A, A, A, lsr #5\n" ++ "orr \\PIX, A, \\PIX, lsr #5\n" ++ ".endm\n" ++ ++ ".macro combine_pixels_pair PIX1, PIX2\n" ++ /* Note: assume little endian byte order */ ++ "pkhbt \\PIX1, \\PIX1, \\PIX2, lsl #16\n" ++ ".endm\n" ++ ++ /* function entry, save all registers (10 words) to stack */ ++ "stmdb sp!, {r4-r11, ip, lr}\n" ++ ++ /* define some aliases */ ++ "DST .req r0\n" ++ "SRC .req r1\n" ++ "W .req r2\n" ++ "H .req r3\n" ++ ++ "TMP1 .req r4\n" ++ "TMP2 .req r5\n" ++ "TMP3 .req r6\n" ++ "TMP4 .req r7\n" ++ "TMP5 .req r8\n" ++ "TMP6 .req r9\n" ++ "TMP7 .req r10\n" ++ "TMP8 .req r11\n" ++ ++ "C1F001F .req ip\n" ++ "A .req lr\n" ++ ++ "ldr TMP1, [sp, #(10*4+0)]\n" /* load src_stride */ ++ "ldr C1F001F, =0x1F001F\n" ++ "sub r3, r3, W\n" ++ "str r3, [sp, #(10*4+0)]\n" /* store (dst_stride-w) */ ++ "ldr r3, [sp, #(10*4+4)]\n" /* load h */ ++ "sub TMP1, TMP1, W\n" ++ "str TMP1, [sp, #(10*4+4)]\n" /* store (src_stride-w) */ ++ ++ "str W, [sp, #(8*4)]\n" /* saved ip = W */ ++ ++ "0:\n" ++ "subs H, H, #1\n" ++ "blt 6f\n" ++ "1:\n" ++ /* align DST at 4 byte boundary */ ++ "tst DST, #2\n" ++ "beq 2f\n" ++ "ldr TMP1, [SRC], #4\n" ++ "sub W, W, #1\n" ++ "cvt8888to565 TMP1\n" ++ "strh TMP1, [DST], #2\n" ++ "2:" ++ /* align DST at 8 byte boundary */ ++ "tst DST, #4\n" ++ "beq 2f\n" ++ "ldmia SRC!, {TMP1, TMP2}\n" ++ "sub W, W, #2\n" ++ "cvt8888to565 TMP1\n" ++ "cvt8888to565 TMP2\n" ++ "combine_pixels_pair TMP1, TMP2\n" ++ "str TMP1, [DST], #4\n" ++ "2:" ++ /* align DST at 16 byte boundary */ ++ "tst DST, #8\n" ++ "beq 2f\n" ++ "ldmia SRC!, {TMP1, TMP2, TMP3, TMP4}\n" ++ "sub W, W, #4\n" ++ "cvt8888to565 TMP1\n" ++ "cvt8888to565 TMP2\n" ++ "cvt8888to565 TMP3\n" ++ "cvt8888to565 TMP4\n" ++ "combine_pixels_pair TMP1, TMP2\n" ++ "combine_pixels_pair TMP3, TMP4\n" ++ "stmia DST!, {TMP1, TMP3}\n" ++ "2:" ++ /* inner loop, process 8 pixels per iteration */ ++ "subs W, W, #8\n" ++ "blt 4f\n" ++ "3:\n" ++ "ldmia SRC!, {TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8}\n" ++ "subs W, W, #8\n" ++ "cvt8888to565 TMP1\n" ++ "cvt8888to565 TMP2\n" ++ "cvt8888to565 TMP3\n" ++ "cvt8888to565 TMP4\n" ++ "cvt8888to565 TMP5\n" ++ "cvt8888to565 TMP6\n" ++ "cvt8888to565 TMP7\n" ++ "cvt8888to565 TMP8\n" ++ "combine_pixels_pair TMP1, TMP2\n" ++ "combine_pixels_pair TMP3, TMP4\n" ++ "combine_pixels_pair TMP5, TMP6\n" ++ "combine_pixels_pair TMP7, TMP8\n" ++ "stmia DST!, {TMP1, TMP3, TMP5, TMP7}\n" ++ "bge 3b\n" ++ "4:\n" ++ ++ /* process the remaining pixels */ ++ "tst W, #4\n" ++ "beq 4f\n" ++ "ldmia SRC!, {TMP1, TMP2, TMP3, TMP4}\n" ++ "cvt8888to565 TMP1\n" ++ "cvt8888to565 TMP2\n" ++ "cvt8888to565 TMP3\n" ++ "cvt8888to565 TMP4\n" ++ "combine_pixels_pair TMP1, TMP2\n" ++ "combine_pixels_pair TMP3, TMP4\n" ++ "stmia DST!, {TMP1, TMP3}\n" ++ "4:\n" ++ "tst W, #2\n" ++ "beq 4f\n" ++ "ldmia SRC!, {TMP1, TMP2}\n" ++ "cvt8888to565 TMP1\n" ++ "cvt8888to565 TMP2\n" ++ "combine_pixels_pair TMP1, TMP2\n" ++ "str TMP1, [DST], #4\n" ++ "4:\n" ++ "tst W, #1\n" ++ "beq 4f\n" ++ "ldr TMP1, [SRC], #4\n" ++ "cvt8888to565 TMP1\n" ++ "strh TMP1, [DST], #2\n" ++ "4:\n" ++ "ldr TMP1, [sp, #(10*4+0)]\n" /* (dst_stride-w) */ ++ "ldr TMP2, [sp, #(10*4+4)]\n" /* (src_stride-w) */ ++ "ldr W, [sp, #(8*4)]\n" ++ "subs H, H, #1\n" ++ "add DST, DST, TMP1, lsl #1\n" ++ "add SRC, SRC, TMP2, lsl #2\n" ++ "bge 1b\n" ++ "6:\n" ++ /* restore all registers and return */ ++ "ldmia sp!, {r4-r11, ip, pc}\n" ++ ".ltorg\n" ++ ++ ".unreq DST\n" ++ ".unreq SRC\n" ++ ".unreq W\n" ++ ".unreq H\n" ++ ++ ".unreq TMP1\n" ++ ".unreq TMP2\n" ++ ".unreq TMP3\n" ++ ".unreq TMP4\n" ++ ".unreq TMP5\n" ++ ".unreq TMP6\n" ++ ".unreq TMP7\n" ++ ".unreq TMP8\n" ++ ++ ".unreq C1F001F\n" ++ ".unreq A\n" ++ ++ ".purgem cvt8888to565\n" ++ ".purgem combine_pixels_pair\n" ++ ); ++} ++ ++static void ++armv6_composite_src_8888_0565 (pixman_implementation_t * impl, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t *src; ++ uint16_t *dst; ++ int src_stride, dst_stride; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, ++ dst_stride, dst, 1); ++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, ++ src_stride, src, 1); ++ ++ if (width < 7) ++ armv4_composite_src_8888_0565_asm (dst, src, width, ++ dst_stride, src_stride, height); ++ else ++ armv6_composite_src_8888_0565_asm (dst, src, width, ++ dst_stride, src_stride, height); ++} ++ + #endif + + static const pixman_fast_path_t arm_simd_fast_paths[] = +@@ -624,6 +900,10 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = + #if defined(__ARM_EABI__) && defined(__linux__) + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, armv6_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, armv6_composite_over_n_8_0565), ++ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, armv6_composite_src_8888_0565), ++ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, armv6_composite_src_8888_0565), ++ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, armv6_composite_src_8888_0565), ++ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, armv6_composite_src_8888_0565), + #endif + { PIXMAN_OP_NONE }, + }; +-- +cgit v0.8.3-6-g21f6 |