From 5366c510565553f17b28ed936b34dfa4d3026b21 Mon Sep 17 00:00:00 2001
From: Koen Kooi
Date: Thu, 3 Sep 2009 21:33:55 +0200
Subject: pixman git: add some more NEON and fastpath patches
---
 recipes/xorg-lib/pixman/over-8888-0565.patch | 296 +++++++++++++++++++++++++++
 1 file changed, 296 insertions(+)
 create mode 100644 recipes/xorg-lib/pixman/over-8888-0565.patch
(limited to 'recipes/xorg-lib/pixman/over-8888-0565.patch')
diff --git a/recipes/xorg-lib/pixman/over-8888-0565.patch b/recipes/xorg-lib/pixman/over-8888-0565.patch
new file mode 100644
index 0000000000..3e27094022
--- /dev/null
+++ b/recipes/xorg-lib/pixman/over-8888-0565.patch
@@ -0,0 +1,296 @@
+From: Siarhei Siamashka
+Date: Mon, 27 Jul 2009 04:48:04 +0000 (+0300)
+Subject: ARM: NEON optimized version of composite_over_8888_0565
+X-Git-Url: http://siarhei.siamashka.name/gitweb/?p=pixman.git;a=commitdiff_plain;h=17d8ab82858511f212dfb30c347255393eb12b0c
+
+ARM: NEON optimized version of composite_over_8888_0565
+---
+
+diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
+index 9404c70..f1dcf1f 100644
+--- a/pixman/pixman-arm-neon.c
++++ b/pixman/pixman-arm-neon.c
+@@ -1447,6 +1447,274 @@ neon_composite_src_16_16 (pixman_implementation_t * impl,
+ }
+ }
+
++static inline void /* blend h rows of w 32-bit source pixels OVER 16-bit r5g6b5 destination pixels, writing the result back to the destination */
++neon_composite_over_8888_0565_internal (uint32_t *src,        /* first source pixel, 32 bits per pixel */
++                                        uint16_t *dst,        /* first destination pixel, 16 bits per pixel; read and then overwritten */
++                                        int32_t   w,          /* pixels per row */
++                                        int32_t   h,          /* number of rows */
++                                        int32_t   src_stride, /* distance between source row starts, in 32-bit pixels */
++                                        int32_t   dst_stride) /* distance between destination row starts, in 16-bit pixels */
++{
++    int32_t dst_newline_delta = (dst_stride - w) * 2; /* bytes from the end of one destination row to the start of the next (2 bytes per pixel) */
++    int32_t src_newline_delta = (src_stride - w) * 4; /* same for the source (4 bytes per pixel) */
++    asm volatile (
++    /* process_pixblock_head: for size 8, 4, 2 or 1 load that many src/dst pixels; for any other size (the code below passes -1) load nothing and use what is already in d0-d5. Then split both into 8-bit channels and compute dst * (255 - src alpha) per channel, scaled back to 8 bits, into d20-d22. */
++    ".macro process_pixblock_head size\n"
++    /* load pixel data from memory */
++    " .if \\size == 8\n"
++    " vld1.32 {d0, d1, d2, d3}, [%[src]]!\n"
++    " vld1.16 {d4, d5}, [%[dst_r]]!\n"
++    " .elseif \\size == 4\n"
++    " vld1.32 {d0, d1}, [%[src]]!\n"
++    " vld1.16 {d4}, [%[dst_r]]!\n"
++    " .elseif \\size == 2\n"
++    " vld1.32 {d0}, [%[src]]!\n"
++    " vld1.16 {d4[0]}, [%[dst_r]]!\n"
++    " vld1.16 {d4[1]}, [%[dst_r]]!\n"
++    " .elseif \\size == 1\n"
++    " vld1.32 {d0[0]}, [%[src]]!\n"
++    " vld1.16 {d4[0]}, [%[dst_r]]!\n"
++    " .endif\n"
++    /* deinterleave and convert both source and destination
++       to "planar" 8-bit format */
++    " vshrn.u16 d16, q2, #8\n"
++    " vuzp.8 d0, d1\n"
++    " vshrn.u16 d17, q2, #3\n"
++    " vuzp.8 d2, d3\n"
++    " vsli.u16 q2, q2, #5\n"
++    " vuzp.8 d1, d3\n"
++    " vsri.u8 d16, d16, #5\n" /* refill the low 3 bits of red from its own top bits */
++    " vuzp.8 d0, d2\n"
++    " vmvn.8 d3, d3\n" /* d3 = 255 - alpha */
++    " vsri.u8 d17, d17, #6\n" /* refill the low 2 bits of green from its own top bits */
++    " vshrn.u16 d18, q2, #2\n"
++    /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
++    /* destination: d16 - red, d17 - green, d18 - blue */
++    /* now do alpha blending */
++    " vmull.u8 q10, d3, d16\n"
++    "pld [%[src], #128]\n"
++    " vmull.u8 q11, d3, d17\n"
++    "pld [%[dst_r], #64]\n"
++    " vmull.u8 q12, d3, d18\n"
++    " vrshr.u16 q13, q10, #8\n" /* vrshr + vraddhn: (x + ((x + 128) >> 8) + 128) >> 8, i.e. x / 255 with rounding */
++    " vrshr.u16 q8, q11, #8\n"
++    " vrshr.u16 q9, q12, #8\n"
++    " vraddhn.u16 d20, q10, q13\n"
++    " vraddhn.u16 d21, q11, q8\n"
++    " vraddhn.u16 d22, q12, q9\n"
++    ".endm\n"
++    /* process_pixblock_tail: add the source channels (saturating) to the scaled destination in d20-d22, pack the result as r5g6b5 into q3 (d6, d7), and for size 8, 4, 2 or 1 store that many pixels; any other size stores nothing. */
++    ".macro process_pixblock_tail size\n"
++    /* result is ready in d28, d29, d30 (R, G, B) */
++    " vqadd.u8 d28, d2, d20\n"
++    " vqadd.u8 d29, d1, d21\n"
++    " vqadd.u8 d30, d0, d22\n"
++    /* convert it to r5g6b5 */
++    " vshll.u8 q3, d28, #8\n" /* must read d28 before the next line overwrites it (q14 is d28, d29) */
++    " vshll.u8 q14, d29, #8\n"
++    " vshll.u8 q15, d30, #8\n"
++    " vsri.u16 q3, q14, #5\n"
++    " vsri.u16 q3, q15, #11\n"
++    /* store pixel data to memory */
++    " .if \\size == 8\n"
++    " vst1.16 {d6, d7}, [%[dst_w], :128]!\n"
++    " .elseif \\size == 4\n"
++    " vst1.16 {d6}, [%[dst_w]]!\n"
++    " .elseif \\size == 2\n"
++    " vst1.16 {d6[0]}, [%[dst_w]]!\n"
++    " vst1.16 {d6[1]}, [%[dst_w]]!\n"
++    " .elseif \\size == 1\n"
++    " vst1.16 {d6[0]}, [%[dst_w]]!\n"
++    " .endif\n"
++    ".endm\n"
++
++    /* "tail" of the previous block and "head" of the next block
++       are merged and interleaved for better instruction scheduling */
++    ".macro process_pixblock_tail_head_8\n"
++    " vqadd.u8 d28, d2, d20\n"
++    " vld1.16 {d4, d5}, [%[dst_r], :128]!\n"
++    " vqadd.u8 d29, d1, d21\n" /* TODO: try to join these into a */
++    " vqadd.u8 d30, d0, d22\n" /* single 128-bit operation */
++    " vshrn.u16 d16, q2, #8\n"
++    " vld1.32 {d0, d1, d2, d3}, [%[src]]!\n" /* TODO: maybe split */
++    " vshrn.u16 d17, q2, #3\n"
++    " vsli.u16 q2, q2, #5\n"
++    " vuzp.8 d0, d1\n"
++    " vshll.u8 q3, d28, #8\n"
++    " vuzp.8 d2, d3\n"
++    " vshll.u8 q14, d29, #8\n"
++    " vuzp.8 d1, d3\n"
++    " vsri.u8 d16, d16, #5\n"
++    " vuzp.8 d0, d2\n"
++    " vmvn.8 d3, d3\n"
++    " vsri.u8 d17, d17, #6\n"
++    " vshrn.u16 d18, q2, #2\n"
++    " vmull.u8 q10, d3, d16\n"
++    "pld [%[src], #128]\n"
++    " vmull.u8 q11, d3, d17\n"
++    "pld [%[dst_r], #64]\n"
++    " vmull.u8 q12, d3, d18\n"
++    " vsri.u16 d6, d28, #5\n" /* d6/d7 with d28/d29 here: the two halves of the tail macro's vsri.u16 q3, q14, #5 */
++    " vsri.u16 d7, d29, #5\n"
++    " vshll.u8 q15, d30, #8\n"
++    " vrshr.u16 q13, q10, #8\n"
++    " vrshr.u16 q8, q11, #8\n"
++    " vrshr.u16 q9, q12, #8\n"
++    " vsri.u16 d6, d30, #11\n"
++    " vsri.u16 d7, d31, #11\n"
++    " vraddhn.u16 d20, q10, q13\n"
++    " vraddhn.u16 d21, q11, q8\n"
++    " vraddhn.u16 d22, q12, q9\n"
++    " vst1.16 {d6, d7}, [%[dst_w], :128]!\n"
++    ".endm\n"
++    /* label 0: start of a row, label 9: exit once h rows are done; rows shorter than 8 pixels skip to label 8 */
++    "subs %[h], %[h], #1\n"
++    "blt 9f\n"
++    "0:\n"
++    "cmp %[w], #8\n"
++    "blt 8f\n"
++    /* up to 7 leading pixels (1, 2 and 4, chosen by address bits 1, 2 and 3 of dst_r) are gathered into separate lanes and blended in one head/tail pass, so that the 8-pixel accesses marked :128 further down start on a 16-byte boundary */
++    /* ensure 16 byte alignment of the destination buffer */
++    "tst %[dst_r], #0xF\n"
++    "beq 2f\n"
++    "tst %[dst_r], #2\n"
++    "beq 1f\n"
++    "vld1.32 {d3[0]}, [%[src]]!\n"
++    "vld1.16 {d5[2]}, [%[dst_r]]!\n"
++    "sub %[w], %[w], #1\n"
++    "1:\n"
++    "tst %[dst_r], #4\n"
++    "beq 1f\n"
++    "vld1.32 {d2}, [%[src]]!\n"
++    "vld1.16 {d5[0]}, [%[dst_r]]!\n"
++    "vld1.16 {d5[1]}, [%[dst_r]]!\n"
++    "sub %[w], %[w], #2\n"
++    "1:\n"
++    "tst %[dst_r], #8\n"
++    "beq 1f\n"
++    "vld1.32 {d0, d1}, [%[src]]!\n"
++    "vld1.16 {d4}, [%[dst_r]]!\n"
++    "sub %[w], %[w], #4\n"
++    "1:\n"
++    "process_pixblock_head -1\n"
++    "process_pixblock_tail -1\n"
++    "tst %[dst_w], #2\n"
++    "beq 1f\n"
++    "vst1.16 {d7[2]}, [%[dst_w]]!\n"
++    "1:\n"
++    "tst %[dst_w], #4\n"
++    "beq 1f\n"
++    "vst1.16 {d7[0]}, [%[dst_w]]!\n"
++    "vst1.16 {d7[1]}, [%[dst_w]]!\n"
++    "1:\n"
++    "tst %[dst_w], #8\n"
++    "beq 2f\n"
++    "vst1.16 {d6}, [%[dst_w]]!\n"
++    "2:\n"
++    /* main loop, software pipelined: one head is issued up front, each iteration finishes the previous block and starts the next, and a final tail drains the last block */
++    "subs %[w], %[w], #8\n"
++    "blt 8f\n"
++    "process_pixblock_head 8\n"
++    "subs %[w], %[w], #8\n"
++    "blt 2f\n"
++    "1:\n" /* innermost pipelined loop */
++    "process_pixblock_tail_head_8\n"
++    "subs %[w], %[w], #8\n"
++    "bge 1b\n"
++    "2:\n"
++    "process_pixblock_tail 8\n"
++    /* w may be negative here because of the subs above; only its low three bits are tested, and for a non-negative initial w they still equal the number of pixels left in the row */
++    "8:\n"
++    /* process up to 7 remaining pixels */
++    "tst %[w], #7\n"
++    "beq 2f\n"
++    "tst %[w], #4\n"
++    "beq 1f\n"
++    "vld1.32 {d0, d1}, [%[src]]!\n"
++    "vld1.16 {d4}, [%[dst_r]]!\n"
++    "1:\n"
++    "tst %[w], #2\n"
++    "beq 1f\n"
++    "vld1.32 {d2}, [%[src]]!\n"
++    "vld1.16 {d5[0]}, [%[dst_r]]!\n"
++    "vld1.16 {d5[1]}, [%[dst_r]]!\n"
++    "1:\n"
++    "tst %[w], #1\n"
++    "beq 1f\n"
++    "vld1.32 {d3[0]}, [%[src]]!\n"
++    "vld1.16 {d5[2]}, [%[dst_r]]!\n"
++    "1:\n"
++
++    "process_pixblock_head -1\n"
++    "process_pixblock_tail -1\n"
++
++    "tst %[w], #4\n"
++    "beq 1f\n"
++    "vst1.16 {d6}, [%[dst_w]]!\n"
++    "1:\n"
++    "tst %[w], #2\n"
++    "beq 1f\n"
++    "vst1.16 {d7[0]}, [%[dst_w]]!\n"
++    "vst1.16 {d7[1]}, [%[dst_w]]!\n"
++    "1:\n"
++    "tst %[w], #1\n"
++    "beq 2f\n"
++    "vst1.16 {d7[2]}, [%[dst_w]]!\n"
++    "2:\n"
++    /* end of row: step each pointer by its newline delta to the start of the next row and restore the pixel count */
++    "add %[src], %[src], %[src_newline_delta]\n"
++    "add %[dst_r], %[dst_r], %[dst_newline_delta]\n"
++    "add %[dst_w], %[dst_w], %[dst_newline_delta]\n"
++    "mov %[w], %[orig_w]\n"
++    "subs %[h], %[h], #1\n"
++    "bge 0b\n"
++    "9:\n"
++    ".purgem process_pixblock_head\n"
++    ".purgem process_pixblock_tail\n"
++    ".purgem process_pixblock_tail_head_8\n"
++    /* dst is bound twice, so dst_r (loads) and dst_w (stores) advance independently over the same buffer */
++    : [src] "+&r" (src), [dst_r] "+&r" (dst), [dst_w] "+&r" (dst),
++      [w] "+&r" (w), [h] "+&r" (h)
++    : [dst_newline_delta] "r" (dst_newline_delta),
++      [src_newline_delta] "r" (src_newline_delta), [orig_w] "r" (w) /* orig_w: input copy of the initial w, used to reload %[w] for each row */
++    : "cc", "memory",
++      "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
++      /* "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", */ /* left out of the clobber list: the code above does not reference d8-d15 (q4-q7) */
++      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
++      "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
++    );
++}
++/* Fast-path entry point: obtains the first-line pointers and strides of src_image and dst_image and passes them, with width and height, to neon_composite_over_8888_0565_internal. imp, op, mask_image, mask_x and mask_y are not used. */
++static void
++neon_composite_over_8888_0565 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++    uint16_t *dst_line;
++    uint32_t *src_line;
++    int32_t dst_stride, src_stride;
++    /* NOTE(review): PIXMAN_IMAGE_GET_LINE is defined elsewhere; presumably it sets the stride (in units of the given type) and the pointer to pixel (x, y) -- confirm against its definition */
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++
++    neon_composite_over_8888_0565_internal (src_line,
++                                            dst_line,
++                                            width,
++                                            height,
++                                            src_stride,
++                                            dst_stride);
++}
++
+ #endif /* USE_GCC_INLINE_ASM */
+
+ static void
+@@ -1908,6 +2176,8 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] =
+ #ifdef USE_GCC_INLINE_ASM
+ { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 },
++ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 },
++ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 }, /* NOTE(review): reuses the same routine, which looks valid because red and blue trade places in both the source and the destination format -- confirm */
+ #endif
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },
-- cgit 1.2.3-korg