diff options
Diffstat (limited to 'recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch')
-rw-r--r-- | recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch | 226 |
1 files changed, 226 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch b/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch new file mode 100644 index 0000000000..245e536716 --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch @@ -0,0 +1,226 @@ +From 11a0c5badbc59ce967707ef836313cc98f8aec4e Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Wed, 9 Mar 2011 11:46:48 +0200 +Subject: [PATCH 18/40] ARM: use common macro template for bilinear scaled 'src_8888_8888' + +This is a cleanup for old and now duplicated code. The performance improvement +is mostly coming from the enabled use of software prefetch, but instructions +scheduling is also slightly better. + +Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=53.24 MPix/s + after: op=1, src=20028888, dst=20028888, speed=74.36 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 191 +----------------------------------------- + 1 files changed, 3 insertions(+), 188 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index f3784f5..52dc444 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2405,194 +2405,6 @@ generate_composite_function_nearest_scanline \ + fname: + .endm + +-.macro bilinear_interpolate_last_pixel +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d0}, [TMP1] +- vshr.u16 d30, d24, #8 +- vld1.32 {d1}, [TMP2] +- vmull.u8 q1, d0, d28 +- vmlal.u8 q1, d1, d29 +- /* 5 cycles bubble */ +- vshll.u16 q0, d2, #8 +- vmlsl.u16 q0, d2, d30 +- vmlal.u16 q0, d3, d30 +- /* 5 cycles bubble */ +- vshrn.u32 d0, q0, #16 +- /* 3 cycles bubble */ +- vmovn.u16 d0, q0 +- /* 1 cycle bubble */ +- vst1.32 {d0[0]}, [OUT, :32]! +-.endm +- +-.macro bilinear_interpolate_two_pixels +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d0}, [TMP1] +- vld1.32 {d1}, [TMP2] +- vmull.u8 q1, d0, d28 +- vmlal.u8 q1, d1, d29 +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d20}, [TMP1] +- vld1.32 {d21}, [TMP2] +- vmull.u8 q11, d20, d28 +- vmlal.u8 q11, d21, d29 +- vshr.u16 q15, q12, #8 +- vadd.u16 q12, q12, q13 +- vshll.u16 q0, d2, #8 +- vmlsl.u16 q0, d2, d30 +- vmlal.u16 q0, d3, d30 +- vshll.u16 q10, d22, #8 +- vmlsl.u16 q10, d22, d31 +- vmlal.u16 q10, d23, d31 +- vshrn.u32 d30, q0, #16 +- vshrn.u32 d31, q10, #16 +- vmovn.u16 d0, q15 +- vst1.32 {d0}, [OUT]! +-.endm +- +-.macro bilinear_interpolate_four_pixels +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d0}, [TMP1] +- vld1.32 {d1}, [TMP2] +- vmull.u8 q1, d0, d28 +- vmlal.u8 q1, d1, d29 +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d20}, [TMP1] +- vld1.32 {d21}, [TMP2] +- vmull.u8 q11, d20, d28 +- vmlal.u8 q11, d21, d29 +- vshr.u16 q15, q12, #8 +- vadd.u16 q12, q12, q13 +- vshll.u16 q0, d2, #8 +- vmlsl.u16 q0, d2, d30 +- vmlal.u16 q0, d3, d30 +- vshll.u16 q10, d22, #8 +- vmlsl.u16 q10, d22, d31 +- vmlal.u16 q10, d23, d31 +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d4}, [TMP1] +- vld1.32 {d5}, [TMP2] +- vmull.u8 q3, d4, d28 +- vmlal.u8 q3, d5, d29 +- mov TMP1, X, asr #16 +- mov TMP2, X, asr #16 +- add X, X, UX +- add TMP1, TOP, TMP1, asl #2 +- add TMP2, BOTTOM, TMP2, asl #2 +- vld1.32 {d16}, [TMP1] +- vld1.32 {d17}, [TMP2] +- vmull.u8 q9, d16, d28 +- vmlal.u8 q9, d17, d29 +- vshr.u16 q15, q12, #8 +- vadd.u16 q12, q12, q13 +- vshll.u16 q2, d6, #8 +- vmlsl.u16 q2, d6, d30 +- vmlal.u16 q2, d7, d30 +- vshll.u16 q8, d18, #8 +- vmlsl.u16 q8, d18, d31 +- vmlal.u16 q8, d19, d31 +- vshrn.u32 d0, q0, #16 +- vshrn.u32 d1, q10, #16 +- vshrn.u32 d4, q2, #16 +- vshrn.u32 d5, q8, #16 +- vmovn.u16 d0, q0 +- vmovn.u16 d1, q2 +- vst1.32 {d0, d1}, [OUT]! +-.endm +- +- +-/* +- * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out, +- * const uint32_t * top, +- * const uint32_t * bottom, +- * int wt, +- * int wb, +- * pixman_fixed_t x, +- * pixman_fixed_t ux, +- * int width) +- */ +- +-pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon +- OUT .req r0 +- TOP .req r1 +- BOTTOM .req r2 +- WT .req r3 +- WB .req r4 +- X .req r5 +- UX .req r6 +- WIDTH .req ip +- TMP1 .req r3 +- TMP2 .req r4 +- +- mov ip, sp +- push {r4, r5, r6, r7} +- ldmia ip, {WB, X, UX, WIDTH} +- +- cmp WIDTH, #0 +- ble 3f +- vdup.u16 q12, X +- vdup.u16 q13, UX +- vdup.u8 d28, WT +- vdup.u8 d29, WB +- vadd.u16 d25, d25, d26 +- vadd.u16 q13, q13, q13 +- +- subs WIDTH, WIDTH, #4 +- blt 1f +-0: +- bilinear_interpolate_four_pixels +- subs WIDTH, WIDTH, #4 +- bge 0b +-1: +- tst WIDTH, #2 +- beq 2f +- bilinear_interpolate_two_pixels +-2: +- tst WIDTH, #1 +- beq 3f +- bilinear_interpolate_last_pixel +-3: +- pop {r4, r5, r6, r7} +- bx lr +- +- .unreq OUT +- .unreq TOP +- .unreq BOTTOM +- .unreq WT +- .unreq WB +- .unreq X +- .unreq UX +- .unreq WIDTH +- .unreq TMP1 +- .unreq TMP2 +-.endfunc +- +-.purgem bilinear_interpolate_last_pixel +-.purgem bilinear_interpolate_two_pixels +-.purgem bilinear_interpolate_four_pixels +- + /* + * Bilinear scaling support code which tries to provide pixel fetching, color + * format conversion, and interpolation as separate macros which can be used +@@ -2810,3 +2622,6 @@ pixman_asm_function fname + .endfunc + + .endm ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28 +-- +1.6.6.1 + |