1 files changed, 235 insertions, 0 deletions
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0002-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0002-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch
new file mode 100644
index 0000000000..4ec821240c
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0002-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch
@@ -0,0 +1,235 @@
+From ce2fd2ac6aab2c14916d332ade47d72b06d504c1 Mon Sep 17 00:00:00 2001
+From: Taekyun Kim <tkq.kim@samsung.com>
+Date: Tue, 20 Sep 2011 21:32:35 +0900
+Subject: [PATCH 2/8] ARM: NEON: Bilinear macro template for instruction scheduling
+
+This macro template takes 6 code blocks.
+
+1. process_last_pixel
+2. process_two_pixels
+3. process_four_pixels
+4. process_pixblock_head
+5. process_pixblock_tail
+6. process_pixblock_tail_head
+
+process_last_pixel does not need to update horizontal weight. This
+is done by the template. two and four code block should update
+horizontal weight inside of them. head/tail/tail_head blocks
+consist unrolled core loop. You can apply instruction scheduling
+to the tail_head blocks.
+
+You can also specify size of the pixel block. Supported size is 4
+and 8. If you want to use mask, give BILINEAR_FLAG_USE_MASK flags
+to the template, then you can use register MASK. When using d8~d15
+registers, give BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure
+registers are properly saved on the stack and later restored.
+---
+ pixman/pixman-arm-neon-asm-bilinear.S |  195 +++++++++++++++++++++++++++++++++
+ 1 files changed, 195 insertions(+), 0 deletions(-)
+
+diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
+index c5ba929..784e5df 100644
+--- a/pixman/pixman-arm-neon-asm-bilinear.S
++++ b/pixman/pixman-arm-neon-asm-bilinear.S
+@@ -773,3 +773,198 @@ generate_bilinear_scanline_func_src_a8_dst \
+ generate_bilinear_scanline_func_src_a8_dst \
+     pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
+     8888, 8888, add, 2, 28
++
++.set BILINEAR_FLAG_USE_MASK,		1
++.set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2
++
++/*
++ * Main template macro for generating NEON optimized bilinear scanline functions.
++ *
++ * Bilinear scanline generator macro take folling arguments:
++ *  fname			- name of the function to generate
++ *  src_fmt			- source color format (8888 or 0565)
++ *  dst_fmt			- destination color format (8888 or 0565)
++ *  src/dst_bpp_shift		- (1 << bpp_shift) is the size of src/dst pixel in bytes
++ *  process_last_pixel		- code block that interpolate one pixel and does not
++ *				  update horizontal weight
++ *  process_two_pixels		- code block that interpolate two pixels and update
++ *				  horizontal weight
++ *  process_four_pixels		- code block that interpolate four pixels and update
++ *				  horizontal weight
++ *  process_pixblock_head	- head part of middle loop
++ *  process_pixblock_tail	- tail part of middle loop
++ *  process_pixblock_tail_head	- tail_head of middle loop
++ *  pixblock_size		- number of pixels processed in a single middle loop
++ *  prefetch_distance		- prefetch in the source image by that many pixels ahead
++ */
++
++.macro generate_bilinear_scanline_func \
++	fname, \
++	src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
++	bilinear_process_last_pixel, \
++	bilinear_process_two_pixels, \
++	bilinear_process_four_pixels, \
++	bilinear_process_pixblock_head, \
++	bilinear_process_pixblock_tail, \
++	bilinear_process_pixblock_tail_head, \
++	pixblock_size, \
++	prefetch_distance, \
++	flags
++
++pixman_asm_function fname
++.if pixblock_size == 8
++.elseif pixblock_size == 4
++.else
++    .error unsupported pixblock size
++.endif
++
++.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
++    OUT       .req    r0
++    TOP       .req    r1
++    BOTTOM    .req    r2
++    WT        .req    r3
++    WB        .req    r4
++    X         .req    r5
++    UX        .req    r6
++    WIDTH     .req    ip
++    TMP1      .req    r3
++    TMP2      .req    r4
++    PF_OFFS   .req    r7
++    TMP3      .req    r8
++    TMP4      .req    r9
++    STRIDE    .req    r2
++
++    mov		ip, sp
++    push	{r4, r5, r6, r7, r8, r9}
++    mov		PF_OFFS, #prefetch_distance
++    ldmia	ip, {WB, X, UX, WIDTH}
++.else
++    OUT       .req      r0
++    MASK      .req      r1
++    TOP       .req      r2
++    BOTTOM    .req      r3
++    WT        .req      r4
++    WB        .req      r5
++    X         .req      r6
++    UX        .req      r7
++    WIDTH     .req      ip
++    TMP1      .req      r4
++    TMP2      .req      r5
++    PF_OFFS   .req      r8
++    TMP3      .req      r9
++    TMP4      .req      r10
++    STRIDE    .req      r3
++
++    mov       ip, sp
++    push      {r4, r5, r6, r7, r8, r9, r10, ip}
++    mov       PF_OFFS, #prefetch_distance
++    ldmia     ip, {WT, WB, X, UX, WIDTH}
++.endif
++
++    mul       PF_OFFS, PF_OFFS, UX
++
++.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
++    vpush     {d8-d15}
++.endif
++
++    sub	      STRIDE, BOTTOM, TOP
++    .unreq    BOTTOM
++
++    cmp       WIDTH, #0
++    ble       3f
++
++    vdup.u16  q12, X
++    vdup.u16  q13, UX
++    vdup.u8   d28, WT
++    vdup.u8   d29, WB
++    vadd.u16  d25, d25, d26
++
++    /* ensure good destination alignment  */
++    cmp       WIDTH, #1
++    blt       0f
++    tst       OUT, #(1 << dst_bpp_shift)
++    beq       0f
++    vshr.u16  q15, q12, #8
++    vadd.u16  q12, q12, q13
++    bilinear_process_last_pixel
++    sub       WIDTH, WIDTH, #1
++0:
++    vadd.u16  q13, q13, q13
++    vshr.u16  q15, q12, #8
++    vadd.u16  q12, q12, q13
++
++    cmp       WIDTH, #2
++    blt       0f
++    tst       OUT, #(1 << (dst_bpp_shift + 1))
++    beq       0f
++    bilinear_process_two_pixels
++    sub       WIDTH, WIDTH, #2
++0:
++.if pixblock_size == 8
++    cmp       WIDTH, #4
++    blt       0f
++    tst       OUT, #(1 << (dst_bpp_shift + 2))
++    beq       0f
++    bilinear_process_four_pixels
++    sub       WIDTH, WIDTH, #4
++0:
++.endif
++    subs      WIDTH, WIDTH, #pixblock_size
++    blt       1f
++    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
++    bilinear_process_pixblock_head
++    subs      WIDTH, WIDTH, #pixblock_size
++    blt       5f
++0:
++    bilinear_process_pixblock_tail_head
++    subs      WIDTH, WIDTH, #pixblock_size
++    bge       0b
++5:
++    bilinear_process_pixblock_tail
++1:
++.if pixblock_size == 8
++    tst       WIDTH, #4
++    beq       2f
++    bilinear_process_four_pixels
++2:
++.endif
++    /* handle the remaining trailing pixels */
++    tst       WIDTH, #2
++    beq       2f
++    bilinear_process_two_pixels
++2:
++    tst       WIDTH, #1
++    beq       3f
++    bilinear_process_last_pixel
++3:
++.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
++    vpop      {d8-d15}
++.endif
++
++.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
++    pop       {r4, r5, r6, r7, r8, r9}
++.else
++    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
++.endif
++    bx        lr
++
++    .unreq    OUT
++    .unreq    TOP
++    .unreq    WT
++    .unreq    WB
++    .unreq    X
++    .unreq    UX
++    .unreq    WIDTH
++    .unreq    TMP1
++    .unreq    TMP2
++    .unreq    PF_OFFS
++    .unreq    TMP3
++    .unreq    TMP4
++    .unreq    STRIDE
++.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
++    .unreq    MASK
++.endif
++
++.endfunc
++
++.endm
+-- 
+1.6.6.1
+