aboutsummaryrefslogtreecommitdiffstats
path: root/meta-oe/recipes-graphics/xorg-lib/pixman-0.21.4/0023-ARM-added-NEON-optimizations-for-fetch-store-a8-scan.patch
blob: 7724f5433e1211aeaa8e9a1e45e0c19afad2a39c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
From cc99d8d6fcbabd7f9f3ed99e65c78a2fb71792fa Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Thu, 23 Sep 2010 21:10:56 +0300
Subject: [PATCH 23/24] ARM: added NEON optimizations for fetch/store a8 scanline

---
 pixman/pixman-arm-neon-asm.S |   64 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-neon.c     |   42 +++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 25f7bf0..439b06b 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -418,6 +418,70 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro pixman_composite_src_8_8888_process_pixblock_head
+    /* Zero d0-d2 before every block. This cannot be done just once in the
+     * 'init' macro: the code handling leading/trailing pixels uses VZIP.8
+     * instructions, which operate in-place and destroy the original register
+     * contents. Think of it as a VST4.8 instruction that corrupts the NEON
+     * registers after the write in the 'tail_head' macro. The 'tail_head'
+     * macro itself does not need these extra VMOVs because it uses a real
+     * VST4.8 instruction.
+     */
+    vmov.u8     q0, #0    /* q0 aliases d0 and d1 */
+    vmov.u8     d2, #0
+.endm
+
+.macro pixman_composite_src_8_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8_8888_process_pixblock_tail_head
+    vst4.8      {d0, d1, d2, d3}, [DST_W, :128]!  /* write d0-d3 interleaved */
+    vld1.8      {d3}, [SRC]!                      /* read next 8 source bytes */
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_fetch_scanline_a8_asm_neon, 8, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8_8888_process_pixblock_head, \
+    pixman_composite_src_8_8888_process_pixblock_tail, \
+    pixman_composite_src_8_8888_process_pixblock_tail_head, \
+    0,  /* dst_w_basereg: tail_head stores d0-d3 */ \
+    0,  /* dst_r_basereg */ \
+    3,  /* src_basereg: tail_head loads the source into d3 */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8888_8_process_pixblock_tail_head
+    vst1.8      {d3}, [DST_W, :64]!        /* write the 8 bytes held in d3 */
+    vld4.8      {d0, d1, d2, d3}, [SRC]!   /* read next block, deinterleaved */
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_store_scanline_a8_asm_neon, 32, 0, 8, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_8_process_pixblock_head, \
+    pixman_composite_src_8888_8_process_pixblock_tail, \
+    pixman_composite_src_8888_8_process_pixblock_tail_head, \
+    3,  /* dst_w_basereg: tail_head stores d3 */ \
+    0,  /* dst_r_basereg */ \
+    0,  /* src_basereg: tail_head loads the source into d0-d3 */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
 .macro pixman_composite_src_8888_0565_process_pixblock_head
     vshll.u8    q8, d1, #8
     vshll.u8    q14, d2, #8
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index f773e92..55219b3 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -484,6 +484,45 @@ neon_store_scanline_r5g6b5 (bits_image_t *  image,
     pixman_store_scanline_r5g6b5_asm_neon (width, pixel, values);
 }
 
+void
+pixman_fetch_scanline_a8_asm_neon (int             width,
+                                   uint32_t       *buffer,
+                                   const uint8_t  *pixel);
+/* Both a8 scanline routines declared here are generated in
+ * pixman-arm-neon-asm.S by generate_composite_function_single_scanline. */
+void
+pixman_store_scanline_a8_asm_neon (int             width,
+                                   uint8_t        *pixel,
+                                   const uint32_t *values);
+
+static void
+neon_fetch_scanline_a8 (pixman_image_t *image,
+                        int             x,
+                        int             y,
+                        int             width,
+                        uint32_t *      buffer,
+                        const uint32_t *mask)
+{
+    /* Step to row y in uint32_t units, then x bytes into that row. */
+    const uint8_t *row =
+        (const uint8_t *) (image->bits.bits + y * image->bits.rowstride);
+    pixman_fetch_scanline_a8_asm_neon (width, buffer, row + x);
+}
+
+static void
+neon_store_scanline_a8 (bits_image_t *  image,
+                        int             x,
+                        int             y,
+                        int             width,
+                        const uint32_t *values)
+{
+    /* Step to row y in uint32_t units, then x bytes into that row. */
+    uint8_t *row = (uint8_t *) (image->bits + image->rowstride * y);
+
+    pixman_store_scanline_a8_asm_neon (width, row + x, values);
+}
+
+
 pixman_implementation_t *
 _pixman_implementation_create_arm_neon (void)
 {
@@ -502,6 +541,9 @@ _pixman_implementation_create_arm_neon (void)
     _pixman_bits_override_accessors (PIXMAN_r5g6b5,
                                      neon_fetch_scanline_r5g6b5,
                                      neon_store_scanline_r5g6b5);
+    _pixman_bits_override_accessors (PIXMAN_a8,
+                                     neon_fetch_scanline_a8,
+                                     neon_store_scanline_a8);
 
     imp->blt = arm_neon_blt;
     imp->fill = arm_neon_fill;
-- 
1.6.6.1