aboutsummaryrefslogtreecommitdiffstats
path: root/recipes/xorg-lib/pixman-0.21.2/0009-ARM-reuse-common-NEON-code-for-over_-n_8-8888_n-8888.patch
blob: b45671e98e03f35c655f2dcd6b46fe08f8d8ed38 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
From 3990931bf6197eff1cec06cf24bce53ddf9a539a Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sat, 27 Nov 2010 04:47:39 +0200
Subject: [PATCH 09/24] ARM: reuse common NEON code for over_{n_8|8888_n|8888_8}_0565

Renamed supplementary macros from 'over_n_8_0565' to 'over_8888_8_0565',
because they can actually support all variants of this operation:
over_8888_8_0565/over_n_8_0565/over_8888_n_0565.

Also, 'over_8888_8_0565' now uses the more optimized common code instead of
its own variant, improving performance a bit. Even though this operation is
still memory-bandwidth limited, scaled variants of these fast paths may
put more stress on the CPU later.

Benchmarked on ARM Cortex-A8 @500MHz:

== before ==

    over_8888_8_0565 =  L1:  67.10  L2:  53.82  M: 44.70 (105.17%)
                        HT:  18.73  VT:  16.91  R: 14.25  RT:  4.80 (52Kops/s)

== after ==

    over_8888_8_0565 =  L1:  77.83  L2:  58.14  M: 44.82 (105.52%)
                        HT:  20.58  VT:  17.44  R: 15.05  RT:  4.88 (52Kops/s)
---
 pixman/pixman-arm-neon-asm.S |   61 +++++++++++++++++------------------------
 1 files changed, 25 insertions(+), 36 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3e52a49..4175144 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -791,7 +791,7 @@ generate_composite_function \
 
 /******************************************************************************/
 
-.macro pixman_composite_over_n_8_0565_process_pixblock_head
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
     vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
     vmull.u8    q1,  d24, d9
     vmull.u8    q6,  d24, d10
@@ -816,7 +816,7 @@ generate_composite_function \
     vmull.u8    q10, d3, d30
 .endm
 
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
     /* 3 cycle bubble (after vmull.u8) */
     vrshr.u16   q13, q8,  #8
     vrshr.u16   q11, q9,  #8
@@ -835,7 +835,7 @@ generate_composite_function \
     vsri.u16    q14, q9,  #11
 .endm
 
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
     vld1.16     {d4, d5}, [DST_R, :128]!
     vshrn.u16   d6,  q2,  #8
     fetch_mask_pixblock
@@ -880,6 +880,23 @@ generate_composite_function \
     vmull.u8    q10, d3,  d30
 .endm
 
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
 /*
  * This function needs a special initialization of solid mask.
  * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
@@ -911,9 +928,9 @@ generate_composite_function \
     5, /* prefetch distance */ \
     pixman_composite_over_n_8_0565_init, \
     pixman_composite_over_n_8_0565_cleanup, \
-    pixman_composite_over_n_8_0565_process_pixblock_head, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
 
 /******************************************************************************/
 
@@ -935,36 +952,8 @@ generate_composite_function \
     5, /* prefetch distance */ \
     pixman_composite_over_8888_n_0565_init, \
     pixman_composite_over_8888_n_0565_cleanup, \
-    pixman_composite_over_n_8_0565_process_pixblock_head, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
-    vld1.16     {d4, d5}, [DST_R, :128]!
-    pixman_composite_over_n_8_0565_process_pixblock_tail
-    fetch_src_pixblock
-    cache_preload 8, 8
-    fetch_mask_pixblock
-    pixman_composite_over_n_8_0565_process_pixblock_head
-    vst1.16     {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
-    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_n_8_0565_process_pixblock_head, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
     pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
     28, /* dst_w_basereg */ \
     4,  /* dst_r_basereg */ \
-- 
1.6.6.1