aboutsummaryrefslogtreecommitdiffstats
path: root/recipes-graphics/xorg-lib/pixman-0.20.0/0008-ARM-optimization-for-scaled-src_0565_0565-operation-.patch
blob: 6efdb621ad75b8e39d92af9462088f4524ff07c5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
From e1191ad6563a1fb02a45982b1c4d7fed3c655e97 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 4 Oct 2010 01:56:59 +0300
Subject: [PATCH 8/8] ARM optimization for scaled src_0565_0565 operation with nearest filter

Despite the "armv6" suffix in its name, the new function uses only ARMv4T instructions.

Benchmark from ARM11:

    == before ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=34.86 MPix/s

    == after ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=36.62 MPix/s

Benchmark from ARM Cortex-A8:

    == before ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=89.55 MPix/s

    == after ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=94.91 MPix/s
---
 pixman/pixman-arm-simd-asm.S |   66 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |   37 +++++++++++++++++++++++
 2 files changed, 103 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a3d2d40..b6f69db 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -328,3 +329,68 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
 	bx	lr
 .endfunc
+
+/*
+ * Nearest-neighbour scaling of one 16-bit (565) scanline: copies W pixels
+ * from SRC[VX >> 16] to DST, adding UNIT_X to VX after each pixel. Tuned for
+ * the ARM Cortex-A8 pipeline; ARM9/ARM11 cores (which lack efficient write
+ * combining) would need 16-byte aligned STM writes to perform well.
+ */
+pixman_asm_function pixman_scaled_nearest_scanline_565_565_SRC_asm_armv6
+	DST	.req	r0	/* 1st argument: destination pointer, advanced by 2 per stored pixel */
+	SRC	.req	r1	/* 2nd argument: base address of the source scanline */
+	W	.req	r2	/* 3rd argument: number of pixels to write */
+	VX	.req	r3	/* 4th argument: source position, pixel index = VX >> 16 */
+	UNIT_X	.req	r12	/* 5th argument (read from the stack): VX increment per pixel */
+	TMP1	.req	r4	/* TMP1/TMP2 alternate between byte offset and loaded pixel */
+	TMP2	.req	r5
+	MASK	.req	r6	/* constant ~1, see below */
+	ldr	UNIT_X, [sp]	/* must be read before the push below moves sp */
+	push	{r4, r5, r6, r7}	/* r7 has no alias here; presumably saved only to keep sp 8-byte aligned */
+	mvn	MASK, #1	/* MASK = ~1: clears bit 0 so the offset is a multiple of 2 */
+
+	/* helper macro: copy 2 pixels; expects TMP1 = byte offset of the first one */
+	.macro	scale_2_pixels
+		ldrh	TMP1, [SRC, TMP1]	/* load the pixel at the previously computed offset */
+		and	TMP2, MASK, VX, lsr #15	/* TMP2 = (VX >> 16) * 2, offset of the next pixel */
+		add	VX, VX, UNIT_X
+		strh	TMP1, [DST], #2	/* store the pixel, then DST += 2 */
+
+		ldrh	TMP2, [SRC, TMP2]	/* same steps with the roles of TMP1 and TMP2 swapped */
+		and	TMP1, MASK, VX, lsr #15	/* leaves the following offset in TMP1 on exit */
+		add	VX, VX, UNIT_X
+		strh	TMP2, [DST], #2
+	.endm
+
+	/* now do the scaling */
+	and	TMP1, MASK, VX, lsr #15	/* prime TMP1 with the byte offset of the first pixel */
+	add	VX, VX, UNIT_X
+	subs	W, #4
+	blt	2f	/* fewer than 4 pixels: skip the main loop */
+1: /* main loop, process 4 pixels per iteration */
+	scale_2_pixels
+	scale_2_pixels
+	subs	W, W, #4
+	bge	1b
+2: /* W = remaining - 4 here, so its low two bits equal those of the remaining count */
+	tst	W, #2
+	beq	2f
+	scale_2_pixels
+2:
+	tst	W, #1
+	ldrneh	TMP1, [SRC, TMP1]	/* odd count: copy the single last pixel */
+	strneh	TMP1, [DST], #2
+	/* cleanup helper macro */
+	.purgem	scale_2_pixels
+	.unreq	DST
+	.unreq	SRC
+	.unreq	W
+	.unreq	VX
+	.unreq	UNIT_X
+	.unreq	TMP1
+	.unreq	TMP2
+	.unreq	MASK
+	/* return */
+	pop	{r4, r5, r6, r7}
+	bx	lr
+.endfunc
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index d466a31..f6f464c 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -29,6 +29,7 @@
 
 #include "pixman-private.h"
 #include "pixman-arm-common.h"
+#include "pixman-fast-path.h"
 
 #if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
 
@@ -375,6 +376,35 @@ pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
 
 #endif
 
+void /* Prototype of the routine this patch adds to pixman-arm-simd-asm.S */
+pixman_scaled_nearest_scanline_565_565_SRC_asm_armv6 (uint16_t *      dst,    /* receives w 16-bit pixels */
+						      uint16_t *      src,    /* source scanline; only read by the routine */
+						      int32_t         w,      /* number of pixels to produce */
+						      pixman_fixed_t  vx,     /* start position; source index is vx >> 16 */
+						      pixman_fixed_t  unit_x); /* added to vx after every pixel */
+
+static force_inline void /* Adapter: forwards one scanline to the assembly routine */
+scaled_nearest_scanline_armv6_565_565_SRC (uint16_t *      dst,
+					   uint16_t *      src,
+					   int32_t         w,
+					   pixman_fixed_t  vx,
+					   pixman_fixed_t  unit_x,
+					   pixman_fixed_t  max_vx) /* accepted but not forwarded: the assembly routine takes no max_vx */
+{
+    pixman_scaled_nearest_scanline_565_565_SRC_asm_armv6 (dst, src, w,
+							  vx, unit_x);
+}
+
+FAST_NEAREST_MAINLOOP (armv6_565_565_cover_SRC, /* NOTE(review): macro is not defined in this patch; presumably from pixman-fast-path.h - confirm */
+		       scaled_nearest_scanline_armv6_565_565_SRC,
+		       uint16_t, uint16_t, COVER);
+FAST_NEAREST_MAINLOOP (armv6_565_565_none_SRC, /* same scanline function, NONE variant */
+		       scaled_nearest_scanline_armv6_565_565_SRC,
+		       uint16_t, uint16_t, NONE);
+FAST_NEAREST_MAINLOOP (armv6_565_565_pad_SRC, /* same scanline function, PAD variant */
+		       scaled_nearest_scanline_armv6_565_565_SRC,
+		       uint16_t, uint16_t, PAD);
+
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
@@ -404,6 +434,13 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, r5g6b5, armv6_565_565),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, b5g6r5, armv6_565_565),
+    SIMPLE_NEAREST_FAST_PATH_NONE (SRC, r5g6b5, r5g6b5, armv6_565_565),
+    SIMPLE_NEAREST_FAST_PATH_NONE (SRC, b5g6r5, b5g6r5, armv6_565_565),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, r5g6b5, armv6_565_565),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, b5g6r5, armv6_565_565),
+
     { PIXMAN_OP_NONE },
 };
 
-- 
1.6.6.1