1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
From cd20ceb7602348ecbfa0db1756dc548a0bad3c9d Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Thu, 17 Mar 2011 19:42:01 +0200
Subject: [PATCH 34/40] ARM: support different levels of loop unrolling in bilinear scaler
An extra 'flags' parameter is now supported in the bilinear scanline scaling
function generation macro. It can be used to select unrolling by 4 or 8
pixels per loop iteration and to emit save/restore code for the d8-d15
registers.
---
pixman/pixman-arm-neon-asm.S | 84 ++++++++++++++++++++++++++++++++++++++----
1 files changed, 76 insertions(+), 8 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 9878bf7..6141770 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2633,6 +2633,36 @@ fname:
.endif
.endm
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.else
+ bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.else
+ bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4, 0
+.set BILINEAR_FLAG_UNROLL_8, 1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
/*
* Main template macro for generating NEON optimized bilinear scanline
* functions.
@@ -2648,7 +2678,7 @@ fname:
.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
src_bpp_shift, dst_bpp_shift, \
- prefetch_distance
+ prefetch_distance, flags
pixman_asm_function fname
OUT .req r0
@@ -2672,6 +2702,10 @@ pixman_asm_function fname
ldmia ip, {WB, X, UX, WIDTH}
mul PF_OFFS, PF_OFFS, UX
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpush {d8-d15}
+.endif
+
sub STRIDE, BOTTOM, TOP
.unreq BOTTOM
@@ -2705,8 +2739,34 @@ pixman_asm_function fname
bilinear_interpolate_two_pixels src_fmt, dst_fmt
sub WIDTH, WIDTH, #2
0:
-
- /* start the main loop */
+.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+ cmp WIDTH, #4
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 2))
+ beq 0f
+ bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ sub WIDTH, WIDTH, #4
+0:
+ subs WIDTH, WIDTH, #8
+ blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+ bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+ subs WIDTH, WIDTH, #8
+ blt 5f
+0:
+ bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+ subs WIDTH, WIDTH, #8
+ bge 0b
+5:
+ bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+1:
+ tst WIDTH, #4
+ beq 2f
+ bilinear_interpolate_four_pixels src_fmt, dst_fmt
+2:
+.else
+/*********** 4 pixels per iteration *****************/
subs WIDTH, WIDTH, #4
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
@@ -2720,7 +2780,8 @@ pixman_asm_function fname
5:
bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1:
-
+/****************************************************/
+.endif
/* handle the remaining trailing pixels */
tst WIDTH, #2
beq 2f
@@ -2730,6 +2791,9 @@ pixman_asm_function fname
beq 3f
bilinear_interpolate_last_pixel src_fmt, dst_fmt
3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpop {d8-d15}
+.endif
pop {r4, r5, r6, r7, r8, r9}
bx lr
@@ -2751,13 +2815,17 @@ pixman_asm_function fname
.endm
generate_bilinear_scanline_func \
- pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 2, 28
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+ 2, 2, 28, BILINEAR_FLAG_UNROLL_4
generate_bilinear_scanline_func \
- pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 1, 28
+ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+ 2, 1, 28, BILINEAR_FLAG_UNROLL_4
generate_bilinear_scanline_func \
- pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 2, 28
+ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+ 1, 2, 28, BILINEAR_FLAG_UNROLL_4
generate_bilinear_scanline_func \
- pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 1, 28
+ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+ 1, 1, 28, BILINEAR_FLAG_UNROLL_4
--
1.6.6.1
|