meta/recipes-graphics/xorg-driver/xf86-video-omapfb/omapfb-neon.diff

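Add a NEON-accelerated YV12/I420 to UYVY conversion routine
(uv12_to_uyvy_neon) and call it from omapfb-xv-generic.c in place of
the plain C uv12_to_uyvy.

Upstream-Status: Pending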

--- /tmp/image-format-conversions.h	2009-02-03 10:18:04.000000000 +0100
+++ git/src/image-format-conversions.h	2009-02-03 10:19:18.000000000 +0100
@@ -30,6 +30,8 @@
 /* Basic C implementation of YV12/I420 to UYVY conversion */
 void uv12_to_uyvy(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest);
 
+/* NEON implementation of YV12/I420 to UYVY conversion; takes the same arguments as uv12_to_uyvy() above */
+void uv12_to_uyvy_neon(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest);
 
 #endif /* __IMAGE_FORMAT_CONVERSIONS_H__ */
 
--- /tmp/image-format-conversions.c	2009-02-03 10:18:04.000000000 +0100
+++ git/src/image-format-conversions.c	2009-02-03 10:16:47.000000000 +0100
@@ -2,6 +2,7 @@
  * Copyright 2008 Kalle Vahlman, <zuh@iki.fi>
  *                Ilpo Ruotsalainen, <lonewolf@iki.fi>
  *                Tuomas Kulve, <tuomas.kulve@movial.com>
+ *                Ian Rickards, <ian.rickards@arm.com>
  *                
  *
  * Permission to use, copy, modify, distribute and sell this software and its
@@ -89,3 +90,104 @@
 	}
 }
 
+void uv12_to_uyvy_neon(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest)
+{
+    int x, y; /* x: column index (scalar path) or pixels left in the row (NEON path); y: row index */
+    uint8_t *dest_even = dest; /* output cursor for the even row of each row pair */
+    uint8_t *dest_odd = dest + w * 2; /* odd row starts w*2 bytes later (2 output bytes per pixel) */
+    uint8_t *y_p_even = y_p; /* luma cursor for the even row */
+    uint8_t *y_p_odd = y_p + y_pitch; /* luma cursor for the odd row, one y_pitch further on */
+
+    /* Converts planar Y/U/V to packed UYVY two rows at a time; w < 16 uses plain C, otherwise 16-pixel NEON blocks. */
+    if (w<16)
+    {
+        for (y=0; y<h; y+=2)
+        {
+            for (x=0; x<w; x+=2)
+            {
+                /* Output two 2x1 macroblocks to form a 2x2 block from input; one U and one V byte serve all four pixels */
+                uint8_t u_val = *u_p++;
+                uint8_t v_val = *v_p++;
+
+                /* Even row, first pixel: U then Y */
+                *dest_even++ = u_val;
+                *dest_even++ = *y_p_even++;
+
+                /* Even row, second pixel: V then Y */
+                *dest_even++ = v_val;
+                *dest_even++ = *y_p_even++;
+
+                /* Odd row, first pixel: same U, odd-row Y */
+                *dest_odd++ = u_val;
+                *dest_odd++ = *y_p_odd++;
+
+                /* Odd row, second pixel: same V, odd-row Y */
+                *dest_odd++ = v_val;
+                *dest_odd++ = *y_p_odd++;
+            }
+
+            dest_even += w * 2; /* skip the row just written through dest_odd */
+            dest_odd += w * 2; /* skip the next even row, written through dest_even */
+
+            u_p += ((uv_pitch << 1) - w) >> 1; /* next chroma row: uv_pitch minus the w/2 bytes consumed (for even w) */
+            v_p += ((uv_pitch << 1) - w) >> 1;
+
+            y_p_even += (y_pitch - w) + y_pitch; /* finish this luma row, then skip the row read through y_p_odd */
+            y_p_odd += (y_pitch - w) + y_pitch;
+        }
+    }
+    else
+    {
+        for (y=0; y<h; y+=2)
+        {
+            x=w; /* pixels still to convert in this row pair */
+            do {
+                // only d0-d5 (q0-q2) are used; avoid d8-d15 (q4-q7), the aapcs callee-save registers
+                asm volatile (
+                        "1:\n\t" /* loop head: each pass converts 16 pixels of both rows */
+                        "vld1.u8   {d0}, [%[u_p]]!\n\t" /* d0 = 8 U bytes, u_p advances by 8 */
+                        "sub       %[x],%[x],#16\n\t" /* x -= 16 */
+                        "cmp       %[x],#16\n\t" /* sets the flags consumed by the bhs at the end */
+                        "vld1.u8   {d1}, [%[v_p]]!\n\t" /* d1 = 8 V bytes, v_p advances by 8 */
+                        "vld1.u8   {q1}, [%[y_p_even]]!\n\t" /* q1 = 16 even-row Y bytes */
+                        "vzip.u8   d0, d1\n\t" /* q0 = U0 V0 U1 V1 ... (interleaved chroma) */
+                        "vld1.u8   {q2}, [%[y_p_odd]]!\n\t" /* q2 = 16 odd-row Y bytes */
+                // use 2-element struct stores to zip up u&v (q0) with y (q1)
+                        "vst2.u8   {q0,q1}, [%[dest_even]]!\n\t" /* stores U0 Y0 V0 Y1 ..., 32 bytes */
+                        "vmov.u8   q1, q2\n\t" /* move odd-row Y into q1 so the same store form can be reused */
+                        "vst2.u8   {q0,q1}, [%[dest_odd]]!\n\t" /* odd row: same chroma, odd-row Y */
+                        "bhs       1b\n\t" /* loop again while x >= 16 (unsigned compare) */
+                        : [u_p] "+r" (u_p), [v_p] "+r" (v_p), [y_p_even] "+r" (y_p_even), [y_p_odd] "+r" (y_p_odd),
+                          [dest_even] "+r" (dest_even), [dest_odd] "+r" (dest_odd),
+                          [x] "+r" (x)
+                        :
+                        : "cc", "memory", "d0","d1","d2","d3","d4","d5"
+                        );
+                if (x!=0) /* the asm loop leaves x below 16; non-zero means a partial block remains */
+                {
+                    // overlap final 16-pixel block to process requested width exactly
+                    x = 16-x; /* number of already-converted pixels to step back over */
+                    u_p -= x/2; /* chroma is half-width, so step back half as many bytes */
+                    v_p -= x/2;
+                    y_p_even -= x;
+                    y_p_odd -= x;
+                    dest_even -= x*2; /* 2 output bytes per pixel */
+                    dest_odd -= x*2;
+                    x = 16;
+                    // do another 16-pixel block; it ends exactly at column w and leaves x == 0
+                }
+            }
+            while (x!=0);
+
+            dest_even += w * 2; /* skip the row just written through dest_odd */
+            dest_odd += w * 2; /* skip the next even row, written through dest_even */
+
+            u_p += ((uv_pitch << 1) - w) >> 1; /* next chroma row: uv_pitch minus the w/2 bytes consumed (for even w) */
+            v_p += ((uv_pitch << 1) - w) >> 1;
+
+            y_p_even += (y_pitch - w) + y_pitch; /* finish this luma row, then skip the row read through y_p_odd */
+            y_p_odd += (y_pitch - w) + y_pitch;
+        }
+    }
+}
+
--- /tmp/omapfb-xv-generic.c	2009-02-03 10:52:18.000000000 +0100
+++ git/src/omapfb-xv-generic.c	2009-02-03 10:52:24.000000000 +0100
@@ -240,7 +240,7 @@
 			uint8_t *yb = buf;
 			uint8_t *ub = yb + (src_y_pitch * src_h);
 			uint8_t *vb = ub + (src_uv_pitch * (src_h / 2));
-			uv12_to_uyvy(src_w & ~15,
+			uv12_to_uyvy_neon(src_w & ~15,
 			             src_h & ~15,
 				     src_y_pitch,
 				     src_uv_pitch,
@@ -256,7 +256,7 @@
 			uint8_t *yb = buf;
 			uint8_t *vb = yb + (src_y_pitch * src_h);
 			uint8_t *ub = vb + (src_uv_pitch * (src_h / 2));
-			uv12_to_uyvy(src_w & ~15,
+			uv12_to_uyvy_neon(src_w & ~15,
 			             src_h & ~15,
 				     src_y_pitch,
 				     src_uv_pitch,