From 70a923882ca24664344ba91a649e7aa12c3063f7 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Wed, 9 Mar 2011 13:55:48 +0200
Subject: [PATCH 22/40] ARM: a bit faster NEON bilinear scaling for r5g6b5 source images

Instruction scheduling is improved in the code responsible for fetching
r5g6b5 pixels and converting them to the intermediate x8r8g8b8 color format
used in the interpolation part of the code. A lot of NEON pipeline stalls
still remain; they can be resolved later by software pipelining.
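
For reference, the per-channel arithmetic behind the new fused macros can
be sketched in scalar C roughly like this (illustrative only: the function
names are made up here, and the real code converts and interpolates several
pixels at a time in NEON registers):

    #include <stdint.h>

    /* Expand one r5g6b5 pixel to x8r8g8b8 by replicating the high
     * bits of each channel into the low bits, which is what
     * convert_0565_to_x888 does for a whole vector of pixels. */
    static uint32_t expand_0565 (uint16_t p)
    {
        uint32_t r = (p >> 11) & 0x1f;
        uint32_t g = (p >> 5)  & 0x3f;
        uint32_t b = p         & 0x1f;

        r = (r << 3) | (r >> 2);
        g = (g << 2) | (g >> 4);
        b = (b << 3) | (b >> 2);
        return (r << 16) | (g << 8) | b;
    }

    /* Vertical interpolation between the top and bottom source rows
     * for one 8-bit channel: a widening multiply-accumulate into a
     * 16-bit intermediate, matching the vmull.u8 + vmlal.u8 pair
     * that uses the weights held in d28/d29 (the caller's weights
     * are chosen so that the sum fits in 16 bits). */
    static uint16_t vertical_interp (uint8_t top, uint8_t bottom,
                                     uint8_t wt, uint8_t wb)
    {
        return (uint16_t)(top * wt + bottom * wb);
    }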

Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
 Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
  before: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s
          op=1, src=10020565, dst=20020888, speed=36.82 MPix/s
  after:  op=1, src=10020565, dst=10020565, speed=41.35 MPix/s
          op=1, src=10020565, dst=20020888, speed=49.16 MPix/s
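  (i.e. roughly 28% and 34% faster, respectively)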
---
 pixman/pixman-arm-neon-asm.S |  118 +++++++++++++++++++++++++++++++++++------
 1 files changed, 100 insertions(+), 18 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 2b6875b..71b30ac 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2430,6 +2430,101 @@ fname:
     convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
 .endm
 
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+    bilinear_load_8888 reg1, reg2, tmp1
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    bilinear_load_8888 reg3, reg4, tmp2
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP2, asl #1
+    add       TMP2, BOTTOM, TMP2, asl #1
+    add       TMP3, TOP, TMP4, asl #1
+    add       TMP4, BOTTOM, TMP4, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1]
+    vld1.32   {acc2hi[0]}, [TMP3]
+    vld1.32   {acc2lo[1]}, [TMP2]
+    vld1.32   {acc2hi[1]}, [TMP4]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1
+    vzip.u8   reg1, reg3
+    vzip.u8   reg2, reg4
+    vzip.u8   reg3, reg4
+    vzip.u8   reg1, reg2
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP2, asl #1
+    add       TMP2, BOTTOM, TMP2, asl #1
+    add       TMP3, TOP, TMP4, asl #1
+    add       TMP4, BOTTOM, TMP4, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1]
+    vld1.32   {xacc2hi[0]}, [TMP3]
+    vld1.32   {xacc2lo[1]}, [TMP2]
+    vld1.32   {xacc2hi[1]}, [TMP4]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP2, asl #1
+    add       TMP2, BOTTOM, TMP2, asl #1
+    add       TMP3, TOP, TMP4, asl #1
+    add       TMP4, BOTTOM, TMP4, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1]
+    vzip.u8   xreg1, xreg3
+    vld1.32   {yacc2hi[0]}, [TMP3]
+    vzip.u8   xreg2, xreg4
+    vld1.32   {yacc2lo[1]}, [TMP2]
+    vzip.u8   xreg3, xreg4
+    vld1.32   {yacc2hi[1]}, [TMP4]
+    vzip.u8   xreg1, xreg2
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    vmull.u8  xacc1, xreg1, d28
+    vzip.u8   yreg1, yreg3
+    vmlal.u8  xacc1, xreg2, d29
+    vzip.u8   yreg2, yreg4
+    vmull.u8  xacc2, xreg3, d28
+    vzip.u8   yreg3, yreg4
+    vmlal.u8  xacc2, xreg4, d29
+    vzip.u8   yreg1, yreg2
+    vmull.u8  yacc1, yreg1, d28
+    vmlal.u8  yacc1, yreg2, d29
+    vmull.u8  yacc2, yreg3, d28
+    vmlal.u8  yacc2, yreg4, d29
+.endm
+
 .macro bilinear_store_8888 numpix, tmp1, tmp2
 .if numpix == 4
     vst1.32   {d0, d1}, [OUT]!
@@ -2477,12 +2572,8 @@ fname:
 .endm
 
 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
-    bilinear_load_&src_fmt d0, d1, d2
-    vmull.u8  q1, d0, d28
-    vmlal.u8  q1, d1, d29
-    bilinear_load_&src_fmt d20, d21, d22
-    vmull.u8  q11, d20, d28
-    vmlal.u8  q11, d21, d29
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23
     vshr.u16  q15, q12, #8
     vadd.u16  q12, q12, q13
     vshll.u16 q0, d2, #8
@@ -2498,18 +2589,9 @@ fname:
 .endm
 
 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
-    bilinear_load_&src_fmt d0, d1, d2
-    vmull.u8  q1, d0, d28
-    vmlal.u8  q1, d1, d29
-    bilinear_load_&src_fmt d20, d21, d22
-    vmull.u8  q11, d20, d28
-    vmlal.u8  q11, d21, d29
-    bilinear_load_&src_fmt d4, d5, d6
-    vmull.u8  q3, d4, d28
-    vmlal.u8  q3, d5, d29
-    bilinear_load_&src_fmt d16, d17, d18
-    vmull.u8  q9, d16, d28
-    vmlal.u8  q9, d17, d29
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23 \
+                q3, q9,  d4, d5, d16, d17, d18, d19
     pld       [TMP1, PF_OFFS]
     vshr.u16  q15, q12, #8
     vadd.u16  q12, q12, q13
-- 
1.6.6.1