about | summary | refs | log | tree | commit | diff | stats
path: root/recipes/xorg-lib/pixman-0.21.6/0033-ARM-use-less-ARM-instructions-in-NEON-bilinear-scali.patch
blob: 1d66979f99462b51b6a9b69283e385ff20e48492 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
From ec2da8e651767421a8403bf0810445fdec1315ba Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 21 Mar 2011 18:41:53 +0200
Subject: [PATCH 33/40] ARM: use less ARM instructions in NEON bilinear scaling code

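Instead of computing separate TOP and BOTTOM scanline addresses for
every source pixel, compute only the TOP address and step to the BOTTOM
scanline with a post-indexed NEON load, using STRIDE = BOTTOM - TOP
calculated once at function entry.

This reduces code size and also puts less pressure on the
instruction decoder.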
---
 pixman/pixman-arm-neon-asm.S |   79 ++++++++++++++++++++----------------------
 1 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index d84f2cc..9878bf7 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2412,21 +2412,19 @@ fname:
  */
 
 .macro bilinear_load_8888 reg1, reg2, tmp
-    mov       TMP2, X, asr #16
+    mov       TMP1, X, asr #16
     add       X, X, UX
-    add       TMP1, TOP, TMP2, asl #2
-    add       TMP2, BOTTOM, TMP2, asl #2
-    vld1.32   {reg1}, [TMP1]
-    vld1.32   {reg2}, [TMP2]
+    add       TMP1, TOP, TMP1, asl #2
+    vld1.32   {reg1}, [TMP1], STRIDE
+    vld1.32   {reg2}, [TMP1]
 .endm
 
 .macro bilinear_load_0565 reg1, reg2, tmp
-    mov       TMP2, X, asr #16
+    mov       TMP1, X, asr #16
     add       X, X, UX
-    add       TMP1, TOP, TMP2, asl #1
-    add       TMP2, BOTTOM, TMP2, asl #1
-    vld1.32   {reg2[0]}, [TMP1]
-    vld1.32   {reg2[1]}, [TMP2]
+    add       TMP1, TOP, TMP1, asl #1
+    vld1.32   {reg2[0]}, [TMP1], STRIDE
+    vld1.32   {reg2[1]}, [TMP1]
     convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
 .endm
 
@@ -2454,18 +2452,16 @@ fname:
 .macro bilinear_load_and_vertical_interpolate_two_0565 \
                 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
 
-    mov       TMP2, X, asr #16
+    mov       TMP1, X, asr #16
     add       X, X, UX
-    mov       TMP4, X, asr #16
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
     add       X, X, UX
-    add       TMP1, TOP, TMP2, asl #1
-    add       TMP2, BOTTOM, TMP2, asl #1
-    add       TMP3, TOP, TMP4, asl #1
-    add       TMP4, BOTTOM, TMP4, asl #1
-    vld1.32   {acc2lo[0]}, [TMP1]
-    vld1.32   {acc2hi[0]}, [TMP3]
-    vld1.32   {acc2lo[1]}, [TMP2]
-    vld1.32   {acc2hi[1]}, [TMP4]
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {acc2lo[1]}, [TMP1]
+    vld1.32   {acc2hi[1]}, [TMP2]
     convert_0565_to_x888 acc2, reg3, reg2, reg1
     vzip.u8   reg1, reg3
     vzip.u8   reg2, reg4
@@ -2481,34 +2477,30 @@ fname:
                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
 
-    mov       TMP2, X, asr #16
+    mov       TMP1, X, asr #16
     add       X, X, UX
-    mov       TMP4, X, asr #16
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
     add       X, X, UX
-    add       TMP1, TOP, TMP2, asl #1
-    add       TMP2, BOTTOM, TMP2, asl #1
-    add       TMP3, TOP, TMP4, asl #1
-    add       TMP4, BOTTOM, TMP4, asl #1
-    vld1.32   {xacc2lo[0]}, [TMP1]
-    vld1.32   {xacc2hi[0]}, [TMP3]
-    vld1.32   {xacc2lo[1]}, [TMP2]
-    vld1.32   {xacc2hi[1]}, [TMP4]
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {xacc2lo[1]}, [TMP1]
+    vld1.32   {xacc2hi[1]}, [TMP2]
     convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
-    mov       TMP2, X, asr #16
+    mov       TMP1, X, asr #16
     add       X, X, UX
-    mov       TMP4, X, asr #16
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
     add       X, X, UX
-    add       TMP1, TOP, TMP2, asl #1
-    add       TMP2, BOTTOM, TMP2, asl #1
-    add       TMP3, TOP, TMP4, asl #1
-    add       TMP4, BOTTOM, TMP4, asl #1
-    vld1.32   {yacc2lo[0]}, [TMP1]
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
     vzip.u8   xreg1, xreg3
-    vld1.32   {yacc2hi[0]}, [TMP3]
+    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
     vzip.u8   xreg2, xreg4
-    vld1.32   {yacc2lo[1]}, [TMP2]
+    vld1.32   {yacc2lo[1]}, [TMP1]
     vzip.u8   xreg3, xreg4
-    vld1.32   {yacc2hi[1]}, [TMP4]
+    vld1.32   {yacc2hi[1]}, [TMP2]
     vzip.u8   xreg1, xreg2
     convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
     vmull.u8  xacc1, xreg1, d28
@@ -2592,6 +2584,7 @@ fname:
                 q1, q11, d0, d1, d20, d21, d22, d23 \
                 q3, q9,  d4, d5, d16, d17, d18, d19
     pld       [TMP1, PF_OFFS]
+    sub       TMP1, TMP1, STRIDE
     vshll.u16 q0, d2, #8
     vmlsl.u16 q0, d2, d30
     vmlal.u16 q0, d3, d30
@@ -2671,6 +2664,7 @@ pixman_asm_function fname
     PF_OFFS   .req      r7
     TMP3      .req      r8
     TMP4      .req      r9
+    STRIDE    .req      r2
 
     mov       ip, sp
     push      {r4, r5, r6, r7, r8, r9}
@@ -2678,6 +2672,9 @@ pixman_asm_function fname
     ldmia     ip, {WB, X, UX, WIDTH}
     mul       PF_OFFS, PF_OFFS, UX
 
+    sub       STRIDE, BOTTOM, TOP
+    .unreq    BOTTOM
+
     cmp       WIDTH, #0
     ble       3f
 
@@ -2738,7 +2735,6 @@ pixman_asm_function fname
 
     .unreq    OUT
     .unreq    TOP
-    .unreq    BOTTOM
     .unreq    WT
     .unreq    WB
     .unreq    X
@@ -2749,6 +2745,7 @@ pixman_asm_function fname
     .unreq    PF_OFFS
     .unreq    TMP3
     .unreq    TMP4
+    .unreq    STRIDE
 .endfunc
 
 .endm
-- 
1.6.6.1