aboutsummaryrefslogtreecommitdiffstats
path: root/recipes/xorg-lib/pixman-0.21.6/0034-ARM-support-different-levels-of-loop-unrolling-in-bi.patch
blob: 82661f08691bc3f6186a305469a7cead53bf2925 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
From cd20ceb7602348ecbfa0db1756dc548a0bad3c9d Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Thu, 17 Mar 2011 19:42:01 +0200
Subject: [PATCH 34/40] ARM: support different levels of loop unrolling in bilinear scaler

Now an extra 'flag' parameter is supported in bilinear scaline scaling
function generation macro. It can be used to enable 4 or 8 pixels per
loop iteration unrolling and provide save/restore code for d8-d15
registers.
---
 pixman/pixman-arm-neon-asm.S |   84 ++++++++++++++++++++++++++++++++++++++----
 1 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 9878bf7..6141770 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2633,6 +2633,36 @@ fname:
 .endif
 .endm
 
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.else
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.else
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4,          0
+.set BILINEAR_FLAG_UNROLL_8,          1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
 /*
  * Main template macro for generating NEON optimized bilinear scanline
  * functions.
@@ -2648,7 +2678,7 @@ fname:
 
 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
                                        src_bpp_shift, dst_bpp_shift, \
-                                       prefetch_distance
+                                       prefetch_distance, flags
 
 pixman_asm_function fname
     OUT       .req      r0
@@ -2672,6 +2702,10 @@ pixman_asm_function fname
     ldmia     ip, {WB, X, UX, WIDTH}
     mul       PF_OFFS, PF_OFFS, UX
 
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpush     {d8-d15}
+.endif
+
     sub       STRIDE, BOTTOM, TOP
     .unreq    BOTTOM
 
@@ -2705,8 +2739,34 @@ pixman_asm_function fname
     bilinear_interpolate_two_pixels src_fmt, dst_fmt
     sub       WIDTH, WIDTH, #2
 0:
-
-    /* start the main loop */
+.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+    cmp       WIDTH, #4
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))
+    beq       0f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #4
+0:
+    subs      WIDTH, WIDTH, #8
+    blt       1f
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    blt       5f
+0:
+    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    bge       0b
+5:
+    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+1:
+    tst       WIDTH, #4
+    beq       2f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+2:
+.else
+/*********** 4 pixels per iteration *****************/
     subs      WIDTH, WIDTH, #4
     blt       1f
     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
@@ -2720,7 +2780,8 @@ pixman_asm_function fname
 5:
     bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
 1:
-
+/****************************************************/
+.endif
     /* handle the remaining trailing pixels */
     tst       WIDTH, #2
     beq       2f
@@ -2730,6 +2791,9 @@ pixman_asm_function fname
     beq       3f
     bilinear_interpolate_last_pixel src_fmt, dst_fmt
 3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpop      {d8-d15}
+.endif
     pop       {r4, r5, r6, r7, r8, r9}
     bx        lr
 
@@ -2751,13 +2815,17 @@ pixman_asm_function fname
 .endm
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 2, 28
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+    2, 2, 28, BILINEAR_FLAG_UNROLL_4
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 1, 28
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+    2, 1, 28, BILINEAR_FLAG_UNROLL_4
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 2, 28
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+    1, 2, 28, BILINEAR_FLAG_UNROLL_4
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 1, 28
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+    1, 1, 28, BILINEAR_FLAG_UNROLL_4
-- 
1.6.6.1