1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
From f3e17872f5522e25da8e32de83e62bee8cc198d7 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 7 Mar 2011 03:10:43 +0200
Subject: [PATCH 14/40] ARM: common macro for nearest scaling fast paths
The code of nearest scaled 'src_0565_0565' function was generalized
and moved to a common macro, so that it can be reused for other
fast paths.
---
pixman/pixman-arm-simd-asm.S | 60 +++++++++++++++++++++++++----------------
1 files changed, 36 insertions(+), 24 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index dd1366d..a9775e2 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -331,15 +331,29 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
.endfunc
/*
- * Note: This function is only using armv4t instructions (not even armv6),
+ * Note: This code is only using armv5te instructions (not even armv6),
* but is scheduled for ARM Cortex-A8 pipeline. So it might need to
* be split into a few variants, tuned for each microarchitecture.
*
* TODO: In order to get good performance on ARM9/ARM11 cores (which don't
* have efficient write combining), it needs to be changed to use 16-byte
* aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
+ * t - type suffix for LDR/STR instructions
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ * prefetch_braking_distance - stop prefetching when that many pixels are
+ * remaining before the end of scanline
*/
-pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t, \
+ prefetch_distance, \
+ prefetch_braking_distance
+
+pixman_asm_function fname
W .req r0
DST .req r1
SRC .req r2
@@ -352,35 +366,29 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
ldr UNIT_X, [sp]
push {r4, r5, r6, r7}
- mvn VXMASK, #1
+ mvn VXMASK, #((1 << bpp_shift) - 1)
/* define helper macro */
.macro scale_2_pixels
- ldrh TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, lsr #15
+ ldr&t TMP1, [SRC, TMP1]
+ and TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- strh TMP1, [DST], #2
+ str&t TMP1, [DST], #(1 << bpp_shift)
- ldrh TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, lsr #15
+ ldr&t TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- strh TMP2, [DST], #2
+ str&t TMP2, [DST], #(1 << bpp_shift)
.endm
- /*
- * stop prefetch before reaching the end of scanline (a good behaving
- * value selected based on some benchmarks with short scanlines)
- */
- #define PREFETCH_BRAKING_DISTANCE 32
-
/* now do the scaling */
- and TMP1, VXMASK, VX, lsr #15
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- subs W, #(8 + PREFETCH_BRAKING_DISTANCE)
+ subs W, W, #(8 + prefetch_braking_distance)
blt 2f
- /* set prefetch distance to 80 pixels ahead */
- add PF_OFFS, VX, UNIT_X, lsl #6
- add PF_OFFS, PF_OFFS, UNIT_X, lsl #4
+ /* calculate prefetch offset */
+ mov PF_OFFS, #prefetch_distance
+ mla PF_OFFS, UNIT_X, PF_OFFS, VX
1: /* main loop, process 8 pixels per iteration with prefetch */
subs W, W, #8
add PF_OFFS, UNIT_X, lsl #3
@@ -388,10 +396,10 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
scale_2_pixels
scale_2_pixels
scale_2_pixels
- pld [SRC, PF_OFFS, lsr #15]
+ pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
bge 1b
2:
- subs W, #(4 - 8 - PREFETCH_BRAKING_DISTANCE)
+ subs W, W, #(4 - 8 - prefetch_braking_distance)
blt 2f
1: /* process the remaining pixels */
scale_2_pixels
@@ -404,8 +412,8 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
scale_2_pixels
2:
tst W, #1
- ldrneh TMP1, [SRC, TMP1]
- strneh TMP1, [DST], #2
+ ldrne&t TMP1, [SRC, TMP1]
+ strne&t TMP1, [DST]
/* cleanup helper macro */
.purgem scale_2_pixels
.unreq DST
@@ -421,3 +429,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
pop {r4, r5, r6, r7}
bx lr
.endfunc
+.endm
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
--
1.6.6.1
|