Diffstat (limited to 'recipes/xorg-lib/pixman-0.21.6/0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch')
-rw-r--r--  recipes/xorg-lib/pixman-0.21.6/0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch | 166
1 file changed, 166 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman-0.21.6/0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch b/recipes/xorg-lib/pixman-0.21.6/0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch
new file mode 100644
index 0000000000..20019f45f1
--- /dev/null
+++ b/recipes/xorg-lib/pixman-0.21.6/0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch
@@ -0,0 +1,166 @@
+From 70a923882ca24664344ba91a649e7aa12c3063f7 Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed, 9 Mar 2011 13:55:48 +0200
+Subject: [PATCH 22/40] ARM: a bit faster NEON bilinear scaling for r5g6b5 source images
+
+Instruction scheduling has been improved in the code responsible for fetching
+r5g6b5 pixels and converting them to the intermediate x8r8g8b8 color format
+used in the interpolation part of the code. A lot of NEON stalls still remain;
+these can be resolved later by the use of pipelining.
+
+Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s
+ op=1, src=10020565, dst=20020888, speed=36.82 MPix/s
+ after: op=1, src=10020565, dst=10020565, speed=41.35 MPix/s
+ op=1, src=10020565, dst=20020888, speed=49.16 MPix/s
+---
+ pixman/pixman-arm-neon-asm.S | 118 +++++++++++++++++++++++++++++++++++------
+ 1 files changed, 100 insertions(+), 18 deletions(-)
+
+diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
+index 2b6875b..71b30ac 100644
+--- a/pixman/pixman-arm-neon-asm.S
++++ b/pixman/pixman-arm-neon-asm.S
+@@ -2430,6 +2430,101 @@ fname:
+ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+ .endm
+
++.macro bilinear_load_and_vertical_interpolate_two_8888 \
++ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
++
++ bilinear_load_8888 reg1, reg2, tmp1
++ vmull.u8 acc1, reg1, d28
++ vmlal.u8 acc1, reg2, d29
++ bilinear_load_8888 reg3, reg4, tmp2
++ vmull.u8 acc2, reg3, d28
++ vmlal.u8 acc2, reg4, d29
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_8888 \
++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++
++ bilinear_load_and_vertical_interpolate_two_8888 \
++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
++ bilinear_load_and_vertical_interpolate_two_8888 \
++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_0565 \
++ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
++
++ mov TMP2, X, asr #16
++ add X, X, UX
++ mov TMP4, X, asr #16
++ add X, X, UX
++ add TMP1, TOP, TMP2, asl #1
++ add TMP2, BOTTOM, TMP2, asl #1
++ add TMP3, TOP, TMP4, asl #1
++ add TMP4, BOTTOM, TMP4, asl #1
++ vld1.32 {acc2lo[0]}, [TMP1]
++ vld1.32 {acc2hi[0]}, [TMP3]
++ vld1.32 {acc2lo[1]}, [TMP2]
++ vld1.32 {acc2hi[1]}, [TMP4]
++ convert_0565_to_x888 acc2, reg3, reg2, reg1
++ vzip.u8 reg1, reg3
++ vzip.u8 reg2, reg4
++ vzip.u8 reg3, reg4
++ vzip.u8 reg1, reg2
++ vmull.u8 acc1, reg1, d28
++ vmlal.u8 acc1, reg2, d29
++ vmull.u8 acc2, reg3, d28
++ vmlal.u8 acc2, reg4, d29
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_0565 \
++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++
++ mov TMP2, X, asr #16
++ add X, X, UX
++ mov TMP4, X, asr #16
++ add X, X, UX
++ add TMP1, TOP, TMP2, asl #1
++ add TMP2, BOTTOM, TMP2, asl #1
++ add TMP3, TOP, TMP4, asl #1
++ add TMP4, BOTTOM, TMP4, asl #1
++ vld1.32 {xacc2lo[0]}, [TMP1]
++ vld1.32 {xacc2hi[0]}, [TMP3]
++ vld1.32 {xacc2lo[1]}, [TMP2]
++ vld1.32 {xacc2hi[1]}, [TMP4]
++ convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
++ mov TMP2, X, asr #16
++ add X, X, UX
++ mov TMP4, X, asr #16
++ add X, X, UX
++ add TMP1, TOP, TMP2, asl #1
++ add TMP2, BOTTOM, TMP2, asl #1
++ add TMP3, TOP, TMP4, asl #1
++ add TMP4, BOTTOM, TMP4, asl #1
++ vld1.32 {yacc2lo[0]}, [TMP1]
++ vzip.u8 xreg1, xreg3
++ vld1.32 {yacc2hi[0]}, [TMP3]
++ vzip.u8 xreg2, xreg4
++ vld1.32 {yacc2lo[1]}, [TMP2]
++ vzip.u8 xreg3, xreg4
++ vld1.32 {yacc2hi[1]}, [TMP4]
++ vzip.u8 xreg1, xreg2
++ convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
++ vmull.u8 xacc1, xreg1, d28
++ vzip.u8 yreg1, yreg3
++ vmlal.u8 xacc1, xreg2, d29
++ vzip.u8 yreg2, yreg4
++ vmull.u8 xacc2, xreg3, d28
++ vzip.u8 yreg3, yreg4
++ vmlal.u8 xacc2, xreg4, d29
++ vzip.u8 yreg1, yreg2
++ vmull.u8 yacc1, yreg1, d28
++ vmlal.u8 yacc1, yreg2, d29
++ vmull.u8 yacc2, yreg3, d28
++ vmlal.u8 yacc2, yreg4, d29
++.endm
++
+ .macro bilinear_store_8888 numpix, tmp1, tmp2
+ .if numpix == 4
+ vst1.32 {d0, d1}, [OUT]!
+@@ -2477,12 +2572,8 @@ fname:
+ .endm
+
+ .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+- bilinear_load_&src_fmt d0, d1, d2
+- vmull.u8 q1, d0, d28
+- vmlal.u8 q1, d1, d29
+- bilinear_load_&src_fmt d20, d21, d22
+- vmull.u8 q11, d20, d28
+- vmlal.u8 q11, d21, d29
++ bilinear_load_and_vertical_interpolate_two_&src_fmt \
++ q1, q11, d0, d1, d20, d21, d22, d23
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vshll.u16 q0, d2, #8
+@@ -2498,18 +2589,9 @@ fname:
+ .endm
+
+ .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+- bilinear_load_&src_fmt d0, d1, d2
+- vmull.u8 q1, d0, d28
+- vmlal.u8 q1, d1, d29
+- bilinear_load_&src_fmt d20, d21, d22
+- vmull.u8 q11, d20, d28
+- vmlal.u8 q11, d21, d29
+- bilinear_load_&src_fmt d4, d5, d6
+- vmull.u8 q3, d4, d28
+- vmlal.u8 q3, d5, d29
+- bilinear_load_&src_fmt d16, d17, d18
+- vmull.u8 q9, d16, d28
+- vmlal.u8 q9, d17, d29
++ bilinear_load_and_vertical_interpolate_four_&src_fmt \
++ q1, q11, d0, d1, d20, d21, d22, d23 \
++ q3, q9, d4, d5, d16, d17, d18, d19
+ pld [TMP1, PF_OFFS]
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+--
+1.6.6.1
+
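
For readers not fluent in NEON assembly, the scalar C sketch below illustrates what the new
bilinear_load_and_vertical_interpolate_two_0565 macro computes for each pair of source pixels:
an r5g6b5 pixel from the TOP scanline and one from the BOTTOM scanline are widened to x8r8g8b8,
then the two rows are blended with 8-bit vertical weights, which is the job of the
vmull.u8/vmlal.u8 pairs using d28/d29. This is only a minimal sketch: the function names and the
wt + wb == 256 weight convention are illustrative assumptions, not pixman API. For reference,
the quoted benchmark corresponds to roughly a 28% improvement for the 0565-to-0565 case
(32.29 to 41.35 MPix/s) and roughly 34% for 0565-to-8888 (36.82 to 49.16 MPix/s).

    /* Scalar sketch of the 0565 fetch + vertical blend done by the NEON macro.
     * Hypothetical helper names; not part of the patch or of pixman's API. */
    #include <stdint.h>
    #include <stdio.h>

    /* Widen one r5g6b5 pixel to x8r8g8b8, replicating high bits into low bits
     * the way pixman's 0565-to-x888 conversion helpers do. */
    static uint32_t expand_0565_to_x888(uint16_t p)
    {
        uint32_t r = (p >> 11) & 0x1f;
        uint32_t g = (p >> 5)  & 0x3f;
        uint32_t b =  p        & 0x1f;

        r = (r << 3) | (r >> 2);   /* 5 -> 8 bits */
        g = (g << 2) | (g >> 4);   /* 6 -> 8 bits */
        b = (b << 3) | (b >> 2);   /* 5 -> 8 bits */

        return (r << 16) | (g << 8) | b;
    }

    /* Vertically interpolate one 8-bit channel into a 16-bit accumulator,
     * mirroring "vmull.u8 acc, top, d28" followed by "vmlal.u8 acc, bottom, d29".
     * Assumes wt + wb == 256, so the result is the blend scaled by 256. */
    static uint16_t vinterp_channel(uint8_t top, uint8_t bottom,
                                    unsigned wt, unsigned wb)
    {
        return (uint16_t)(top * wt + bottom * wb);
    }

    int main(void)
    {
        uint16_t top_0565    = 0xF800;  /* pure red in r5g6b5  */
        uint16_t bottom_0565 = 0x001F;  /* pure blue in r5g6b5 */
        unsigned wt = 192, wb = 64;     /* 75% top, 25% bottom */

        uint32_t top = expand_0565_to_x888(top_0565);
        uint32_t bot = expand_0565_to_x888(bottom_0565);

        /* 16-bit per-channel accumulators, as held in q registers after vmlal. */
        uint16_t r = vinterp_channel((uint8_t)(top >> 16), (uint8_t)(bot >> 16), wt, wb);
        uint16_t g = vinterp_channel((uint8_t)(top >> 8),  (uint8_t)(bot >> 8),  wt, wb);
        uint16_t b = vinterp_channel((uint8_t)top,         (uint8_t)bot,         wt, wb);

        printf("vertical blend (scaled by 256): r=%u g=%u b=%u\n", r, g, b);
        return 0;
    }

The speedup in the patch itself does not come from changing this arithmetic; it comes from
interleaving the vld1.32 loads of the second pixel pair with the vzip.u8 deinterleaving of the
first pair, so the conversion work hides some of the load latency instead of stalling on it.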