aboutsummaryrefslogtreecommitdiff
path: root/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch
diff options
context:
space:
mode:
Diffstat (limited to 'recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch')
-rw-r--r--recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch226
1 files changed, 226 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch b/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch
new file mode 100644
index 0000000..245e536
--- /dev/null
+++ b/recipes/xorg-lib/pixman-0.21.6/0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch
@@ -0,0 +1,226 @@
+From 11a0c5badbc59ce967707ef836313cc98f8aec4e Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed, 9 Mar 2011 11:46:48 +0200
+Subject: [PATCH 18/40] ARM: use common macro template for bilinear scaled 'src_8888_8888'
+
+This is a cleanup of old and now duplicated code. The performance improvement
+comes mostly from enabling the use of software prefetch, but instruction
+scheduling is also slightly better.
+
+Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=20028888, speed=53.24 MPix/s
+ after: op=1, src=20028888, dst=20028888, speed=74.36 MPix/s
+---
+ pixman/pixman-arm-neon-asm.S | 191 +-----------------------------------------
+ 1 files changed, 3 insertions(+), 188 deletions(-)
+
+diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
+index f3784f5..52dc444 100644
+--- a/pixman/pixman-arm-neon-asm.S
++++ b/pixman/pixman-arm-neon-asm.S
+@@ -2405,194 +2405,6 @@ generate_composite_function_nearest_scanline \
+ fname:
+ .endm
+
+-.macro bilinear_interpolate_last_pixel
+- mov TMP1, X, asr #16
+- mov TMP2, X, asr #16
+- add TMP1, TOP, TMP1, asl #2
+- add TMP2, BOTTOM, TMP2, asl #2
+- vld1.32 {d0}, [TMP1]
+- vshr.u16 d30, d24, #8
+- vld1.32 {d1}, [TMP2]
+- vmull.u8 q1, d0, d28
+- vmlal.u8 q1, d1, d29
+- /* 5 cycles bubble */
+- vshll.u16 q0, d2, #8
+- vmlsl.u16 q0, d2, d30
+- vmlal.u16 q0, d3, d30
+- /* 5 cycles bubble */
+- vshrn.u32 d0, q0, #16
+- /* 3 cycles bubble */
+- vmovn.u16 d0, q0
+- /* 1 cycle bubble */
+- vst1.32 {d0[0]}, [OUT, :32]!
+-.endm
+-
+-.macro bilinear_interpolate_two_pixels
+- mov TMP1, X, asr #16
+- mov TMP2, X, asr #16
+- add X, X, UX
+- add TMP1, TOP, TMP1, asl #2
+- add TMP2, BOTTOM, TMP2, asl #2
+- vld1.32 {d0}, [TMP1]
+- vld1.32 {d1}, [TMP2]
+- vmull.u8 q1, d0, d28
+- vmlal.u8 q1, d1, d29
+- mov TMP1, X, asr #16
+- mov TMP2, X, asr #16
+- add X, X, UX
+- add TMP1, TOP, TMP1, asl #2
+- add TMP2, BOTTOM, TMP2, asl #2
+- vld1.32 {d20}, [TMP1]
+- vld1.32 {d21}, [TMP2]
+- vmull.u8 q11, d20, d28
+- vmlal.u8 q11, d21, d29
+- vshr.u16 q15, q12, #8
+- vadd.u16 q12, q12, q13
+- vshll.u16 q0, d2, #8
+- vmlsl.u16 q0, d2, d30
+- vmlal.u16 q0, d3, d30
+- vshll.u16 q10, d22, #8
+- vmlsl.u16 q10, d22, d31
+- vmlal.u16 q10, d23, d31
+- vshrn.u32 d30, q0, #16
+- vshrn.u32 d31, q10, #16
+- vmovn.u16 d0, q15
+- vst1.32 {d0}, [OUT]!
+-.endm
+-
+-.macro bilinear_interpolate_four_pixels
+- mov TMP1, X, asr #16
+- mov TMP2, X, asr #16
+- add X, X, UX
+- add TMP1, TOP, TMP1, asl #2
+- add TMP2, BOTTOM, TMP2, asl #2
+- vld1.32 {d0}, [TMP1]
+- vld1.32 {d1}, [TMP2]
+- vmull.u8 q1, d0, d28
+- vmlal.u8 q1, d1, d29
+- mov TMP1, X, asr #16
+- mov TMP2, X, asr #16
+- add X, X, UX
+- add TMP1, TOP, TMP1, asl #2
+- add TMP2, BOTTOM, TMP2, asl #2
+- vld1.32 {d20}, [TMP1]
+- vld1.32 {d21}, [TMP2]
+- vmull.u8 q11, d20, d28
+- vmlal.u8 q11, d21, d29
+- vshr.u16 q15, q12, #8
+- vadd.u16 q12, q12, q13
+- vshll.u16 q0, d2, #8
+- vmlsl.u16 q0, d2, d30
+- vmlal.u16 q0, d3, d30
+- vshll.u16 q10, d22, #8
+- vmlsl.u16 q10, d22, d31
+- vmlal.u16 q10, d23, d31
+- mov TMP1, X, asr #16
+- mov TMP2, X, asr #16
+- add X, X, UX
+- add TMP1, TOP, TMP1, asl #2
+- add TMP2, BOTTOM, TMP2, asl #2
+- vld1.32 {d4}, [TMP1]
+- vld1.32 {d5}, [TMP2]
+- vmull.u8 q3, d4, d28
+- vmlal.u8 q3, d5, d29
+- mov TMP1, X, asr #16
+- mov TMP2, X, asr #16
+- add X, X, UX
+- add TMP1, TOP, TMP1, asl #2
+- add TMP2, BOTTOM, TMP2, asl #2
+- vld1.32 {d16}, [TMP1]
+- vld1.32 {d17}, [TMP2]
+- vmull.u8 q9, d16, d28
+- vmlal.u8 q9, d17, d29
+- vshr.u16 q15, q12, #8
+- vadd.u16 q12, q12, q13
+- vshll.u16 q2, d6, #8
+- vmlsl.u16 q2, d6, d30
+- vmlal.u16 q2, d7, d30
+- vshll.u16 q8, d18, #8
+- vmlsl.u16 q8, d18, d31
+- vmlal.u16 q8, d19, d31
+- vshrn.u32 d0, q0, #16
+- vshrn.u32 d1, q10, #16
+- vshrn.u32 d4, q2, #16
+- vshrn.u32 d5, q8, #16
+- vmovn.u16 d0, q0
+- vmovn.u16 d1, q2
+- vst1.32 {d0, d1}, [OUT]!
+-.endm
+-
+-
+-/*
+- * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out,
+- * const uint32_t * top,
+- * const uint32_t * bottom,
+- * int wt,
+- * int wb,
+- * pixman_fixed_t x,
+- * pixman_fixed_t ux,
+- * int width)
+- */
+-
+-pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
+- OUT .req r0
+- TOP .req r1
+- BOTTOM .req r2
+- WT .req r3
+- WB .req r4
+- X .req r5
+- UX .req r6
+- WIDTH .req ip
+- TMP1 .req r3
+- TMP2 .req r4
+-
+- mov ip, sp
+- push {r4, r5, r6, r7}
+- ldmia ip, {WB, X, UX, WIDTH}
+-
+- cmp WIDTH, #0
+- ble 3f
+- vdup.u16 q12, X
+- vdup.u16 q13, UX
+- vdup.u8 d28, WT
+- vdup.u8 d29, WB
+- vadd.u16 d25, d25, d26
+- vadd.u16 q13, q13, q13
+-
+- subs WIDTH, WIDTH, #4
+- blt 1f
+-0:
+- bilinear_interpolate_four_pixels
+- subs WIDTH, WIDTH, #4
+- bge 0b
+-1:
+- tst WIDTH, #2
+- beq 2f
+- bilinear_interpolate_two_pixels
+-2:
+- tst WIDTH, #1
+- beq 3f
+- bilinear_interpolate_last_pixel
+-3:
+- pop {r4, r5, r6, r7}
+- bx lr
+-
+- .unreq OUT
+- .unreq TOP
+- .unreq BOTTOM
+- .unreq WT
+- .unreq WB
+- .unreq X
+- .unreq UX
+- .unreq WIDTH
+- .unreq TMP1
+- .unreq TMP2
+-.endfunc
+-
+-.purgem bilinear_interpolate_last_pixel
+-.purgem bilinear_interpolate_two_pixels
+-.purgem bilinear_interpolate_four_pixels
+-
+ /*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+@@ -2810,3 +2622,6 @@ pixman_asm_function fname
+ .endfunc
+
+ .endm
++
++generate_bilinear_scanline_func \
++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
+--
+1.6.6.1
+