aboutsummaryrefslogtreecommitdiffstats
path: root/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch
diff options
context:
space:
mode:
Diffstat (limited to 'recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch')
-rw-r--r--recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch288
1 files changed, 288 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch b/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch
new file mode 100644
index 0000000000..4d411625ae
--- /dev/null
+++ b/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch
@@ -0,0 +1,288 @@
+From 17feaa9c50bb8521b0366345efe181bd99754957 Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Tue, 22 Feb 2011 18:45:03 +0200
+Subject: [PATCH 10/40] ARM: NEON optimization for bilinear scaled 'src_8888_8888'
+
+Initial NEON optimization for bilinear scaling. Can be probably
+improved more.
+
+Benchmark on ARM Cortex-A8:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=20028888, speed=6.70 MPix/s
+ after: op=1, src=20028888, dst=20028888, speed=44.27 MPix/s
+---
+ pixman/pixman-arm-neon-asm.S | 197 ++++++++++++++++++++++++++++++++++++++++++
+ pixman/pixman-arm-neon.c | 45 ++++++++++
+ 2 files changed, 242 insertions(+), 0 deletions(-)
+
+diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
+index 47daf45..c168e10 100644
+--- a/pixman/pixman-arm-neon-asm.S
++++ b/pixman/pixman-arm-neon-asm.S
+@@ -2391,3 +2391,200 @@ generate_composite_function_nearest_scanline \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
++
++/******************************************************************************/
++
++/* Supplementary macro for setting function attributes */
++.macro pixman_asm_function fname
++ .func fname
++ .global fname
++#ifdef __ELF__
++ .hidden fname
++ .type fname, %function
++#endif
++fname:
++.endm
++
++.macro bilinear_interpolate_last_pixel
++ mov TMP1, X, asr #16
++ mov TMP2, X, asr #16
++ add TMP1, TOP, TMP1, asl #2
++ add TMP2, BOTTOM, TMP2, asl #2
++ vld1.32 {d0}, [TMP1]
++ vshr.u16 d30, d24, #8
++ vld1.32 {d1}, [TMP2]
++ vmull.u8 q1, d0, d28
++ vmlal.u8 q1, d1, d29
++ /* 5 cycles bubble */
++ vshll.u16 q0, d2, #8
++ vmlsl.u16 q0, d2, d30
++ vmlal.u16 q0, d3, d30
++ /* 5 cycles bubble */
++ vshrn.u32 d0, q0, #16
++ /* 3 cycles bubble */
++ vmovn.u16 d0, q0
++ /* 1 cycle bubble */
++ vst1.32 {d0[0]}, [OUT, :32]!
++.endm
++
++.macro bilinear_interpolate_two_pixels
++ mov TMP1, X, asr #16
++ mov TMP2, X, asr #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, asl #2
++ add TMP2, BOTTOM, TMP2, asl #2
++ vld1.32 {d0}, [TMP1]
++ vld1.32 {d1}, [TMP2]
++ vmull.u8 q1, d0, d28
++ vmlal.u8 q1, d1, d29
++ mov TMP1, X, asr #16
++ mov TMP2, X, asr #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, asl #2
++ add TMP2, BOTTOM, TMP2, asl #2
++ vld1.32 {d20}, [TMP1]
++ vld1.32 {d21}, [TMP2]
++ vmull.u8 q11, d20, d28
++ vmlal.u8 q11, d21, d29
++ vshr.u16 q15, q12, #8
++ vadd.u16 q12, q12, q13
++ vshll.u16 q0, d2, #8
++ vmlsl.u16 q0, d2, d30
++ vmlal.u16 q0, d3, d30
++ vshll.u16 q10, d22, #8
++ vmlsl.u16 q10, d22, d31
++ vmlal.u16 q10, d23, d31
++ vshrn.u32 d30, q0, #16
++ vshrn.u32 d31, q10, #16
++ vmovn.u16 d0, q15
++ vst1.32 {d0}, [OUT]!
++.endm
++
++.macro bilinear_interpolate_four_pixels
++ mov TMP1, X, asr #16
++ mov TMP2, X, asr #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, asl #2
++ add TMP2, BOTTOM, TMP2, asl #2
++ vld1.32 {d0}, [TMP1]
++ vld1.32 {d1}, [TMP2]
++ vmull.u8 q1, d0, d28
++ vmlal.u8 q1, d1, d29
++ mov TMP1, X, asr #16
++ mov TMP2, X, asr #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, asl #2
++ add TMP2, BOTTOM, TMP2, asl #2
++ vld1.32 {d20}, [TMP1]
++ vld1.32 {d21}, [TMP2]
++ vmull.u8 q11, d20, d28
++ vmlal.u8 q11, d21, d29
++ vshr.u16 q15, q12, #8
++ vadd.u16 q12, q12, q13
++ vshll.u16 q0, d2, #8
++ vmlsl.u16 q0, d2, d30
++ vmlal.u16 q0, d3, d30
++ vshll.u16 q10, d22, #8
++ vmlsl.u16 q10, d22, d31
++ vmlal.u16 q10, d23, d31
++ mov TMP1, X, asr #16
++ mov TMP2, X, asr #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, asl #2
++ add TMP2, BOTTOM, TMP2, asl #2
++ vld1.32 {d4}, [TMP1]
++ vld1.32 {d5}, [TMP2]
++ vmull.u8 q3, d4, d28
++ vmlal.u8 q3, d5, d29
++ mov TMP1, X, asr #16
++ mov TMP2, X, asr #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, asl #2
++ add TMP2, BOTTOM, TMP2, asl #2
++ vld1.32 {d16}, [TMP1]
++ vld1.32 {d17}, [TMP2]
++ vmull.u8 q9, d16, d28
++ vmlal.u8 q9, d17, d29
++ vshr.u16 q15, q12, #8
++ vadd.u16 q12, q12, q13
++ vshll.u16 q2, d6, #8
++ vmlsl.u16 q2, d6, d30
++ vmlal.u16 q2, d7, d30
++ vshll.u16 q8, d18, #8
++ vmlsl.u16 q8, d18, d31
++ vmlal.u16 q8, d19, d31
++ vshrn.u32 d0, q0, #16
++ vshrn.u32 d1, q10, #16
++ vshrn.u32 d4, q2, #16
++ vshrn.u32 d5, q8, #16
++ vmovn.u16 d0, q0
++ vmovn.u16 d1, q2
++ vst1.32 {d0, d1}, [OUT]!
++.endm
++
++
++/*
++ * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out,
++ * const uint32_t * top,
++ * const uint32_t * bottom,
++ * int wt,
++ * int wb,
++ * pixman_fixed_t x,
++ * pixman_fixed_t ux,
++ * int width)
++ */
++
++pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
++ OUT .req r0
++ TOP .req r1
++ BOTTOM .req r2
++ WT .req r3
++ WB .req r4
++ X .req r5
++ UX .req r6
++ WIDTH .req ip
++ TMP1 .req r3
++ TMP2 .req r4
++
++ mov ip, sp
++ push {r4, r5, r6, r7}
++ ldmia ip, {WB, X, UX, WIDTH}
++
++ cmp WIDTH, #0
++ ble 3f
++ vdup.u16 q12, X
++ vdup.u16 q13, UX
++ vdup.u8 d28, WT
++ vdup.u8 d29, WB
++ vadd.u16 d25, d25, d26
++ vadd.u16 q13, q13, q13
++
++ subs WIDTH, WIDTH, #4
++ blt 1f
++0:
++ bilinear_interpolate_four_pixels
++ subs WIDTH, WIDTH, #4
++ bge 0b
++1:
++ tst WIDTH, #2
++ beq 2f
++ bilinear_interpolate_two_pixels
++2:
++ tst WIDTH, #1
++ beq 3f
++ bilinear_interpolate_last_pixel
++3:
++ pop {r4, r5, r6, r7}
++ bx lr
++
++ .unreq OUT
++ .unreq TOP
++ .unreq BOTTOM
++ .unreq WT
++ .unreq WB
++ .unreq X
++ .unreq UX
++ .unreq WIDTH
++ .unreq TMP1
++ .unreq TMP2
++.endfunc
+diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
+index 3e0c0d1..c7c0254 100644
+--- a/pixman/pixman-arm-neon.c
++++ b/pixman/pixman-arm-neon.c
+@@ -232,6 +232,47 @@ pixman_blt_neon (uint32_t *src_bits,
+ }
+ }
+
++void
++pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out,
++ const uint32_t * top,
++ const uint32_t * bottom,
++ int wt,
++ int wb,
++ pixman_fixed_t x,
++ pixman_fixed_t ux,
++ int width);
++
++static force_inline void
++scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst,
++ const uint32_t * mask,
++ const uint32_t * src_top,
++ const uint32_t * src_bottom,
++ int32_t w,
++ int wt,
++ int wb,
++ pixman_fixed_t vx,
++ pixman_fixed_t unit_x,
++ pixman_fixed_t max_vx,
++ pixman_bool_t zero_src)
++{
++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top,
++ src_bottom, wt, wb,
++ vx, unit_x, w);
++}
++
++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC,
++ scaled_bilinear_scanline_neon_8888_8888_SRC,
++ uint32_t, uint32_t, uint32_t,
++ COVER, FALSE, FALSE)
++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC,
++ scaled_bilinear_scanline_neon_8888_8888_SRC,
++ uint32_t, uint32_t, uint32_t,
++ PAD, FALSE, FALSE)
++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC,
++ scaled_bilinear_scanline_neon_8888_8888_SRC,
++ uint32_t, uint32_t, uint32_t,
++ NONE, FALSE, FALSE)
++
+ static const pixman_fast_path_t arm_neon_fast_paths[] =
+ {
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565),
+@@ -343,6 +384,10 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+
++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
++ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
++
+ { PIXMAN_OP_NONE },
+ };
+
+--
+1.6.6.1
+