diff options
Diffstat (limited to 'recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch')
-rw-r--r-- | recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch | 288 |
1 files changed, 288 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch b/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch new file mode 100644 index 0000000000..4d411625ae --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch @@ -0,0 +1,288 @@ +From 17feaa9c50bb8521b0366345efe181bd99754957 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Tue, 22 Feb 2011 18:45:03 +0200 +Subject: [PATCH 10/40] ARM: NEON optimization for bilinear scaled 'src_8888_8888' + +Initial NEON optimization for bilinear scaling. It can probably be +improved further. + +Benchmark on ARM Cortex-A8: + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=6.70 MPix/s + after: op=1, src=20028888, dst=20028888, speed=44.27 MPix/s +--- + pixman/pixman-arm-neon-asm.S | 197 ++++++++++++++++++++++++++++++++++++++++++ + pixman/pixman-arm-neon.c | 45 ++++++++++ + 2 files changed, 242 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 47daf45..c168e10 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -2391,3 +2391,200 @@ generate_composite_function_nearest_scanline \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++/* Supplementary macro for setting function attributes */ ++.macro pixman_asm_function fname ++ .func fname ++ .global fname ++#ifdef __ELF__ ++ .hidden fname ++ .type fname, %function ++#endif ++fname: ++.endm ++ ++.macro bilinear_interpolate_last_pixel ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d0}, [TMP1] ++ vshr.u16 d30, d24, #8 ++ vld1.32 {d1}, [TMP2] ++ 
vmull.u8 q1, d0, d28 ++ vmlal.u8 q1, d1, d29 ++ /* 5 cycles bubble */ ++ vshll.u16 q0, d2, #8 ++ vmlsl.u16 q0, d2, d30 ++ vmlal.u16 q0, d3, d30 ++ /* 5 cycles bubble */ ++ vshrn.u32 d0, q0, #16 ++ /* 3 cycles bubble */ ++ vmovn.u16 d0, q0 ++ /* 1 cycle bubble */ ++ vst1.32 {d0[0]}, [OUT, :32]! ++.endm ++ ++.macro bilinear_interpolate_two_pixels ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d0}, [TMP1] ++ vld1.32 {d1}, [TMP2] ++ vmull.u8 q1, d0, d28 ++ vmlal.u8 q1, d1, d29 ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d20}, [TMP1] ++ vld1.32 {d21}, [TMP2] ++ vmull.u8 q11, d20, d28 ++ vmlal.u8 q11, d21, d29 ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ vshll.u16 q0, d2, #8 ++ vmlsl.u16 q0, d2, d30 ++ vmlal.u16 q0, d3, d30 ++ vshll.u16 q10, d22, #8 ++ vmlsl.u16 q10, d22, d31 ++ vmlal.u16 q10, d23, d31 ++ vshrn.u32 d30, q0, #16 ++ vshrn.u32 d31, q10, #16 ++ vmovn.u16 d0, q15 ++ vst1.32 {d0}, [OUT]! 
++.endm ++ ++.macro bilinear_interpolate_four_pixels ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d0}, [TMP1] ++ vld1.32 {d1}, [TMP2] ++ vmull.u8 q1, d0, d28 ++ vmlal.u8 q1, d1, d29 ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d20}, [TMP1] ++ vld1.32 {d21}, [TMP2] ++ vmull.u8 q11, d20, d28 ++ vmlal.u8 q11, d21, d29 ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ vshll.u16 q0, d2, #8 ++ vmlsl.u16 q0, d2, d30 ++ vmlal.u16 q0, d3, d30 ++ vshll.u16 q10, d22, #8 ++ vmlsl.u16 q10, d22, d31 ++ vmlal.u16 q10, d23, d31 ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d4}, [TMP1] ++ vld1.32 {d5}, [TMP2] ++ vmull.u8 q3, d4, d28 ++ vmlal.u8 q3, d5, d29 ++ mov TMP1, X, asr #16 ++ mov TMP2, X, asr #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, asl #2 ++ add TMP2, BOTTOM, TMP2, asl #2 ++ vld1.32 {d16}, [TMP1] ++ vld1.32 {d17}, [TMP2] ++ vmull.u8 q9, d16, d28 ++ vmlal.u8 q9, d17, d29 ++ vshr.u16 q15, q12, #8 ++ vadd.u16 q12, q12, q13 ++ vshll.u16 q2, d6, #8 ++ vmlsl.u16 q2, d6, d30 ++ vmlal.u16 q2, d7, d30 ++ vshll.u16 q8, d18, #8 ++ vmlsl.u16 q8, d18, d31 ++ vmlal.u16 q8, d19, d31 ++ vshrn.u32 d0, q0, #16 ++ vshrn.u32 d1, q10, #16 ++ vshrn.u32 d4, q2, #16 ++ vshrn.u32 d5, q8, #16 ++ vmovn.u16 d0, q0 ++ vmovn.u16 d1, q2 ++ vst1.32 {d0, d1}, [OUT]! 
++.endm ++ ++ ++/* ++ * pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out, ++ * const uint32_t * top, ++ * const uint32_t * bottom, ++ * int wt, ++ * int wb, ++ * pixman_fixed_t x, ++ * pixman_fixed_t ux, ++ * int width) ++ */ ++ ++pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon ++ OUT .req r0 ++ TOP .req r1 ++ BOTTOM .req r2 ++ WT .req r3 ++ WB .req r4 ++ X .req r5 ++ UX .req r6 ++ WIDTH .req ip ++ TMP1 .req r3 ++ TMP2 .req r4 ++ ++ mov ip, sp ++ push {r4, r5, r6, r7} ++ ldmia ip, {WB, X, UX, WIDTH} ++ ++ cmp WIDTH, #0 ++ ble 3f ++ vdup.u16 q12, X ++ vdup.u16 q13, UX ++ vdup.u8 d28, WT ++ vdup.u8 d29, WB ++ vadd.u16 d25, d25, d26 ++ vadd.u16 q13, q13, q13 ++ ++ subs WIDTH, WIDTH, #4 ++ blt 1f ++0: ++ bilinear_interpolate_four_pixels ++ subs WIDTH, WIDTH, #4 ++ bge 0b ++1: ++ tst WIDTH, #2 ++ beq 2f ++ bilinear_interpolate_two_pixels ++2: ++ tst WIDTH, #1 ++ beq 3f ++ bilinear_interpolate_last_pixel ++3: ++ pop {r4, r5, r6, r7} ++ bx lr ++ ++ .unreq OUT ++ .unreq TOP ++ .unreq BOTTOM ++ .unreq WT ++ .unreq WB ++ .unreq X ++ .unreq UX ++ .unreq WIDTH ++ .unreq TMP1 ++ .unreq TMP2 ++.endfunc +diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c +index 3e0c0d1..c7c0254 100644 +--- a/pixman/pixman-arm-neon.c ++++ b/pixman/pixman-arm-neon.c +@@ -232,6 +232,47 @@ pixman_blt_neon (uint32_t *src_bits, + } + } + ++void ++pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out, ++ const uint32_t * top, ++ const uint32_t * bottom, ++ int wt, ++ int wb, ++ pixman_fixed_t x, ++ pixman_fixed_t ux, ++ int width); ++ ++static force_inline void ++scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst, ++ const uint32_t * mask, ++ const uint32_t * src_top, ++ const uint32_t * src_bottom, ++ int32_t w, ++ int wt, ++ int wb, ++ pixman_fixed_t vx, ++ pixman_fixed_t unit_x, ++ pixman_fixed_t max_vx, ++ pixman_bool_t zero_src) ++{ ++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, ++ src_bottom, 
wt, wb, ++ vx, unit_x, w); ++} ++ ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC, ++ scaled_bilinear_scanline_neon_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ COVER, FALSE, FALSE) ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC, ++ scaled_bilinear_scanline_neon_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ PAD, FALSE, FALSE) ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC, ++ scaled_bilinear_scanline_neon_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ NONE, FALSE, FALSE) ++ + static const pixman_fast_path_t arm_neon_fast_paths[] = + { + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565), +@@ -343,6 +384,10 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565), + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565), + ++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888), ++ + { PIXMAN_OP_NONE }, + }; + +-- +1.6.6.1 + |