From f3e17872f5522e25da8e32de83e62bee8cc198d7 Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Mon, 7 Mar 2011 03:10:43 +0200 Subject: [PATCH 14/40] ARM: common macro for nearest scaling fast paths The code of nearest scaled 'src_0565_0565' function was generalized and moved to a common macro, so that it can be reused for other fast paths. --- pixman/pixman-arm-simd-asm.S | 60 +++++++++++++++++++++++++---------------- 1 files changed, 36 insertions(+), 24 deletions(-) diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S index dd1366d..a9775e2 100644 --- a/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman-arm-simd-asm.S @@ -331,15 +331,29 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 .endfunc /* - * Note: This function is only using armv4t instructions (not even armv6), + * Note: This code is only using armv5te instructions (not even armv6), * but is scheduled for ARM Cortex-A8 pipeline. So it might need to * be split into a few variants, tuned for each microarchitecture. * * TODO: In order to get good performance on ARM9/ARM11 cores (which don't * have efficient write combining), it needs to be changed to use 16-byte * aligned writes using STM instruction. + * + * Nearest scanline scaler macro template uses the following arguments: + * fname - name of the function to generate + * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes + * t - type suffix for LDR/STR instructions + * prefetch_distance - prefetch in the source image by that many + * pixels ahead + * prefetch_braking_distance - stop prefetching when that many pixels are + * remaining before the end of scanline */ -pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 + +.macro generate_nearest_scanline_func fname, bpp_shift, t, \ + prefetch_distance, \ + prefetch_braking_distance + +pixman_asm_function fname W .req r0 DST .req r1 SRC .req r2 @@ -352,35 +366,29 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 ldr UNIT_X, [sp] push {r4, r5, r6, r7} - mvn VXMASK, #1 + mvn VXMASK, #((1 << bpp_shift) - 1) /* define helper macro */ .macro scale_2_pixels - ldrh TMP1, [SRC, TMP1] - and TMP2, VXMASK, VX, lsr #15 + ldr&t TMP1, [SRC, TMP1] + and TMP2, VXMASK, VX, lsr #(16 - bpp_shift) add VX, VX, UNIT_X - strh TMP1, [DST], #2 + str&t TMP1, [DST], #(1 << bpp_shift) - ldrh TMP2, [SRC, TMP2] - and TMP1, VXMASK, VX, lsr #15 + ldr&t TMP2, [SRC, TMP2] + and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) add VX, VX, UNIT_X - strh TMP2, [DST], #2 + str&t TMP2, [DST], #(1 << bpp_shift) .endm - /* - * stop prefetch before reaching the end of scanline (a good behaving - * value selected based on some benchmarks with short scanlines) - */ - #define PREFETCH_BRAKING_DISTANCE 32 - /* now do the scaling */ - and TMP1, VXMASK, VX, lsr #15 + and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) add VX, VX, UNIT_X - subs W, #(8 + PREFETCH_BRAKING_DISTANCE) + subs W, W, #(8 + prefetch_braking_distance) blt 2f - /* set prefetch distance to 80 pixels ahead */ - add PF_OFFS, VX, UNIT_X, lsl #6 - add PF_OFFS, PF_OFFS, UNIT_X, lsl #4 + /* calculate prefetch offset */ + mov PF_OFFS, #prefetch_distance + mla PF_OFFS, UNIT_X, PF_OFFS, VX 1: /* main loop, process 8 pixels per iteration with prefetch */ subs W, W, #8 add PF_OFFS, UNIT_X, lsl #3 @@ -388,10 +396,10 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 scale_2_pixels scale_2_pixels scale_2_pixels - pld [SRC, PF_OFFS, lsr #15] + pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)] bge 1b 2: - subs W, #(4 - 8 - PREFETCH_BRAKING_DISTANCE) + subs W, W, #(4 - 8 - prefetch_braking_distance) blt 2f 1: /* process the remaining pixels */ scale_2_pixels @@ -404,8 +412,8 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 scale_2_pixels 2: tst W, #1 - ldrneh TMP1, [SRC, TMP1] - strneh TMP1, [DST], #2 + ldrne&t TMP1, [SRC, TMP1] + strne&t TMP1, [DST] /* cleanup helper macro */ .purgem scale_2_pixels .unreq DST @@ -421,3 +429,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6 pop {r4, r5, r6, r7} bx lr .endfunc +.endm + +generate_nearest_scanline_func \ + pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 -- 1.6.6.1