From d0044bfbd596f22ed1560579ea6537b39f3dc1af Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka
Date: Thu, 29 Oct 2009 19:06:42 +0000
Subject: ARM: Don't emit prefetch code if prefetch distance is set to 0

It is now also possible to disable prefetch globally with a
configuration macro.
---
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index bca499a..35e6a7e 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -219,33 +219,33 @@
 vshrn.u16 d7, q2, #3
 vsli.u16 q2, q2, #5
 vshll.u8 q14, d16, #8
- add PF_X, PF_X, #8
+ PF add PF_X, PF_X, #8
 vshll.u8 q8, d19, #8
- tst PF_CTL, #0xF
+ PF tst PF_CTL, #0xF
 vsri.u8 d6, d6, #5
- addne PF_X, PF_X, #8
+ PF addne PF_X, PF_X, #8
 vmvn.8 d3, d3
- subne PF_CTL, PF_CTL, #1
+ PF subne PF_CTL, PF_CTL, #1
 vsri.u8 d7, d7, #6
 vshrn.u16 d30, q2, #2
 vmull.u8 q10, d3, d6
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 vmull.u8 q11, d3, d7
 vmull.u8 q12, d3, d30
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 vsri.u16 q14, q8, #5
- cmp PF_X, ORIG_W
+ PF cmp PF_X, ORIG_W
 vshll.u8 q9, d18, #8
 vrshr.u16 q13, q10, #8
- subge PF_X, PF_X, ORIG_W
+ PF subge PF_X, PF_X, ORIG_W
 vrshr.u16 q3, q11, #8
 vrshr.u16 q15, q12, #8
- subges PF_CTL, PF_CTL, #0x10
+ PF subges PF_CTL, PF_CTL, #0x10
 vsri.u16 q14, q9, #11
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 vraddhn.u16 d20, q10, q13
 vraddhn.u16 d23, q11, q3
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 vraddhn.u16 d22, q12, q15
 vst1.16 {d28, d29}, [DST_W, :128]!
 .endm
@@ -323,20 +323,20 @@ generate_composite_function \

 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
 vsri.u16 q14, q8, #5
- add PF_X, PF_X, #8
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
 vld4.8 {d0, d1, d2, d3}, [SRC]!
- addne PF_X, PF_X, #8
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
 vsri.u16 q14, q9, #11
- cmp PF_X, ORIG_W
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 vshll.u8 q8, d1, #8
 vst1.16 {d28, d29}, [DST_W, :128]!
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
 vshll.u8 q14, d2, #8
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 vshll.u8 q9, d0, #8
 .endm

@@ -363,20 +363,20 @@ generate_composite_function \

 .macro pixman_composite_add_8000_8000_process_pixblock_tail_head
 vld1.8 {d0, d1, d2, d3}, [SRC]!
- add PF_X, PF_X, #32
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #32
+ PF tst PF_CTL, #0xF
 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- addne PF_X, PF_X, #32
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #32
+ PF subne PF_CTL, PF_CTL, #1
 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- cmp PF_X, ORIG_W
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
 vqadd.u8 q14, q0, q2
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 vqadd.u8 q15, q1, q3
 .endm

@@ -418,32 +418,32 @@ generate_composite_function \

 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
 vrshr.u16 q14, q8, #8
- add PF_X, PF_X, #8
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
 vrshr.u16 q15, q9, #8
 vrshr.u16 q12, q10, #8
 vrshr.u16 q13, q11, #8
- addne PF_X, PF_X, #8
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
 vraddhn.u16 d28, q14, q8
 vraddhn.u16 d29, q15, q9
- cmp PF_X, ORIG_W
+ PF cmp PF_X, ORIG_W
 vraddhn.u16 d30, q12, q10
 vraddhn.u16 d31, q13, q11
 vqadd.u8 q14, q0, q14
 vqadd.u8 q15, q1, q15
 vld4.8 {d0, d1, d2, d3}, [SRC]!
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 vmvn.8 d22, d3
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- subge PF_X, PF_X, ORIG_W
+ PF subge PF_X, PF_X, ORIG_W
 vmull.u8 q8, d22, d4
- subges PF_CTL, PF_CTL, #0x10
+ PF subges PF_CTL, PF_CTL, #0x10
 vmull.u8 q9, d22, d5
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 vmull.u8 q10, d22, d6
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 vmull.u8 q11, d22, d7
 .endm
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index d276ab9..a2941ae 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -58,6 +58,11 @@
 #define RESPECT_STRICT_ALIGNMENT 1

 /*
+ * If set to a nonzero value, prefetch is globally disabled
+ */
+#define PREFETCH_GLOBALLY_DISABLED 0
+
+/*
  * Definitions of supplementary pixld/pixst macros (for partial load/store of
  * pixel data)
  */
@@ -218,37 +223,43 @@
 * pixels processing like simple copy. Anyway, having prefetch is a must
 * when working with graphics data.
 */
+.macro PF a, x:vararg
+.if (ADVANCED_PREFETCH_ENABLED != 0) && (PREFETCH_GLOBALLY_DISABLED == 0)
+ a x
+.endif
+.endm
+
 .macro cache_preload std_increment, boost_increment
 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
 .if regs_shortage
- ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+ PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
 .endif
 .if std_increment != 0
- add PF_X, PF_X, #std_increment
+ PF add PF_X, PF_X, #std_increment
 .endif
- tst PF_CTL, #0xF
- addne PF_X, PF_X, #boost_increment
- subne PF_CTL, PF_CTL, #1
- cmp PF_X, ORIG_W
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #boost_increment
+ PF subne PF_CTL, PF_CTL, #1
+ PF cmp PF_X, ORIG_W
 .if src_bpp_shift >= 0
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 .endif
 .if dst_r_bpp != 0
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 .endif
 .if mask_bpp_shift >= 0
- pld [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
 .endif
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
 .if src_bpp_shift >= 0
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 .endif
 .if dst_r_bpp != 0
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 .endif
 .if mask_bpp_shift >= 0
- ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
 .endif
 .endif
 .endm
@@ -297,6 +308,12 @@ fname:
 PF_DST .req r12
 PF_MASK .req r14

+.if prefetch_distance == 0
+ .set ADVANCED_PREFETCH_ENABLED, 0
+.else
+ .set ADVANCED_PREFETCH_ENABLED, 1
+.endif
+
 .if mask_bpp == 0
 ORIG_W .req r7 /* saved original width */
 DUMMY .req r8 /* temporary register */
@@ -374,12 +391,12 @@ fname:
 ldr MASK_STRIDE, [sp, #52]
 .endif
 mov DST_R, DST_W
- mov PF_SRC, SRC
- mov PF_DST, DST_R
- mov PF_MASK, MASK
- mov PF_CTL, H, lsl #4
- /* pf_ctl = 10 | ((h - 1) << 4) */
- add PF_CTL, #(prefetch_distance - 0x10)
+ PF mov PF_SRC, SRC
+ PF mov PF_DST, DST_R
+ PF mov PF_MASK, MASK
+ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+ PF mov PF_CTL, H, lsl #4
+ PF add PF_CTL, #(prefetch_distance - 0x10)
 init

 .if regs_shortage
@@ -412,7 +429,7 @@ fname:
 .else
 add DST_R, DST_R, #lowbit
 .endif
- add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+ PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
 sub W, W, #(lowbit * 8 / dst_w_bpp)
 1:
 .endif
@@ -444,7 +461,7 @@ fname:
 (src_basereg - pixblock_size * src_bpp / 64), SRC
 pixld pixblock_size, mask_bpp, \
 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
- add PF_X, PF_X, #pixblock_size
+ PF add PF_X, PF_X, #pixblock_size
 process_pixblock_head
 cache_preload 0, pixblock_size
 subs W, W, #(pixblock_size * 2)
@@ -468,7 +485,7 @@ fname:
 pixld chunk_size, src_bpp, src_basereg, SRC
 pixld chunk_size, mask_bpp, mask_basereg, MASK
 pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
- add PF_X, PF_X, #chunk_size
+ PF add PF_X, PF_X, #chunk_size
 1:
 .endif
 .endr
--
cgit v0.8.2
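
Note on the mechanism. The PF wrapper added by this patch leans on two GNU
assembler features: formal parameters of a .macro are substituted into its
body (the patch's own "a x" body relies on this), and a trailing :vararg
parameter captures the remainder of the argument list, commas included. When
the enabling condition is false, the macro body emits nothing, so every
prefetch-related instruction routed through PF simply disappears from the
generated code. A minimal standalone sketch, not part of the patch, where
ENABLE_PF is a hypothetical stand-in for the combined
ADVANCED_PREFETCH_ENABLED / PREFETCH_GLOBALLY_DISABLED condition:

    .set ENABLE_PF, 1              @ change to 0: every PF line assembles to nothing

    .macro PF a, x:vararg
    .if ENABLE_PF != 0
    a x                            @ re-emit instruction "a" with its operands "x"
    .endif
    .endm

demo:
    PF add r2, r2, #8              @ expands to: add r2, r2, #8
    PF pld, [r0, r1, lsl #2]       @ expands to: pld [r0, r1, lsl #2]
    bx lr

This also explains the comma the patch inserts after pld (as in
"PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]"): it makes pld the first macro
argument on its own and hands the entire bracketed operand to the vararg.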
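
The PF_CTL packing described by the corrected comment can be checked with
concrete numbers; with hypothetical values prefetch_distance = 3 and H = 10:

    PF mov PF_CTL, H, lsl #4                    @ PF_CTL = 10 << 4 = 0xA0
    PF add PF_CTL, #(prefetch_distance - 0x10)  @ PF_CTL = 0xA0 + 3 - 0x10 = 0x93

and 0x93 == 3 | ((10 - 1) << 4), i.e. PF_CTL = prefetch_distance |
((h - 1) << 4), as the new comment states. Reading the surrounding code, the
low nibble appears to hold the not-yet-built-up part of the prefetch
distance: while it is nonzero, "addne PF_X, ..." advances the prefetch
position by an extra increment and "subne PF_CTL, PF_CTL, #1" retires one
unit of the budget. The upper bits count the remaining scanlines, decremented
by "subges PF_CTL, PF_CTL, #0x10" whenever PF_X wraps past ORIG_W; the
"ldrgeb DUMMY, [...]!" loads exist only for their writeback side effect of
advancing the prefetch base pointer by one stride.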