From d0044bfbd596f22ed1560579ea6537b39f3dc1af Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka
Date: Thu, 29 Oct 2009 19:06:42 +0000
Subject: ARM: Don't emit prefetch code if prefetch distance is set to 0

It is now also possible to disable prefetch globally with a
configuration macro.
---
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index bca499a..35e6a7e 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -219,33 +219,33 @@
 vshrn.u16 d7, q2, #3
 vsli.u16 q2, q2, #5
 vshll.u8 q14, d16, #8
- add PF_X, PF_X, #8
+ PF add PF_X, PF_X, #8
 vshll.u8 q8, d19, #8
- tst PF_CTL, #0xF
+ PF tst PF_CTL, #0xF
 vsri.u8 d6, d6, #5
- addne PF_X, PF_X, #8
+ PF addne PF_X, PF_X, #8
 vmvn.8 d3, d3
- subne PF_CTL, PF_CTL, #1
+ PF subne PF_CTL, PF_CTL, #1
 vsri.u8 d7, d7, #6
 vshrn.u16 d30, q2, #2
 vmull.u8 q10, d3, d6
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 vmull.u8 q11, d3, d7
 vmull.u8 q12, d3, d30
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 vsri.u16 q14, q8, #5
- cmp PF_X, ORIG_W
+ PF cmp PF_X, ORIG_W
 vshll.u8 q9, d18, #8
 vrshr.u16 q13, q10, #8
- subge PF_X, PF_X, ORIG_W
+ PF subge PF_X, PF_X, ORIG_W
 vrshr.u16 q3, q11, #8
 vrshr.u16 q15, q12, #8
- subges PF_CTL, PF_CTL, #0x10
+ PF subges PF_CTL, PF_CTL, #0x10
 vsri.u16 q14, q9, #11
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 vraddhn.u16 d20, q10, q13
 vraddhn.u16 d23, q11, q3
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 vraddhn.u16 d22, q12, q15
 vst1.16 {d28, d29}, [DST_W, :128]!
 .endm
@@ -323,20 +323,20 @@ generate_composite_function \

 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
 vsri.u16 q14, q8, #5
- add PF_X, PF_X, #8
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
 vld4.8 {d0, d1, d2, d3}, [SRC]!
- addne PF_X, PF_X, #8
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
 vsri.u16 q14, q9, #11
- cmp PF_X, ORIG_W
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 vshll.u8 q8, d1, #8
 vst1.16 {d28, d29}, [DST_W, :128]!
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
 vshll.u8 q14, d2, #8
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 vshll.u8 q9, d0, #8
 .endm

@@ -363,20 +363,20 @@ generate_composite_function \

 .macro pixman_composite_add_8000_8000_process_pixblock_tail_head
 vld1.8 {d0, d1, d2, d3}, [SRC]!
- add PF_X, PF_X, #32
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #32
+ PF tst PF_CTL, #0xF
 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- addne PF_X, PF_X, #32
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #32
+ PF subne PF_CTL, PF_CTL, #1
 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- cmp PF_X, ORIG_W
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
 vqadd.u8 q14, q0, q2
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 vqadd.u8 q15, q1, q3
 .endm

@@ -418,32 +418,32 @@ generate_composite_function \

 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
 vrshr.u16 q14, q8, #8
- add PF_X, PF_X, #8
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
 vrshr.u16 q15, q9, #8
 vrshr.u16 q12, q10, #8
 vrshr.u16 q13, q11, #8
- addne PF_X, PF_X, #8
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
 vraddhn.u16 d28, q14, q8
 vraddhn.u16 d29, q15, q9
- cmp PF_X, ORIG_W
+ PF cmp PF_X, ORIG_W
 vraddhn.u16 d30, q12, q10
 vraddhn.u16 d31, q13, q11
 vqadd.u8 q14, q0, q14
 vqadd.u8 q15, q1, q15
 vld4.8 {d0, d1, d2, d3}, [SRC]!
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 vmvn.8 d22, d3
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- subge PF_X, PF_X, ORIG_W
+ PF subge PF_X, PF_X, ORIG_W
 vmull.u8 q8, d22, d4
- subges PF_CTL, PF_CTL, #0x10
+ PF subges PF_CTL, PF_CTL, #0x10
 vmull.u8 q9, d22, d5
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 vmull.u8 q10, d22, d6
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 vmull.u8 q11, d22, d7
 .endm
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index d276ab9..a2941ae 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -58,6 +58,11 @@
 #define RESPECT_STRICT_ALIGNMENT 1

 /*
+ * If set to a nonzero value, prefetch is globally disabled
+ */
+#define PREFETCH_GLOBALLY_DISABLED 0
+
+/*
  * Definitions of supplementary pixld/pixst macros (for partial load/store of
  * pixel data)
  */
@@ -218,37 +223,43 @@
 * pixels processing like simple copy. Anyway, having prefetch is a must
 * when working with graphics data.
 */
+.macro PF a, x:vararg
+.if (ADVANCED_PREFETCH_ENABLED != 0) && (PREFETCH_GLOBALLY_DISABLED == 0)
+ a x
+.endif
+.endm
+
 .macro cache_preload std_increment, boost_increment
 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
 .if regs_shortage
- ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+ PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
 .endif
 .if std_increment != 0
- add PF_X, PF_X, #std_increment
+ PF add PF_X, PF_X, #std_increment
 .endif
- tst PF_CTL, #0xF
- addne PF_X, PF_X, #boost_increment
- subne PF_CTL, PF_CTL, #1
- cmp PF_X, ORIG_W
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #boost_increment
+ PF subne PF_CTL, PF_CTL, #1
+ PF cmp PF_X, ORIG_W
 .if src_bpp_shift >= 0
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 .endif
 .if dst_r_bpp != 0
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 .endif
 .if mask_bpp_shift >= 0
- pld [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
 .endif
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
 .if src_bpp_shift >= 0
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 .endif
 .if dst_r_bpp != 0
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 .endif
 .if mask_bpp_shift >= 0
- ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
 .endif
 .endif
 .endm
@@ -297,6 +308,12 @@ fname:
 PF_DST .req r12
 PF_MASK .req r14

+.if prefetch_distance == 0
+ .set ADVANCED_PREFETCH_ENABLED, 0
+.else
+ .set ADVANCED_PREFETCH_ENABLED, 1
+.endif
+
 .if mask_bpp == 0
 ORIG_W .req r7 /* saved original width */
 DUMMY .req r8 /* temporary register */
@@ -374,12 +391,12 @@ fname:
 ldr MASK_STRIDE, [sp, #52]
 .endif
 mov DST_R, DST_W
- mov PF_SRC, SRC
- mov PF_DST, DST_R
- mov PF_MASK, MASK
- mov PF_CTL, H, lsl #4
- /* pf_ctl = 10 | ((h - 1) << 4) */
- add PF_CTL, #(prefetch_distance - 0x10)
+ PF mov PF_SRC, SRC
+ PF mov PF_DST, DST_R
+ PF mov PF_MASK, MASK
+ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+ PF mov PF_CTL, H, lsl #4
+ PF add PF_CTL, #(prefetch_distance - 0x10)
 init

 .if regs_shortage
@@ -412,7 +429,7 @@ fname:
 .else
 add DST_R, DST_R, #lowbit
 .endif
- add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+ PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
 sub W, W, #(lowbit * 8 / dst_w_bpp)
 1:
 .endif
@@ -444,7 +461,7 @@ fname:
 (src_basereg - pixblock_size * src_bpp / 64), SRC
 pixld pixblock_size, mask_bpp, \
 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
- add PF_X, PF_X, #pixblock_size
+ PF add PF_X, PF_X, #pixblock_size
 process_pixblock_head
 cache_preload 0, pixblock_size
 subs W, W, #(pixblock_size * 2)
@@ -468,7 +485,7 @@ fname:
 pixld chunk_size, src_bpp, src_basereg, SRC
 pixld chunk_size, mask_bpp, mask_basereg, MASK
 pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
- add PF_X, PF_X, #chunk_size
+ PF add PF_X, PF_X, #chunk_size
 1:
 .endif
 .endr
--
cgit v0.8.2
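
Note on the mechanism. The PF wrapper added by this patch leans on two GNU
assembler features: formal parameters of a .macro are substituted into its
body (the patch's own "a x" body relies on this), and a trailing :vararg
parameter captures the remainder of the argument list, commas included. When
the enabling condition is false, the macro body emits nothing, so every
prefetch-related instruction routed through PF simply disappears from the
generated code. A minimal standalone sketch, not part of the patch, where
ENABLE_PF is a hypothetical stand-in for the combined
ADVANCED_PREFETCH_ENABLED / PREFETCH_GLOBALLY_DISABLED condition:

    .set ENABLE_PF, 1              @ change to 0: every PF line assembles to nothing

    .macro PF a, x:vararg
    .if ENABLE_PF != 0
    a x                            @ re-emit instruction "a" with its operands "x"
    .endif
    .endm

demo:
    PF add r2, r2, #8              @ expands to: add r2, r2, #8
    PF pld, [r0, r1, lsl #2]       @ expands to: pld [r0, r1, lsl #2]
    bx lr

This also explains the comma the patch inserts after pld (as in
"PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]"): it makes pld the first macro
argument on its own and hands the entire bracketed operand to the vararg.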
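
The PF_CTL packing described by the corrected comment can be checked with
concrete numbers; with hypothetical values prefetch_distance = 3 and H = 10:

    PF mov PF_CTL, H, lsl #4                    @ PF_CTL = 10 << 4 = 0xA0
    PF add PF_CTL, #(prefetch_distance - 0x10)  @ PF_CTL = 0xA0 + 3 - 0x10 = 0x93

and 0x93 == 3 | ((10 - 1) << 4), i.e. PF_CTL = prefetch_distance |
((h - 1) << 4), as the new comment states. Reading the surrounding code, the
low nibble appears to hold the not-yet-built-up part of the prefetch
distance: while it is nonzero, "addne PF_X, ..." advances the prefetch
position by an extra increment and "subne PF_CTL, PF_CTL, #1" retires one
unit of the budget. The upper bits count the remaining scanlines, decremented
by "subges PF_CTL, PF_CTL, #0x10" whenever PF_X wraps past ORIG_W; the
"ldrgeb DUMMY, [...]!" loads exist only for their writeback side effect of
advancing the prefetch base pointer by one stride.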