aboutsummaryrefslogtreecommitdiffstats
path: root/recipes/xorg-lib/pixman/over-8888-0565.patch
diff options
context:
space:
mode:
Diffstat (limited to 'recipes/xorg-lib/pixman/over-8888-0565.patch')
-rw-r--r-- recipes/xorg-lib/pixman/over-8888-0565.patch 296
1 files changed, 296 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman/over-8888-0565.patch b/recipes/xorg-lib/pixman/over-8888-0565.patch
new file mode 100644
index 0000000000..3e27094022
--- /dev/null
+++ b/recipes/xorg-lib/pixman/over-8888-0565.patch
@@ -0,0 +1,296 @@
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Mon, 27 Jul 2009 04:48:04 +0000 (+0300)
+Subject: ARM: NEON optimized version of composite_over_8888_0565
+X-Git-Url: http://siarhei.siamashka.name/gitweb/?p=pixman.git;a=commitdiff_plain;h=17d8ab82858511f212dfb30c347255393eb12b0c
+
+ARM: NEON optimized version of composite_over_8888_0565
+---
+
+diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
+index 9404c70..f1dcf1f 100644
+--- a/pixman/pixman-arm-neon.c
++++ b/pixman/pixman-arm-neon.c
+@@ -1447,6 +1447,274 @@ neon_composite_src_16_16 (pixman_implementation_t * impl,
+ }
+ }
+
++static inline void /* OVER-composite a w x h block of 32-bit 8888 pixels onto 16-bit 0565 pixels with NEON inline asm */
++neon_composite_over_8888_0565_internal (uint32_t *src, /* first source pixel, 4 bytes per pixel */
++ uint16_t *dst, /* first destination pixel, 2 bytes per pixel; read and written in place */
++ int32_t w, /* pixels per row */
++ int32_t h, /* number of rows; the asm does nothing when h <= 0 */
++ int32_t src_stride, /* source row pitch in uint32_t units */
++ int32_t dst_stride) /* destination row pitch in uint16_t units */
++{
++ int32_t dst_newline_delta = (dst_stride - w) * 2; /* bytes from the end of one dst row to the start of the next */
++ int32_t src_newline_delta = (src_stride - w) * 4; /* bytes from the end of one src row to the start of the next */
++ asm volatile (
++
++ ".macro process_pixblock_head size\n" /* head: optional load, unpack to planar 8-bit, scale dst by inverted src alpha */
++ /* load pixel data from memory (a size other than 1, 2, 4 or 8 loads nothing) */
++ " .if \\size == 8\n"
++ " vld1.32 {d0, d1, d2, d3}, [%[src]]!\n"
++ " vld1.16 {d4, d5}, [%[dst_r]]!\n"
++ " .elseif \\size == 4\n"
++ " vld1.32 {d0, d1}, [%[src]]!\n"
++ " vld1.16 {d4}, [%[dst_r]]!\n"
++ " .elseif \\size == 2\n"
++ " vld1.32 {d0}, [%[src]]!\n"
++ " vld1.16 {d4[0]}, [%[dst_r]]!\n"
++ " vld1.16 {d4[1]}, [%[dst_r]]!\n"
++ " .elseif \\size == 1\n"
++ " vld1.32 {d0[0]}, [%[src]]!\n"
++ " vld1.16 {d4[0]}, [%[dst_r]]!\n"
++ " .endif\n"
++ /* deinterleave and convert both source and destination
++ to "planar" 8-bit format */
++ " vshrn.u16 d16, q2, #8\n" /* d16 = bits 15..8 of each dst pixel (red in the top 5 bits) */
++ " vuzp.8 d0, d1\n"
++ " vshrn.u16 d17, q2, #3\n" /* d17 = bits 10..3 of each dst pixel (green in the top 6 bits) */
++ " vuzp.8 d2, d3\n"
++ " vsli.u16 q2, q2, #5\n" /* copy the low 5 (blue) bits up to bits 9..5 */
++ " vuzp.8 d1, d3\n"
++ " vsri.u8 d16, d16, #5\n" /* fill the low 3 bits of red from its top bits */
++ " vuzp.8 d0, d2\n"
++ " vmvn.8 d3, d3\n" /* d3 = bitwise NOT of source alpha, i.e. 255 - alpha */
++ " vsri.u8 d17, d17, #6\n" /* fill the low 2 bits of green from its top bits */
++ " vshrn.u16 d18, q2, #2\n" /* d18 = blue widened to 8 bits */
++ /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha (inverted by the vmvn above) */
++ /* destination: d16 - red, d17 - green, d18 - blue */
++ /* now do alpha blending */
++ " vmull.u8 q10, d3, d16\n" /* q10 = (255 - alpha) * dst red, widened to 16 bits */
++ "pld [%[src], #128]\n"
++ " vmull.u8 q11, d3, d17\n" /* q11 = (255 - alpha) * dst green */
++ "pld [%[dst_r], #64]\n"
++ " vmull.u8 q12, d3, d18\n" /* q12 = (255 - alpha) * dst blue */
++ " vrshr.u16 q13, q10, #8\n" /* vrshr + vraddhn below compute (x + ((x + 128) >> 8) + 128) >> 8, a rounded division of x by 255 */
++ " vrshr.u16 q8, q11, #8\n"
++ " vrshr.u16 q9, q12, #8\n"
++ " vraddhn.u16 d20, q10, q13\n" /* d20 = scaled dst red */
++ " vraddhn.u16 d21, q11, q8\n" /* d21 = scaled dst green */
++ " vraddhn.u16 d22, q12, q9\n" /* d22 = scaled dst blue */
++ ".endm\n"
++
++ ".macro process_pixblock_tail size\n" /* tail: add source, pack to 16 bits, optional store */
++ /* saturating add of source and scaled destination; result in d28, d29, d30 (R, G, B) */
++ " vqadd.u8 d28, d2, d20\n"
++ " vqadd.u8 d29, d1, d21\n"
++ " vqadd.u8 d30, d0, d22\n"
++ /* convert it to r5g6b5: each channel goes to the top of a 16-bit lane, then green and blue are shifted in below red */
++ " vshll.u8 q3, d28, #8\n"
++ " vshll.u8 q14, d29, #8\n"
++ " vshll.u8 q15, d30, #8\n"
++ " vsri.u16 q3, q14, #5\n"
++ " vsri.u16 q3, q15, #11\n"
++ /* store pixel data to memory (a size other than 1, 2, 4 or 8 stores nothing) */
++ " .if \\size == 8\n"
++ " vst1.16 {d6, d7}, [%[dst_w], :128]!\n"
++ " .elseif \\size == 4\n"
++ " vst1.16 {d6}, [%[dst_w]]!\n"
++ " .elseif \\size == 2\n"
++ " vst1.16 {d6[0]}, [%[dst_w]]!\n"
++ " vst1.16 {d6[1]}, [%[dst_w]]!\n"
++ " .elseif \\size == 1\n"
++ " vst1.16 {d6[0]}, [%[dst_w]]!\n"
++ " .endif\n"
++ ".endm\n"
++
++ /* "tail" of the previous block and "head" of the next block
++ are merged and interleaved for better instructions scheduling */
++ ".macro process_pixblock_tail_head_8\n" /* always handles 8 pixels; uses :128 aligned dst loads and stores */
++ " vqadd.u8 d28, d2, d20\n"
++ " vld1.16 {d4, d5}, [%[dst_r], :128]!\n"
++ " vqadd.u8 d29, d1, d21\n" /* TODO: try to join these into a */
++ " vqadd.u8 d30, d0, d22\n" /* single 128-bit operation */
++ " vshrn.u16 d16, q2, #8\n"
++ " vld1.32 {d0, d1, d2, d3}, [%[src]]!\n" /* TODO: maybe split */
++ " vshrn.u16 d17, q2, #3\n"
++ " vsli.u16 q2, q2, #5\n"
++ " vuzp.8 d0, d1\n"
++ " vshll.u8 q3, d28, #8\n"
++ " vuzp.8 d2, d3\n"
++ " vshll.u8 q14, d29, #8\n"
++ " vuzp.8 d1, d3\n"
++ " vsri.u8 d16, d16, #5\n"
++ " vuzp.8 d0, d2\n"
++ " vmvn.8 d3, d3\n"
++ " vsri.u8 d17, d17, #6\n"
++ " vshrn.u16 d18, q2, #2\n"
++ " vmull.u8 q10, d3, d16\n"
++ "pld [%[src], #128]\n"
++ " vmull.u8 q11, d3, d17\n"
++ "pld [%[dst_r], #64]\n"
++ " vmull.u8 q12, d3, d18\n"
++ " vsri.u16 d6, d28, #5\n" /* d28/d29 are the halves of q14: green insert done per 64-bit half */
++ " vsri.u16 d7, d29, #5\n"
++ " vshll.u8 q15, d30, #8\n"
++ " vrshr.u16 q13, q10, #8\n"
++ " vrshr.u16 q8, q11, #8\n"
++ " vrshr.u16 q9, q12, #8\n"
++ " vsri.u16 d6, d30, #11\n" /* d30/d31 are the halves of q15: blue insert done per 64-bit half */
++ " vsri.u16 d7, d31, #11\n"
++ " vraddhn.u16 d20, q10, q13\n"
++ " vraddhn.u16 d21, q11, q8\n"
++ " vraddhn.u16 d22, q12, q9\n"
++ " vst1.16 {d6, d7}, [%[dst_w], :128]!\n"
++ ".endm\n"
++
++ "subs %[h], %[h], #1\n" /* h counts rows down; skip everything when it goes negative */
++ "blt 9f\n"
++ "0:\n" /* start of a row */
++ "cmp %[w], #8\n"
++ "blt 8f\n" /* fewer than 8 pixels: leftover code only */
++
++ /* ensure 16 byte alignment of the destination buffer by handling 1, 2 and/or 4 leading pixels as one partial block */
++ "tst %[dst_r], #0xF\n"
++ "beq 2f\n"
++ "tst %[dst_r], #2\n"
++ "beq 1f\n"
++ "vld1.32 {d3[0]}, [%[src]]!\n"
++ "vld1.16 {d5[2]}, [%[dst_r]]!\n"
++ "sub %[w], %[w], #1\n"
++ "1:\n"
++ "tst %[dst_r], #4\n"
++ "beq 1f\n"
++ "vld1.32 {d2}, [%[src]]!\n"
++ "vld1.16 {d5[0]}, [%[dst_r]]!\n"
++ "vld1.16 {d5[1]}, [%[dst_r]]!\n"
++ "sub %[w], %[w], #2\n"
++ "1:\n"
++ "tst %[dst_r], #8\n"
++ "beq 1f\n"
++ "vld1.32 {d0, d1}, [%[src]]!\n"
++ "vld1.16 {d4}, [%[dst_r]]!\n"
++ "sub %[w], %[w], #4\n"
++ "1:\n"
++ "process_pixblock_head -1\n" /* size -1: no load in the macro, data was loaded above */
++ "process_pixblock_tail -1\n" /* size -1: no store in the macro, partial stores follow */
++ "tst %[dst_w], #2\n"
++ "beq 1f\n"
++ "vst1.16 {d7[2]}, [%[dst_w]]!\n"
++ "1:\n"
++ "tst %[dst_w], #4\n"
++ "beq 1f\n"
++ "vst1.16 {d7[0]}, [%[dst_w]]!\n"
++ "vst1.16 {d7[1]}, [%[dst_w]]!\n"
++ "1:\n"
++ "tst %[dst_w], #8\n"
++ "beq 2f\n"
++ "vst1.16 {d6}, [%[dst_w]]!\n"
++ "2:\n"
++
++ "subs %[w], %[w], #8\n" /* from here on w holds the remaining count minus 8 */
++ "blt 8f\n"
++ "process_pixblock_head 8\n"
++ "subs %[w], %[w], #8\n"
++ "blt 2f\n"
++ "1:\n" /* innermost pipelined loop */
++ "process_pixblock_tail_head_8\n"
++ "subs %[w], %[w], #8\n"
++ "bge 1b\n"
++ "2:\n"
++ "process_pixblock_tail 8\n" /* finish the last full block */
++
++ "8:\n"
++ /* process up to 7 remaining pixels */
++ "tst %[w], #7\n" /* w may be negative here; only its low 3 bits are examined */
++ "beq 2f\n"
++ "tst %[w], #4\n"
++ "beq 1f\n"
++ "vld1.32 {d0, d1}, [%[src]]!\n"
++ "vld1.16 {d4}, [%[dst_r]]!\n"
++ "1:\n"
++ "tst %[w], #2\n"
++ "beq 1f\n"
++ "vld1.32 {d2}, [%[src]]!\n"
++ "vld1.16 {d5[0]}, [%[dst_r]]!\n"
++ "vld1.16 {d5[1]}, [%[dst_r]]!\n"
++ "1:\n"
++ "tst %[w], #1\n"
++ "beq 1f\n"
++ "vld1.32 {d3[0]}, [%[src]]!\n"
++ "vld1.16 {d5[2]}, [%[dst_r]]!\n"
++ "1:\n"
++
++ "process_pixblock_head -1\n" /* size -1: no load in the macro, data was loaded above */
++ "process_pixblock_tail -1\n" /* size -1: no store in the macro, partial stores follow */
++
++ "tst %[w], #4\n"
++ "beq 1f\n"
++ "vst1.16 {d6}, [%[dst_w]]!\n"
++ "1:\n"
++ "tst %[w], #2\n"
++ "beq 1f\n"
++ "vst1.16 {d7[0]}, [%[dst_w]]!\n"
++ "vst1.16 {d7[1]}, [%[dst_w]]!\n"
++ "1:\n"
++ "tst %[w], #1\n"
++ "beq 2f\n"
++ "vst1.16 {d7[2]}, [%[dst_w]]!\n"
++ "2:\n"
++
++ "add %[src], %[src], %[src_newline_delta]\n" /* step all three pointers to the next row */
++ "add %[dst_r], %[dst_r], %[dst_newline_delta]\n"
++ "add %[dst_w], %[dst_w], %[dst_newline_delta]\n"
++ "mov %[w], %[orig_w]\n" /* reload the per-row pixel counter */
++ "subs %[h], %[h], #1\n"
++ "bge 0b\n"
++ "9:\n"
++ ".purgem process_pixblock_head\n" /* undefine the assembler macros defined above */
++ ".purgem process_pixblock_tail\n"
++ ".purgem process_pixblock_tail_head_8\n"
++
++ : [src] "+&r" (src), [dst_r] "+&r" (dst), [dst_w] "+&r" (dst), /* NOTE(review): dst_r and dst_w are separate read-write operands bound to the same variable dst - confirm the compiler handles this as intended */
++ [w] "+&r" (w), [h] "+&r" (h)
++ : [dst_newline_delta] "r" (dst_newline_delta),
++ [src_newline_delta] "r" (src_newline_delta), [orig_w] "r" (w) /* orig_w: initial w, used to reload the counter for each row */
++ : "cc", "memory",
++ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
++ /* "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", */
++ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
++ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
++ );
++}
++
++static void /* composite entry point: fetches line pointers and strides, then calls the NEON routine */
++neon_composite_over_8888_0565 (pixman_implementation_t *imp, /* not used in this function */
++ pixman_op_t op, /* not used in this function */
++ pixman_image_t * src_image,
++ pixman_image_t * mask_image, /* not used in this function */
++ pixman_image_t * dst_image,
++ int32_t src_x,
++ int32_t src_y,
++ int32_t mask_x, /* not used in this function */
++ int32_t mask_y, /* not used in this function */
++ int32_t dest_x,
++ int32_t dest_y,
++ int32_t width,
++ int32_t height)
++{
++ uint16_t *dst_line;
++ uint32_t *src_line;
++ int32_t dst_stride, src_stride;
++
++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); /* presumably sets src_stride and src_line for (src_x, src_y); macro is defined elsewhere - verify */
++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); /* presumably sets dst_stride and dst_line for (dest_x, dest_y) - verify */
++
++ neon_composite_over_8888_0565_internal (src_line, /* all per-pixel work happens in the inline asm routine */
++ dst_line,
++ width,
++ height,
++ src_stride,
++ dst_stride);
++}
++
+ #endif /* USE_GCC_INLINE_ASM */
+
+ static void
+@@ -1908,6 +2176,8 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] =
+ #ifdef USE_GCC_INLINE_ASM
+ { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 },
++ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 },
++ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 },
+ #endif
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },