diff options
Diffstat (limited to 'recipes/xorg-lib/pixman')
17 files changed, 0 insertions, 5847 deletions
diff --git a/recipes/xorg-lib/pixman/0001-Generic-C-implementation-of-pixman_blt-with-overlapp.patch b/recipes/xorg-lib/pixman/0001-Generic-C-implementation-of-pixman_blt-with-overlapp.patch deleted file mode 100644 index a2cda2438e..0000000000 --- a/recipes/xorg-lib/pixman/0001-Generic-C-implementation-of-pixman_blt-with-overlapp.patch +++ /dev/null @@ -1,114 +0,0 @@ -From 8ea1a333de202018a862a7b04b94479d3109274b Mon Sep 17 00:00:00 2001 -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Tue, 16 Mar 2010 16:55:28 +0100 -Subject: [PATCH 1/5] Generic C implementation of pixman_blt with overlapping support - -Uses memcpy/memmove functions to copy pixels, can handle the -case when both source and destination areas are in the same -image (this is useful for scrolling). - -It is assumed that copying direction is only important when -using the same image for both source and destination (and -src_stride == dst_stride). Copying direction is undefined -for the images with different source and destination stride -which happen to be in the overlapped areas (but this is an -unrealistic case anyway). ---- - pixman/pixman-general.c | 21 ++++++++++++++++++--- - pixman/pixman-private.h | 43 +++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 61 insertions(+), 3 deletions(-) - -diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c -index bddf79a..f525744 100644 ---- a/pixman/pixman-general.c -+++ b/pixman/pixman-general.c -@@ -285,9 +285,24 @@ general_blt (pixman_implementation_t *imp, - int width, - int height) - { -- /* We can't blit unless we have sse2 or mmx */ -- -- return FALSE; -+ uint8_t *dst_bytes = (uint8_t *)dst_bits; -+ uint8_t *src_bytes = (uint8_t *)src_bits; -+ int bpp; -+ -+ if (src_bpp != dst_bpp || src_bpp & 7) -+ return FALSE; -+ -+ bpp = src_bpp >> 3; -+ width *= bpp; -+ src_stride *= 4; -+ dst_stride *= 4; -+ pixman_blt_helper (src_bytes + src_y * src_stride + src_x * bpp, -+ dst_bytes + dst_y * dst_stride + dst_x * bpp, -+ src_stride, -+ dst_stride, -+ width, -+ height); -+ return TRUE; - } - - static pixman_bool_t -diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h -index d5767af..eeb677d 100644 ---- a/pixman/pixman-private.h -+++ b/pixman/pixman-private.h -@@ -10,6 +10,7 @@ - - #include "pixman.h" - #include <time.h> -+#include <string.h> - #include <assert.h> - #include <stdio.h> - #include <string.h> -@@ -867,4 +868,46 @@ void pixman_timer_register (pixman_timer_t *timer); - - #endif /* PIXMAN_TIMERS */ - -+/* a helper function, can blit 8-bit images with src/dst overlapping support */ -+static inline void -+pixman_blt_helper (uint8_t *src_bytes, -+ uint8_t *dst_bytes, -+ int src_stride, -+ int dst_stride, -+ int width, -+ int height) -+{ -+ /* -+ * The second part of this check is not strictly needed, but it prevents -+ * unnecessary upside-down processing of areas which belong to different -+ * images. Upside-down processing can be slower with fixed-distance-ahead -+ * prefetch and perceived as having more tearing. -+ */ -+ if (src_bytes < dst_bytes + width && -+ src_bytes + src_stride * height > dst_bytes) -+ { -+ src_bytes += src_stride * height - src_stride; -+ dst_bytes += dst_stride * height - dst_stride; -+ dst_stride = -dst_stride; -+ src_stride = -src_stride; -+ /* Horizontal scrolling to the left needs memmove */ -+ if (src_bytes + width > dst_bytes) -+ { -+ while (--height >= 0) -+ { -+ memmove (dst_bytes, src_bytes, width); -+ dst_bytes += dst_stride; -+ src_bytes += src_stride; -+ } -+ return; -+ } -+ } -+ while (--height >= 0) -+ { -+ memcpy (dst_bytes, src_bytes, width); -+ dst_bytes += dst_stride; -+ src_bytes += src_stride; -+ } -+} -+ - #endif /* PIXMAN_PRIVATE_H */ --- -1.6.6.1 - diff --git a/recipes/xorg-lib/pixman/0002-Support-of-overlapping-src-dst-for-pixman_blt_mmx.patch b/recipes/xorg-lib/pixman/0002-Support-of-overlapping-src-dst-for-pixman_blt_mmx.patch deleted file mode 100644 index 003337f48d..0000000000 --- a/recipes/xorg-lib/pixman/0002-Support-of-overlapping-src-dst-for-pixman_blt_mmx.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 3170d9f5e927681a2516bcec52b317d1d4785e25 Mon Sep 17 00:00:00 2001 -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Thu, 22 Oct 2009 05:45:47 +0300 -Subject: [PATCH 2/5] Support of overlapping src/dst for pixman_blt_mmx - ---- - pixman/pixman-mmx.c | 55 +++++++++++++++++++++++++++++--------------------- - 1 files changed, 32 insertions(+), 23 deletions(-) - -diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c -index e084e7f..6212b31 100644 ---- a/pixman/pixman-mmx.c -+++ b/pixman/pixman-mmx.c -@@ -2994,34 +2994,43 @@ pixman_blt_mmx (uint32_t *src_bits, - { - uint8_t * src_bytes; - uint8_t * dst_bytes; -- int byte_width; -+ int bpp; - -- if (src_bpp != dst_bpp) -+ if (src_bpp != dst_bpp || src_bpp & 7) - return FALSE; - -- if (src_bpp == 16) -- { -- src_stride = src_stride * (int) sizeof (uint32_t) / 2; -- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; -- src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); -- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); -- byte_width = 2 * width; -- src_stride *= 2; -- dst_stride *= 2; -- } -- else if (src_bpp == 32) -+ bpp = src_bpp >> 3; -+ width *= bpp; -+ src_stride *= 4; -+ dst_stride *= 4; -+ src_bytes = (uint8_t *)src_bits + src_y * src_stride + src_x * bpp; -+ dst_bytes = (uint8_t *)dst_bits + dst_y * dst_stride + dst_x * bpp; -+ -+ if (src_bpp != 16 && src_bpp != 32) - { -- src_stride = src_stride * (int) sizeof (uint32_t) / 4; -- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; -- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); -- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); -- byte_width = 4 * width; -- src_stride *= 4; -- dst_stride *= 4; -+ pixman_blt_helper (src_bytes, dst_bytes, src_stride, dst_stride, -+ width, height); -+ return TRUE; - } -- else -+ -+ if (src_bytes < dst_bytes && src_bytes + src_stride * height > dst_bytes) - { -- return FALSE; -+ src_bytes += src_stride * height - src_stride; -+ dst_bytes += dst_stride * height - dst_stride; -+ dst_stride = -dst_stride; -+ src_stride = -src_stride; -+ -+ if (src_bytes + width > dst_bytes) -+ { -+ /* TODO: reverse scanline copy using MMX */ -+ while (--height >= 0) -+ { -+ memmove (dst_bytes, src_bytes, width); -+ dst_bytes += dst_stride; -+ src_bytes += src_stride; -+ } -+ return TRUE; -+ } - } - - while (height--) -@@ -3031,7 +3040,7 @@ pixman_blt_mmx (uint32_t *src_bits, - uint8_t *d = dst_bytes; - src_bytes += src_stride; - dst_bytes += dst_stride; -- w = byte_width; -+ w = width; - - while (w >= 2 && ((unsigned long)d & 3)) - { --- -1.6.6.1 - diff --git a/recipes/xorg-lib/pixman/0003-Support-of-overlapping-src-dst-for-pixman_blt_sse2.patch b/recipes/xorg-lib/pixman/0003-Support-of-overlapping-src-dst-for-pixman_blt_sse2.patch deleted file mode 100644 index 7e8f34f6bd..0000000000 --- a/recipes/xorg-lib/pixman/0003-Support-of-overlapping-src-dst-for-pixman_blt_sse2.patch +++ /dev/null @@ -1,91 +0,0 @@ -From f07cd58c643b490dcb1ef7be2642926cfeca1e69 Mon Sep 17 00:00:00 2001 -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Thu, 22 Oct 2009 05:45:54 +0300 -Subject: [PATCH 3/5] Support of overlapping src/dst for pixman_blt_sse2 - ---- - pixman/pixman-sse2.c | 55 +++++++++++++++++++++++++++++-------------------- - 1 files changed, 32 insertions(+), 23 deletions(-) - -diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c -index 946e7ba..66053ae 100644 ---- a/pixman/pixman-sse2.c -+++ b/pixman/pixman-sse2.c -@@ -5299,34 +5299,43 @@ pixman_blt_sse2 (uint32_t *src_bits, - { - uint8_t * src_bytes; - uint8_t * dst_bytes; -- int byte_width; -+ int bpp; - -- if (src_bpp != dst_bpp) -+ if (src_bpp != dst_bpp || src_bpp & 7) - return FALSE; - -- if (src_bpp == 16) -- { -- src_stride = src_stride * (int) sizeof (uint32_t) / 2; -- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; -- src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); -- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); -- byte_width = 2 * width; -- src_stride *= 2; -- dst_stride *= 2; -- } -- else if (src_bpp == 32) -+ bpp = src_bpp >> 3; -+ width *= bpp; -+ src_stride *= 4; -+ dst_stride *= 4; -+ src_bytes = (uint8_t *)src_bits + src_y * src_stride + src_x * bpp; -+ dst_bytes = (uint8_t *)dst_bits + dst_y * dst_stride + dst_x * bpp; -+ -+ if (src_bpp != 16 && src_bpp != 32) - { -- src_stride = src_stride * (int) sizeof (uint32_t) / 4; -- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; -- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); -- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); -- byte_width = 4 * width; -- src_stride *= 4; -- dst_stride *= 4; -+ pixman_blt_helper (src_bytes, dst_bytes, src_stride, dst_stride, -+ width, height); -+ return TRUE; - } -- else -+ -+ if (src_bytes < dst_bytes && src_bytes + src_stride * height > dst_bytes) - { -- return FALSE; -+ src_bytes += src_stride * height - src_stride; -+ dst_bytes += dst_stride * height - dst_stride; -+ dst_stride = -dst_stride; -+ src_stride = -src_stride; -+ -+ if (src_bytes + width > dst_bytes) -+ { -+ /* TODO: reverse scanline copy using SSE2 */ -+ while (--height >= 0) -+ { -+ memmove (dst_bytes, src_bytes, width); -+ dst_bytes += dst_stride; -+ src_bytes += src_stride; -+ } -+ return TRUE; -+ } - } - - cache_prefetch ((__m128i*)src_bytes); -@@ -5339,7 +5348,7 @@ pixman_blt_sse2 (uint32_t *src_bits, - uint8_t *d = dst_bytes; - src_bytes += src_stride; - dst_bytes += dst_stride; -- w = byte_width; -+ w = width; - - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); --- -1.6.6.1 - diff --git a/recipes/xorg-lib/pixman/0004-Support-of-overlapping-src-dst-for-pixman_blt_neon.patch b/recipes/xorg-lib/pixman/0004-Support-of-overlapping-src-dst-for-pixman_blt_neon.patch deleted file mode 100644 index 0ba5b843b5..0000000000 --- a/recipes/xorg-lib/pixman/0004-Support-of-overlapping-src-dst-for-pixman_blt_neon.patch +++ /dev/null @@ -1,94 +0,0 @@ -From e0542866c466ad512d69292df098d4b880e35e52 Mon Sep 17 00:00:00 2001 -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Wed, 18 Nov 2009 06:08:48 +0200 -Subject: [PATCH 4/5] Support of overlapping src/dst for pixman_blt_neon - ---- - pixman/pixman-arm-neon.c | 62 +++++++++++++++++++++++++++++++++++++-------- - 1 files changed, 51 insertions(+), 11 deletions(-) - -diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c -index 24ceeeb..134493d 100644 ---- a/pixman/pixman-arm-neon.c -+++ b/pixman/pixman-arm-neon.c -@@ -360,26 +360,66 @@ pixman_blt_neon (uint32_t *src_bits, - int width, - int height) - { -- if (src_bpp != dst_bpp) -+ uint8_t * src_bytes; -+ uint8_t * dst_bytes; -+ int bpp; -+ -+ if (src_bpp != dst_bpp || src_bpp & 7) - return FALSE; - -+ bpp = src_bpp >> 3; -+ width *= bpp; -+ src_stride *= 4; -+ dst_stride *= 4; -+ src_bytes = (uint8_t *)src_bits + src_y * src_stride + src_x * bpp; -+ dst_bytes = (uint8_t *)dst_bits + dst_y * dst_stride + dst_x * bpp; -+ -+ if (src_bpp != 16 && src_bpp != 32) -+ { -+ pixman_blt_helper (src_bytes, dst_bytes, src_stride, dst_stride, -+ width, height); -+ return TRUE; -+ } -+ -+ if (src_bytes < dst_bytes && src_bytes + src_stride * height > dst_bytes) -+ { -+ src_bytes += src_stride * height - src_stride; -+ dst_bytes += dst_stride * height - dst_stride; -+ dst_stride = -dst_stride; -+ src_stride = -src_stride; -+ -+ if (src_bytes + width > dst_bytes) -+ { -+ /* TODO: reverse scanline copy using NEON */ -+ while (--height >= 0) -+ { -+ memmove (dst_bytes, src_bytes, width); -+ dst_bytes += dst_stride; -+ src_bytes += src_stride; -+ } -+ return TRUE; -+ } -+ } -+ - switch (src_bpp) - { - case 16: - pixman_composite_src_0565_0565_asm_neon ( -- width, height, -- (uint16_t *)(((char *) dst_bits) + -- dst_y * dst_stride * 4 + dst_x * 2), dst_stride * 2, -- (uint16_t *)(((char *) src_bits) + -- src_y * src_stride * 4 + src_x * 2), src_stride * 2); -+ width >> 1, -+ height, -+ (uint16_t *) dst_bytes, -+ dst_stride >> 1, -+ (uint16_t *) src_bytes, -+ src_stride >> 1); - return TRUE; - case 32: - pixman_composite_src_8888_8888_asm_neon ( -- width, height, -- (uint32_t *)(((char *) dst_bits) + -- dst_y * dst_stride * 4 + dst_x * 4), dst_stride, -- (uint32_t *)(((char *) src_bits) + -- src_y * src_stride * 4 + src_x * 4), src_stride); -+ width >> 2, -+ height, -+ (uint32_t *) dst_bytes, -+ dst_stride >> 2, -+ (uint32_t *) src_bytes, -+ src_stride >> 2); - return TRUE; - default: - return FALSE; --- -1.6.6.1 - diff --git a/recipes/xorg-lib/pixman/0005-ARM-added-NEON-optimizations-for-fetch-store-r5g6b5-.patch b/recipes/xorg-lib/pixman/0005-ARM-added-NEON-optimizations-for-fetch-store-r5g6b5-.patch deleted file mode 100644 index 769ed2e7d0..0000000000 --- a/recipes/xorg-lib/pixman/0005-ARM-added-NEON-optimizations-for-fetch-store-r5g6b5-.patch +++ /dev/null @@ -1,169 +0,0 @@ -From d51b10a2750d99543a0c92ca44802aa7a4d70e54 Mon Sep 17 00:00:00 2001 -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Thu, 10 Dec 2009 00:51:50 +0200 -Subject: [PATCH 5/5] ARM: added NEON optimizations for fetch/store r5g6b5 scanline - ---- - pixman/pixman-access.c | 23 ++++++++++++++++++++++- - pixman/pixman-arm-neon-asm.S | 20 ++++++++++++++++++++ - pixman/pixman-arm-neon.c | 41 +++++++++++++++++++++++++++++++++++++++++ - pixman/pixman-private.h | 5 +++++ - 4 files changed, 88 insertions(+), 1 deletions(-) - -diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c -index fa0a267..5bb3e09 100644 ---- a/pixman/pixman-access.c -+++ b/pixman/pixman-access.c -@@ -2748,7 +2748,7 @@ typedef struct - store_scanline_ ## format, store_scanline_generic_64 \ - } - --static const format_info_t accessors[] = -+static format_info_t accessors[] = - { - /* 32 bpp formats */ - FORMAT_INFO (a8r8g8b8), -@@ -2891,6 +2891,27 @@ _pixman_bits_image_setup_raw_accessors (bits_image_t *image) - setup_accessors (image); - } - -+void -+_pixman_bits_override_accessors (pixman_format_code_t format, -+ fetch_scanline_t fetch_func, -+ store_scanline_t store_func) -+{ -+ format_info_t *info = accessors; -+ -+ while (info->format != PIXMAN_null) -+ { -+ if (info->format == format) -+ { -+ if (fetch_func) -+ info->fetch_scanline_raw_32 = fetch_func; -+ if (store_func) -+ info->store_scanline_raw_32 = store_func; -+ return; -+ } -+ info++; -+ } -+} -+ - #else - - void -diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S -index eb8cc4c..6ab3301 100644 ---- a/pixman/pixman-arm-neon-asm.S -+++ b/pixman/pixman-arm-neon-asm.S -@@ -454,6 +454,16 @@ generate_composite_function \ - pixman_composite_src_8888_0565_process_pixblock_tail, \ - pixman_composite_src_8888_0565_process_pixblock_tail_head - -+generate_composite_function_single_scanline \ -+ pixman_store_scanline_r5g6b5_asm_neon, 32, 0, 16, \ -+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ -+ 8, /* number of pixels, processed in a single block */ \ -+ default_init, \ -+ default_cleanup, \ -+ pixman_composite_src_8888_0565_process_pixblock_head, \ -+ pixman_composite_src_8888_0565_process_pixblock_tail, \ -+ pixman_composite_src_8888_0565_process_pixblock_tail_head -+ - /******************************************************************************/ - - .macro pixman_composite_src_0565_8888_process_pixblock_head -@@ -489,6 +499,16 @@ generate_composite_function \ - pixman_composite_src_0565_8888_process_pixblock_tail, \ - pixman_composite_src_0565_8888_process_pixblock_tail_head - -+generate_composite_function_single_scanline \ -+ pixman_fetch_scanline_r5g6b5_asm_neon, 16, 0, 32, \ -+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ -+ 8, /* number of pixels, processed in a single block */ \ -+ default_init, \ -+ default_cleanup, \ -+ pixman_composite_src_0565_8888_process_pixblock_head, \ -+ pixman_composite_src_0565_8888_process_pixblock_tail, \ -+ pixman_composite_src_0565_8888_process_pixblock_tail_head -+ - /******************************************************************************/ - - .macro pixman_composite_add_8000_8000_process_pixblock_head -diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c -index 134493d..2245b52 100644 ---- a/pixman/pixman-arm-neon.c -+++ b/pixman/pixman-arm-neon.c -@@ -567,6 +567,43 @@ neon_combine_##name##_u (pixman_implementation_t *imp, \ - BIND_COMBINE_U (over) - BIND_COMBINE_U (add) - -+void -+pixman_fetch_scanline_r5g6b5_asm_neon (int width, -+ uint32_t *buffer, -+ const uint16_t *pixel); -+void -+pixman_store_scanline_r5g6b5_asm_neon (int width, -+ uint16_t *pixel, -+ const uint32_t *values); -+ -+static void -+neon_fetch_scanline_r5g6b5 (pixman_image_t *image, -+ int x, -+ int y, -+ int width, -+ uint32_t * buffer, -+ const uint32_t *mask, -+ uint32_t mask_bits) -+{ -+ const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; -+ const uint16_t *pixel = (const uint16_t *)bits + x; -+ -+ pixman_fetch_scanline_r5g6b5_asm_neon (width, buffer, pixel); -+} -+ -+static void -+neon_store_scanline_r5g6b5 (bits_image_t * image, -+ int x, -+ int y, -+ int width, -+ const uint32_t *values) -+{ -+ uint32_t *bits = image->bits + image->rowstride * y; -+ uint16_t *pixel = ((uint16_t *) bits) + x; -+ -+ pixman_store_scanline_r5g6b5_asm_neon (width, pixel, values); -+} -+ - pixman_implementation_t * - _pixman_implementation_create_arm_neon (void) - { -@@ -577,6 +614,10 @@ _pixman_implementation_create_arm_neon (void) - imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u; - imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u; - -+ _pixman_bits_override_accessors (PIXMAN_r5g6b5, -+ neon_fetch_scanline_r5g6b5, -+ neon_store_scanline_r5g6b5); -+ - imp->blt = arm_neon_blt; - imp->fill = arm_neon_fill; - -diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h -index eeb677d..ba2d401 100644 ---- a/pixman/pixman-private.h -+++ b/pixman/pixman-private.h -@@ -220,6 +220,11 @@ void - _pixman_bits_image_setup_raw_accessors (bits_image_t *image); - - void -+_pixman_bits_override_accessors (pixman_format_code_t format, -+ fetch_scanline_t fetch_func, -+ store_scanline_t store_func); -+ -+void - _pixman_image_get_scanline_generic_64 (pixman_image_t *image, - int x, - int y, --- -1.6.6.1 - diff --git a/recipes/xorg-lib/pixman/0006-Revert-ARM-SIMD-Try-without-any-CFLAGS-before-forcin.patch b/recipes/xorg-lib/pixman/0006-Revert-ARM-SIMD-Try-without-any-CFLAGS-before-forcin.patch deleted file mode 100644 index 3d8d4e8292..0000000000 --- a/recipes/xorg-lib/pixman/0006-Revert-ARM-SIMD-Try-without-any-CFLAGS-before-forcin.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 7f0adaef68c5b0bb1c5eb9f5db5792b71b8b8beb Mon Sep 17 00:00:00 2001 -From: Koen Kooi <koen@dominion.thruhere.net> -Date: Fri, 19 Mar 2010 10:44:09 +0100 -Subject: [PATCH 6/6] Revert "ARM: SIMD: Try without any CFLAGS before forcing -mcpu=" - -This forces -marm that results in runtime SIGILL on thumb userspace - -This reverts commit 18f0de452dc7e12e4cb544d761a626d5c6031663. ---- - configure.ac | 20 +++++--------------- - 1 files changed, 5 insertions(+), 15 deletions(-) - -diff --git a/configure.ac b/configure.ac -index fc3ee24..f84a4dc 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -363,28 +363,18 @@ AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes) - - dnl =========================================================================== - dnl Check for ARM SIMD instructions --ARM_SIMD_CFLAGS="" -+ARM_SIMD_CFLAGS="-mcpu=arm1136j-s" - - have_arm_simd=no - AC_MSG_CHECKING(whether to use ARM SIMD assembler) --# check with default CFLAGS in case the toolchain turns on a sufficiently recent -mcpu= -+xserver_save_CFLAGS=$CFLAGS -+CFLAGS="$ARM_SIMD_CFLAGS $CFLAGS" - AC_COMPILE_IFELSE([ - int main () { - asm("uqadd8 r1, r1, r2"); - return 0; --}], have_arm_simd=yes, -- # check again with an explicit -mcpu= in case the toolchain defaults to an -- # older one; note that uqadd8 isn't available in Thumb mode on arm1136j-s -- # so we force ARM mode -- ARM_SIMD_CFLAGS="-mcpu=arm1136j-s -marm" -- xserver_save_CFLAGS=$CFLAGS -- CFLAGS="$ARM_SIMD_CFLAGS $CFLAGS" -- AC_COMPILE_IFELSE([ -- int main () { -- asm("uqadd8 r1, r1, r2"); -- return 0; -- }], have_arm_simd=yes) -- CFLAGS=$xserver_save_CFLAGS) -+}], have_arm_simd=yes) -+CFLAGS=$xserver_save_CFLAGS - - AC_ARG_ENABLE(arm-simd, - [AC_HELP_STRING([--disable-arm-simd], --- -1.6.6.1 - diff --git a/recipes/xorg-lib/pixman/calloc.patch b/recipes/xorg-lib/pixman/calloc.patch deleted file mode 100644 index 4a60d7ef9a..0000000000 --- a/recipes/xorg-lib/pixman/calloc.patch +++ /dev/null @@ -1,23 +0,0 @@ -From 634ba33b5b1fcfd5a0e7910f9991b4ed4f674549 Mon Sep 17 00:00:00 2001 -From: Søren Sandmann Pedersen <ssp@redhat.com> -Date: Wed, 07 Apr 2010 05:39:14 +0000 -Subject: Fix uninitialized cache when pthreads are used - -The thread local cache is allocated with malloc(), but we rely on it -being initialized to zero, so allocate it with calloc() instead. ---- -diff --git a/pixman/pixman-compiler.h b/pixman/pixman-compiler.h -index a4e3f88..cdac0d8 100644 ---- a/pixman/pixman-compiler.h -+++ b/pixman/pixman-compiler.h -@@ -101,7 +101,7 @@ - static type * \ - tls_ ## name ## _alloc (key) \ - { \ -- type *value = malloc (sizeof (type)); \ -+ type *value = calloc (1, sizeof (type)); \ - if (value) \ - pthread_setspecific (key, value); \ - return value; \ --- -cgit v0.8.3-6-g21f6 diff --git a/recipes/xorg-lib/pixman/nearest-neighbour.patch b/recipes/xorg-lib/pixman/nearest-neighbour.patch deleted file mode 100644 index 29b140faf9..0000000000 --- a/recipes/xorg-lib/pixman/nearest-neighbour.patch +++ /dev/null @@ -1,1040 +0,0 @@ -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Fri, 17 Jul 2009 10:22:23 +0000 (+0300) -Subject: Fastpath for nearest neighbour scaled compositing operations. -X-Git-Url: http://siarhei.siamashka.name/gitweb/?p=pixman.git;a=commitdiff_plain;h=247531c6978725a88fd3706129b9d3e339026f54 - -Fastpath for nearest neighbour scaled compositing operations. - -OVER 8888x8888, OVER 8888x0565, SRC 8888x8888, SRC 8888x0565 -and SRC 0565x0565 cases are supported. ---- - -diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c -index 7f80578..7f3a6ad 100644 ---- a/pixman/pixman-fast-path.c -+++ b/pixman/pixman-fast-path.c -@@ -1261,6 +1261,993 @@ fast_composite_src_scale_nearest (pixman_implementation_t *imp, - } - } - -+/* -+ * Functions, which implement the core inner loops for the nearest neighbour -+ * scaled fastpath compositing operations. The do not need to do clipping -+ * checks, also the loops are unrolled to process two pixels per iteration -+ * for better performance on most CPU architectures (superscalar processors -+ * can issue several operations simultaneously, other processors can hide -+ * instructions latencies by pipelining operations). Unrolling more -+ * does not make much sense because the compiler will start running out -+ * of spare registers soon. -+ */ -+ -+#undef READ -+#undef WRITE -+#define READ(img,x) (*(x)) -+#define WRITE(img,ptr,v) ((*(ptr)) = (v)) -+ -+#define UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(x, a, y) do { \ -+ UN8x4_MUL_UN8_ADD_UN8x4(x, a, y); \ -+ x = CONVERT_8888_TO_0565(x); \ -+ } while (0) -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipOver_8888x0565 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint16_t *dstLine; -+ uint32_t *srcFirstLine; -+ uint32_t d; -+ uint32_t s1, s2; -+ uint8_t a1, a2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ -+ uint32_t *src; -+ uint16_t *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1); -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ -+ if ((y < 0) || (y >= pSrc->bits.height)) { -+ continue; -+ } -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ s2 = READ(pSrc, src + x2); -+ -+ a1 = s1 >> 24; -+ a2 = s2 >> 24; -+ -+ if (a1 == 0xff) -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s1)); -+ else if (s1) { -+ d = CONVERT_0565_TO_0888(READ(pDst, dst)); -+ a1 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a1, s1); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ -+ if (a2 == 0xff) -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s2)); -+ else if (s2) { -+ d = CONVERT_0565_TO_0888(READ(pDst, dst)); -+ a2 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a2, s2); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ -+ a1 = s1 >> 24; -+ if (a1 == 0xff) -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s1)); -+ else if (s1) { -+ d = CONVERT_0565_TO_0888(READ(pDst, dst)); -+ a1 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a1, s1); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ } -+ } -+} -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatOver_8888x0565 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint16_t *dstLine; -+ uint32_t *srcFirstLine; -+ uint32_t d; -+ uint32_t s1, s2; -+ uint8_t a1, a2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ int32_t max_vx, max_vy; -+ -+ uint32_t *src; -+ uint16_t *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1); -+ -+ max_vx = pSrc->bits.width << 16; -+ max_vy = pSrc->bits.height << 16; -+ -+ while (orig_vx < 0) orig_vx += max_vx; -+ while (vy < 0) vy += max_vy; -+ while (orig_vx >= max_vx) orig_vx -= max_vx; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s2 = READ(pSrc, src + x2); -+ -+ a1 = s1 >> 24; -+ a2 = s2 >> 24; -+ -+ if (a1 == 0xff) -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s1)); -+ else if (s1) { -+ d = CONVERT_0565_TO_0888(READ(pDst, dst)); -+ a1 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a1, s1); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ -+ if (a2 == 0xff) -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s2)); -+ else if (s2) { -+ d = CONVERT_0565_TO_0888(READ(pDst, dst)); -+ a2 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a2, s2); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ a1 = s1 >> 24; -+ if (a1 == 0xff) -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s1)); -+ else if (s1) { -+ d = CONVERT_0565_TO_0888(READ(pDst, dst)); -+ a1 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a1, s1); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ } -+ } -+} -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipOver_8888x8888 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint32_t *dstLine; -+ uint32_t *srcFirstLine; -+ uint32_t d; -+ uint32_t s1, s2; -+ uint8_t a1, a2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ -+ uint32_t *src, *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1); -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ -+ if ((y < 0) || (y >= pSrc->bits.height)) { -+ continue; -+ } -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ s2 = READ(pSrc, src + x2); -+ -+ a1 = s1 >> 24; -+ a2 = s2 >> 24; -+ -+ if (a1 == 0xff) -+ WRITE(pDst, dst, s1); -+ else if (s1) { -+ d = READ(pDst, dst); -+ a1 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4(d, a1, s1); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ -+ if (a2 == 0xff) -+ WRITE(pDst, dst, s2); -+ else if (s2) { -+ d = READ(pDst, dst); -+ a2 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4(d, a2, s2); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ -+ a1 = s1 >> 24; -+ if (a1 == 0xff) -+ WRITE(pDst, dst, s1); -+ else if (s1) { -+ d = READ(pDst, dst); -+ a1 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4(d, a1, s1); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ } -+ } -+} -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatOver_8888x8888 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint32_t *dstLine; -+ uint32_t *srcFirstLine; -+ uint32_t d; -+ uint32_t s1, s2; -+ uint8_t a1, a2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ int32_t max_vx, max_vy; -+ -+ uint32_t *src, *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1); -+ -+ max_vx = pSrc->bits.width << 16; -+ max_vy = pSrc->bits.height << 16; -+ -+ while (orig_vx < 0) orig_vx += max_vx; -+ while (vy < 0) vy += max_vy; -+ while (orig_vx >= max_vx) orig_vx -= max_vx; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s2 = READ(pSrc, src + x2); -+ -+ a1 = s1 >> 24; -+ a2 = s2 >> 24; -+ -+ if (a1 == 0xff) -+ WRITE(pDst, dst, s1); -+ else if (s1) { -+ d = READ(pDst, dst); -+ a1 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4(d, a1, s1); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ -+ if (a2 == 0xff) -+ WRITE(pDst, dst, s2); -+ else if (s2) { -+ d = READ(pDst, dst); -+ a2 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4(d, a2, s2); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ a1 = s1 >> 24; -+ if (a1 == 0xff) -+ WRITE(pDst, dst, s1); -+ else if (s1) { -+ d = READ(pDst, dst); -+ a1 ^= 0xff; -+ UN8x4_MUL_UN8_ADD_UN8x4(d, a1, s1); -+ WRITE(pDst, dst, d); -+ } -+ dst++; -+ } -+ } -+} -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_8888x8888 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint32_t *dstLine; -+ uint32_t *srcFirstLine; -+ uint32_t s1, s2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ -+ uint32_t *src, *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1); -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ -+ if ((y < 0) || (y >= pSrc->bits.height)) { -+ memset(dst, 0, width * sizeof(*dst)); -+ continue; -+ } -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ s2 = READ(pSrc, src + x2); -+ -+ WRITE(pDst, dst, s1); -+ dst++; -+ WRITE(pDst, dst, s2); -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ WRITE(pDst, dst, s1); -+ dst++; -+ } -+ } -+} -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_8888x8888 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint32_t *dstLine; -+ uint32_t *srcFirstLine; -+ uint32_t s1, s2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ int32_t max_vx, max_vy; -+ -+ uint32_t *src, *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1); -+ -+ max_vx = pSrc->bits.width << 16; -+ max_vy = pSrc->bits.height << 16; -+ -+ while (orig_vx < 0) orig_vx += max_vx; -+ while (vy < 0) vy += max_vy; -+ while (orig_vx >= max_vx) orig_vx -= max_vx; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s2 = READ(pSrc, src + x2); -+ -+ WRITE(pDst, dst, s1); -+ dst++; -+ WRITE(pDst, dst, s2); -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ WRITE(pDst, dst, s1); -+ dst++; -+ } -+ } -+} -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_0565x0565 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint16_t *dstLine; -+ uint16_t *srcFirstLine; -+ uint16_t s1, s2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ -+ uint16_t *src, *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint16_t, srcStride, srcFirstLine, 1); -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ -+ if ((y < 0) || (y >= pSrc->bits.height)) { -+ memset(dst, 0, width * sizeof(*dst)); -+ continue; -+ } -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ s2 = READ(pSrc, src + x2); -+ -+ WRITE(pDst, dst, s1); -+ dst++; -+ WRITE(pDst, dst, s2); -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ WRITE(pDst, dst, s1); -+ dst++; -+ } -+ } -+} -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_0565x0565 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint16_t *dstLine; -+ uint16_t *srcFirstLine; -+ uint16_t s1, s2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ int32_t max_vx, max_vy; -+ -+ uint16_t *src, *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint16_t, srcStride, srcFirstLine, 1); -+ -+ max_vx = pSrc->bits.width << 16; -+ max_vy = pSrc->bits.height << 16; -+ -+ while (orig_vx < 0) orig_vx += max_vx; -+ while (vy < 0) vy += max_vy; -+ while (orig_vx >= max_vx) orig_vx -= max_vx; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s2 = READ(pSrc, src + x2); -+ -+ WRITE(pDst, dst, s1); -+ dst++; -+ WRITE(pDst, dst, s2); -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ WRITE(pDst, dst, s1); -+ dst++; -+ } -+ } -+} -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_8888x0565 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint16_t *dstLine; -+ uint32_t *srcFirstLine; -+ uint32_t s1, s2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ -+ uint32_t *src; -+ uint16_t *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1); -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ -+ if ((y < 0) || (y >= pSrc->bits.height)) { -+ memset(dst, 0, width * sizeof(*dst)); -+ continue; -+ } -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ s2 = READ(pSrc, src + x2); -+ -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s1)); -+ dst++; -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s2)); -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ s1 = READ(pSrc, src + x1); -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s1)); -+ dst++; -+ } -+ } -+} -+ -+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_8888x0565 ( -+ pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst, -+ int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y) -+{ -+ uint16_t *dstLine; -+ uint32_t *srcFirstLine; -+ uint32_t s1, s2; -+ int w; -+ int x1, x2, y; -+ int32_t orig_vx = vx; -+ int32_t max_vx, max_vy; -+ -+ uint32_t *src; -+ uint16_t *dst; -+ int srcStride, dstStride; -+ PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); -+ /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be -+ * transformed from destination space to source space */ -+ PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1); -+ -+ max_vx = pSrc->bits.width << 16; -+ max_vy = pSrc->bits.height << 16; -+ -+ while (orig_vx < 0) orig_vx += max_vx; -+ while (vy < 0) vy += max_vy; -+ while (orig_vx >= max_vx) orig_vx -= max_vx; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ while (--height >= 0) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ -+ y = vy >> 16; -+ vy += unit_y; -+ while (vy >= max_vy) vy -= max_vy; -+ -+ src = srcFirstLine + srcStride * y; -+ -+ w = width; -+ vx = orig_vx; -+ while ((w -= 2) >= 0) -+ { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ x2 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s2 = READ(pSrc, src + x2); -+ -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s1)); -+ dst++; -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s2)); -+ dst++; -+ } -+ if (w & 1) { -+ x1 = vx >> 16; -+ vx += unit_x; -+ while (vx >= max_vx) vx -= max_vx; -+ s1 = READ(pSrc, src + x1); -+ -+ WRITE(pDst, dst, CONVERT_8888_TO_0565(s1)); -+ dst++; -+ } -+ } -+} -+ -+/* -+ * Check if the clipping boundary is crossed on horizontal scaling -+ */ -+static inline pixman_bool_t -+fbTransformVerifyHorizontalClipping(pixman_image_t *pict, int width, int32_t vx, int32_t unit_x) -+{ -+ while (--width >= 0) { -+ int x = vx >> 16; -+ if ((x < 0) || (x >= pict->bits.width)) return 1; -+ vx += unit_x; -+ } -+ return 0; -+} -+ -+/* -+ * Check if the clipping boundary is crossed on vertical scaling -+ */ -+static inline pixman_bool_t -+fbTransformVerifyVerticalClipping(pixman_image_t *pict, int height, int32_t vy, int32_t unit_y) -+{ -+ while (--height >= 0) { -+ int y = vy >> 16; -+ if ((y < 0) || (y >= pict->bits.height)) return 1; -+ vy += unit_y; -+ } -+ return 0; -+} -+ -+/* -+ * Easy case of transform without rotation or complex clipping -+ * Returns 1 in the case if it was able to handle this operation and 0 otherwise -+ */ -+static pixman_bool_t -+fbCompositeTransformNonrotatedAffineTrivialclip ( -+ pixman_op_t op, -+ pixman_image_t *pSrc, -+ pixman_image_t *pMask, -+ pixman_image_t *pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ pixman_vector_t v, unit; -+ int skipdst_x = 0, skipdst_y = 0; -+ -+ /* Handle destination clipping */ -+ if (xDst < pDst->common.clip_region.extents.x1) { -+ skipdst_x = pDst->common.clip_region.extents.x1 - xDst; -+ if (skipdst_x >= (int)width) -+ return 1; -+ xDst = pDst->common.clip_region.extents.x1; -+ width -= skipdst_x; -+ } -+ -+ if (yDst < pDst->common.clip_region.extents.y1) { -+ skipdst_y = pDst->common.clip_region.extents.y1 - yDst; -+ if (skipdst_y >= (int)height) -+ return 1; -+ yDst = pDst->common.clip_region.extents.y1; -+ height -= skipdst_y; -+ } -+ -+ if (xDst >= pDst->common.clip_region.extents.x2 || -+ yDst >= pDst->common.clip_region.extents.y2) -+ { -+ return 1; -+ } -+ -+ if (xDst + width > pDst->common.clip_region.extents.x2) -+ width = pDst->common.clip_region.extents.x2 - xDst; -+ if (yDst + height > pDst->common.clip_region.extents.y2) -+ height = pDst->common.clip_region.extents.y2 - yDst; -+ -+ /* reference point is the center of the pixel */ -+ v.vector[0] = pixman_int_to_fixed(xSrc) + pixman_fixed_1 / 2; -+ v.vector[1] = pixman_int_to_fixed(ySrc) + pixman_fixed_1 / 2; -+ v.vector[2] = pixman_fixed_1; -+ -+ if (!pixman_transform_point_3d (pSrc->common.transform, &v)) -+ return 0; -+ -+ /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ -+ v.vector[0] -= pixman_fixed_e; -+ v.vector[1] -= pixman_fixed_e; -+ -+ unit.vector[0] = pSrc->common.transform->matrix[0][0]; -+ unit.vector[1] = pSrc->common.transform->matrix[1][1]; -+ -+ v.vector[0] += unit.vector[0] * skipdst_x; -+ v.vector[1] += unit.vector[1] * skipdst_y; -+ -+ /* Check for possible fixed point arithmetics problems/overflows */ -+ if (unit.vector[0] <= 0 || unit.vector[1] <= 0) -+ return 0; -+ if (width == 0 || height == 0) -+ return 0; -+ if ((uint32_t)width + (unit.vector[0] >> 16) >= 0x7FFF) -+ return 0; -+ if ((uint32_t)height + (unit.vector[1] >> 16) >= 0x7FFF) -+ return 0; -+ -+ /* Horizontal source clipping is only supported for NORMAL repeat */ -+ if (pSrc->common.repeat != PIXMAN_REPEAT_NORMAL -+ && fbTransformVerifyHorizontalClipping(pSrc, width, v.vector[0], unit.vector[0])) { -+ return 0; -+ } -+ -+ /* Vertical source clipping is only supported for NONE and NORMAL repeat */ -+ if (pSrc->common.repeat != PIXMAN_REPEAT_NONE && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL -+ && fbTransformVerifyVerticalClipping(pSrc, height, v.vector[1], unit.vector[1])) { -+ return 0; -+ } -+ -+ if (op == PIXMAN_OP_OVER && pSrc->bits.format == PIXMAN_a8r8g8b8 -+ && (pDst->bits.format == PIXMAN_x8r8g8b8 || pDst->bits.format == PIXMAN_a8r8g8b8)) -+ { -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipOver_8888x8888( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatOver_8888x8888( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ } -+ -+ if (op == PIXMAN_OP_SRC && (pSrc->bits.format == PIXMAN_x8r8g8b8 || pSrc->bits.format == PIXMAN_a8r8g8b8) -+ && (pDst->bits.format == PIXMAN_x8r8g8b8 || pDst->bits.format == pSrc->bits.format)) -+ { -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_8888x8888( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_8888x8888( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ } -+ -+ if (op == PIXMAN_OP_OVER && pSrc->bits.format == PIXMAN_a8r8g8b8 && pDst->bits.format == PIXMAN_r5g6b5) -+ { -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipOver_8888x0565( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatOver_8888x0565( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ } -+ -+ if (op == PIXMAN_OP_SRC && pSrc->bits.format == PIXMAN_r5g6b5 && pDst->bits.format == PIXMAN_r5g6b5) -+ { -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_0565x0565( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_0565x0565( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ } -+ -+ if (op == PIXMAN_OP_SRC && (pSrc->bits.format == PIXMAN_x8r8g8b8 || pSrc->bits.format == PIXMAN_a8r8g8b8) -+ && pDst->bits.format == PIXMAN_r5g6b5) -+ { -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_8888x0565( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) { -+ fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_8888x0565( -+ pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height, -+ v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]); -+ return 1; -+ } -+ } -+ -+ /* No fastpath scaling implemented for this case */ -+ return 0; -+} -+ - static void - fast_path_composite (pixman_implementation_t *imp, - pixman_op_t op, -@@ -1279,6 +2266,30 @@ fast_path_composite (pixman_implementation_t *imp, - if (src->type == BITS - && src->common.transform - && !mask -+ && !src->common.alpha_map && !dest->common.alpha_map -+ && (src->common.filter == PIXMAN_FILTER_NEAREST) -+ && !src->bits.read_func && !src->bits.write_func -+ && !dest->bits.read_func && !dest->bits.write_func) -+ { -+ /* ensure that the transform matrix only has a scale */ -+ if (src->common.transform->matrix[0][1] == 0 && -+ src->common.transform->matrix[1][0] == 0 && -+ src->common.transform->matrix[2][0] == 0 && -+ src->common.transform->matrix[2][1] == 0 && -+ src->common.transform->matrix[2][2] == pixman_fixed_1 && -+ dest->common.clip_region.data == NULL) -+ { -+ if (fbCompositeTransformNonrotatedAffineTrivialclip (op, src, mask, dest, -+ src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height)) -+ { -+ return; -+ } -+ } -+ } -+ -+ if (src->type == BITS -+ && src->common.transform -+ && !mask - && op == PIXMAN_OP_SRC - && !src->common.alpha_map && !dest->common.alpha_map - && (src->common.filter == PIXMAN_FILTER_NEAREST) diff --git a/recipes/xorg-lib/pixman/over-n-8-0565.patch b/recipes/xorg-lib/pixman/over-n-8-0565.patch deleted file mode 100644 index 3911068d94..0000000000 --- a/recipes/xorg-lib/pixman/over-n-8-0565.patch +++ /dev/null @@ -1,231 +0,0 @@ -From de2221a32d0b6628116565563f7b4ccd0a44e8b6 Mon Sep 17 00:00:00 2001 -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Thu, 04 Mar 2010 23:20:25 +0000 -Subject: ARM: added 'armv6_composite_over_n_8_0565' fast path - -Provides ~3x performance improvement when working with -data in L1 cache and memory. This fast path is important -for fonts rendering when using 16bpp desktop. - -Microbenchmark from N800 (ARM11 @ 400MHz), measured in MPix/s: - -before: - - over_n_8_0565 = L1: 2.99 M: 2.86 - -after: - - over_n_8_0565 = L1: 9.07 M: 8.05 ---- -diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c -index 09a2888..c375c01 100644 ---- a/pixman/pixman-arm-simd.c -+++ b/pixman/pixman-arm-simd.c -@@ -419,6 +419,193 @@ arm_composite_over_n_8_8888 (pixman_implementation_t * impl, - } - } - -+#if defined(__ARM_EABI__) && defined(__linux__) -+/* -+ * ARMv6 assembly optimized version of 'composite_over_n_8_0565'. It is -+ * a bare metal 'naked' function which uses all the available CPU registers -+ * and is compatible with ARM EABI. It might (or might not) break when used -+ * with a different ABI, anyway it is better to be safe than sorry. -+ */ -+static void __attribute__((naked)) armv6_composite_over_n_8_0565_asm ( -+ uint16_t *dst, uint8_t *mask, uint32_t src, int w, -+ int dst_stride_delta, int mask_stride_delta, int h) -+{ -+ asm volatile ( -+ ".macro composite_internal_armv6_asm opaque_flag\n" -+ /* save all registers (8 words) to stack */ -+ "stmdb sp!, {r4-r11, ip, lr}\n" -+ /* some register aliases for better readability */ -+ "DST .req r0\n" -+ "MASK .req r1\n" -+ "S .req r2\n" -+ "W .req r3\n" -+ "A .req r8\n" -+ "D .req r10\n" -+ "C0000FF .req r11\n" -+ "C00001F .req r9\n" -+ "C800080 .req ip\n" -+ "CE000E0 .req lr\n" -+ /* precalculate some stuff and put it on stack */ -+ "mov r6, #0xF8\n" -+ "mov r7, #0xFC\n" -+ -+ "str W, [sp, #-8]!\n" -+ -+ ".if \\opaque_flag\n" -+ /* precalculate and save it to stack for later use: -+ * ((src >> 3) & 0x001F) | -+ * ((src >> 5) & 0x07E0) | -+ * ((src >> 8) & 0xF800) -+ */ -+ "mov A, #0x1F\n" -+ "and D, A, S, lsr #3\n" -+ "and r4, S, #0xF80000\n" -+ "and r5, S, #0xFC00\n" -+ "orr D, r4, lsr #8\n" -+ "orr D, r5, lsr #5\n" -+ "str D, [sp, #4]\n" -+ ".endif\n" -+ -+ "ldr D, [sp, #(8 + 10*4 + 8)]\n" /* h */ -+ "ldr A, =0xFF00FF\n" -+ "ldr C800080, =0x800080\n" -+ "ldr CE000E0, =0xE000E0\n" -+ "ldr C0000FF, =0xFF\n" -+ "ldr C00001F, =0x1F\n" -+ "and r4, A, S\n" /* r4 = src & 0x00FF00FF */ -+ "and r5, A, S, lsr #8\n" /* r5 = (src >> 8) & 0x00FF00FF */ -+ "stmdb sp!, {r4, r5, r6, r7}\n" -+ "0:\n" -+ "subs D, D, #1\n" -+ "blt 6f\n" -+ "1:\n" -+ "subs W, W, #1\n" -+ "blt 5f\n" -+ "2:\n" -+ "ldrb A, [MASK], #1\n" -+ "ldmia sp, {r4, r5, r6, r7}\n" /* load constants from stack */ -+ "add DST, DST, #2\n" -+ "cmp A, #0\n" -+ "beq 1b\n" -+ -+ ".if \\opaque_flag\n" -+ "cmp A, #0xFF\n" -+ "bne 3f\n" -+ "ldr D, [sp, #(4*4 + 4)]\n" /* load precalculated value */ -+ "subs W, #1\n" -+ "strh D, [DST, #-2]\n" -+ "bge 2b\n" -+ ".endif\n" -+ -+ "3:\n" -+ "ldrh D, [DST, #-2]\n" -+ "mla r4, A, r4, C800080\n" -+ "mla r5, A, r5, C800080\n" -+ "and r6, r6, D, lsl #3\n" /* & 0xF8 */ -+ "and r7, r7, D, lsr #3\n" /* & 0xFC */ -+ "and D, D, #0xF800\n" -+ "bic S, r4, #0xFF0000\n" -+ "bic A, r5, #0xFF0000\n" -+ "add r4, r4, S, lsr #8\n" -+ "add r5, r5, A, lsr #8\n" -+ -+ "and S, r7, #0xC0\n" -+ "orr r6, r6, D, lsl #8\n" -+ "and D, r6, CE000E0\n" -+ "eor A, C0000FF, r5, lsr #24\n" -+ "orr r6, D, lsr #5\n" -+ "orr r7, S, lsr #6\n" -+ -+ "mla r6, A, r6, C800080\n" -+ "mla r7, A, r7, C800080\n" -+ "subs W, #1\n" -+ "bic D, r6, #0xFF0000\n" -+ "bic A, r7, #0xFF0000\n" -+ "add r6, r6, D, lsr #8\n" -+ "uqadd8 r4, r4, r6\n" -+ "add r7, r7, A, lsr #8\n" -+ "uqadd8 r5, r5, r7\n" -+ "and D, C00001F, r4, lsr #11\n" -+ "and r4, r4, #0xF8000000\n" -+ "and r5, r5, #0xFC00\n" -+ "orr D, r4, lsr #16\n" -+ "orr D, r5, lsr #5\n" -+ "strh D, [DST, #-2]\n" -+ "bge 2b\n" -+ "5:\n" -+ "ldr r6, [sp, #(4*4 + 8 + 10*4 + 8)]\n" /* h */ -+ "ldr r4, [sp, #(4*4 + 8 + 10*4 + 4)]\n" /* mask stride */ -+ "ldr r5, [sp, #(4*4 + 8 + 10*4 + 0)]\n" /* dst stride */ -+ "ldr W, [sp, #(4*4)]\n" -+ "subs r6, r6, #1\n" /* h */ -+ "str r6, [sp, #(4*4 + 8 + 10*4 + 8)]\n" /* h */ -+ "add MASK, MASK, r4\n" -+ "add DST, DST, r5, lsl #1\n" -+ "bgt 1b\n" -+ "6:\n" -+ "add sp, sp, #(4*4 + 8)\n" -+ /* restore all registers and return */ -+ "ldmia sp!, {r4-r11, ip, pc}\n" -+ ".unreq DST\n" -+ ".unreq MASK\n" -+ ".unreq S\n" -+ ".unreq W\n" -+ ".unreq A\n" -+ ".unreq D\n" -+ ".unreq C0000FF\n" -+ ".unreq C00001F\n" -+ ".unreq C800080\n" -+ ".unreq CE000E0\n" -+ ".endm\n" -+ -+ "mov ip, r2, lsr #24\n" -+ "cmp ip, #0xFF\n" -+ "beq 9f\n" -+ "composite_internal_armv6_asm 0\n" -+ "9:\n" -+ "composite_internal_armv6_asm 1\n" -+ ".ltorg\n" -+ ".purgem composite_internal_armv6_asm\n" -+ ); -+} -+ -+static void -+armv6_composite_over_n_8_0565 (pixman_implementation_t * impl, -+ pixman_op_t op, -+ pixman_image_t * src_image, -+ pixman_image_t * mask_image, -+ pixman_image_t * dst_image, -+ int32_t src_x, -+ int32_t src_y, -+ int32_t mask_x, -+ int32_t mask_y, -+ int32_t dest_x, -+ int32_t dest_y, -+ int32_t width, -+ int32_t height) -+{ -+ uint32_t src; -+ uint16_t *dst; -+ uint8_t *mask; -+ int dst_stride, mask_stride; -+ -+ src = _pixman_image_get_solid (src_image, dst_image->bits.format); -+ -+ /* bail out if fully transparent */ -+ if (src == 0) -+ return; -+ -+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, -+ dst_stride, dst, 1); -+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, -+ mask_stride, mask, 1); -+ -+ armv6_composite_over_n_8_0565_asm (dst, mask, src, width, -+ dst_stride - width, mask_stride - width, height); -+} -+ -+#endif -+ - static const pixman_fast_path_t arm_simd_fast_paths[] = - { - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, arm_composite_over_8888_8888), -@@ -434,7 +621,10 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, arm_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, arm_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, arm_composite_over_n_8_8888), -- -+#if defined(__ARM_EABI__) && defined(__linux__) -+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, armv6_composite_over_n_8_0565), -+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, armv6_composite_over_n_8_0565), -+#endif - { PIXMAN_OP_NONE }, - }; - --- -cgit v0.8.3-6-g21f6 diff --git a/recipes/xorg-lib/pixman/pixman-0.13.2-neon1.patch b/recipes/xorg-lib/pixman/pixman-0.13.2-neon1.patch deleted file mode 100644 index b3bb762415..0000000000 --- a/recipes/xorg-lib/pixman/pixman-0.13.2-neon1.patch +++ /dev/null @@ -1,1702 +0,0 @@ -diff --git a/configure.ac b/configure.ac -index 063f6eb..bada55c 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -278,11 +278,12 @@ AC_SUBST(VMX_CFLAGS) - AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes) - - dnl Check for ARM SIMD instructions -+ARM_SIMD_CFLAGS="" - - have_arm_simd=no - AC_MSG_CHECKING(whether to use ARM SIMD assembler) - xserver_save_CFLAGS=$CFLAGS --CFLAGS="$CFLAGS $ARM_CFLAGS" -+CFLAGS="$CFLAGS $ARM_SIMD_CFLAGS" - AC_COMPILE_IFELSE([ - int main () { - asm("uqadd8 r1, r1, r2"); -@@ -302,7 +303,7 @@ fi - if test $have_arm_simd = yes ; then - AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD compiler intrinsics]) - else -- ARM_CFLAGS= -+ ARM_SIMD_CFLAGS= - fi - - AC_MSG_RESULT($have_arm_simd) -@@ -310,9 +311,48 @@ if test $enable_arm_simd = yes && test $have_arm_simd = no ; then - AC_MSG_ERROR([ARM SIMD intrinsics not detected]) - fi - --AC_SUBST(ARM_CFLAGS) -+dnl Check for ARM NEON instructions -+ARM_NEON_CFLAGS="-mcpu=cortex-a8 -mfpu=neon" -+ -+have_arm_neon=no -+AC_MSG_CHECKING(whether to use ARM NEON) -+xserver_save_CFLAGS=$CFLAGS -+CFLAGS="$CFLAGS $ARM_NEON_CFLAGS" -+AC_COMPILE_IFELSE([ -+#include <arm_neon.h> -+int main () { -+ uint8x8_t neon_test=vmov_n_u8(0); -+ return 0; -+}], have_arm_neon=yes) -+CFLAGS=$xserver_save_CFLAGS -+ -+AC_ARG_ENABLE(arm-neon, -+ [AC_HELP_STRING([--disable-arm-neon], -+ [disable ARM NEON fast paths])], -+ [enable_arm_neon=$enableval], [enable_arm_neon=auto]) -+ -+if test $enable_arm_neon = no ; then -+ have_arm_neon=disabled -+fi -+ -+if test $have_arm_neon = yes ; then -+ AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON compiler intrinsics]) -+else -+ ARM_NEON_CFLAGS= -+fi -+ -+AC_MSG_RESULT($have_arm_neon) -+if test $enable_arm_neon = yes && test $have_arm_neon = no ; then -+ AC_MSG_ERROR([ARM NEON intrinsics not detected]) -+fi -+ -+ -+AC_SUBST(ARM_SIMD_CFLAGS) -+AC_SUBST(ARM_NEON_CFLAGS) - - AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes) -+AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes) -+ - - - AC_ARG_ENABLE(gtk, -diff --git a/pixman/Makefile.am b/pixman/Makefile.am -index c4612ea..4c1ec6b 100644 ---- a/pixman/Makefile.am -+++ b/pixman/Makefile.am -@@ -80,15 +80,26 @@ libpixman_sse2_la_LIBADD = $(DEP_LIBS) - libpixman_1_la_LIBADD += libpixman-sse2.la - endif - --# arm code -+# arm simd code - if USE_ARM_SIMD - noinst_LTLIBRARIES += libpixman-arm-simd.la - libpixman_arm_simd_la_SOURCES = \ - pixman-arm-simd.c \ - pixman-arm-simd.h --libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS) $(ARM_CFLAGS) -+libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS) $(ARM_SIMD_CFLAGS) - libpixman_arm_simd_la_LIBADD = $(DEP_LIBS) - libpixman_1_la_LIBADD += libpixman-arm-simd.la - endif - -+# arm neon code -+if USE_ARM_NEON -+noinst_LTLIBRARIES += libpixman-arm-neon.la -+libpixman_arm_neon_la_SOURCES = \ -+ pixman-arm-neon.c \ -+ pixman-arm-neon.h -+libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS) $(ARM_NEON_CFLAGS) -+libpixman_arm_neon_la_LIBADD = $(DEP_LIBS) -+libpixman_1_la_LIBADD += libpixman-arm-neon.la -+endif -+ - -diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c -new file mode 100644 -index 0000000..10050e4 ---- /dev/null -+++ b/pixman/pixman-arm-neon.c -@@ -0,0 +1,1387 @@ -+/* -+ * Copyright © 2009 Mozilla Corporation -+ * -+ * Permission to use, copy, modify, distribute, and sell this software and its -+ * documentation for any purpose is hereby granted without fee, provided that -+ * the above copyright notice appear in all copies and that both that -+ * copyright notice and this permission notice appear in supporting -+ * documentation, and that the name of Mozilla Corporation not be used in -+ * advertising or publicity pertaining to distribution of the software without -+ * specific, written prior permission. Mozilla Corporation makes no -+ * representations about the suitability of this software for any purpose. It -+ * is provided "as is" without express or implied warranty. -+ * -+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS -+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY -+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN -+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING -+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS -+ * SOFTWARE. -+ * -+ * Author: Ian Rickards (ian.rickards@arm.com) -+ * -+ */ -+ -+#ifdef HAVE_CONFIG_H -+#include <config.h> -+#endif -+ -+#include "pixman-arm-neon.h" -+ -+#include <arm_neon.h> -+ -+ -+#if !defined(__ARMCC_VERSION) && !defined(FORCE_NO_NEON_INLINE_ASM) -+// [both armcc & gcc set __GNUC__] -+// Use GNU style inline asm on gcc, for best performance -+// Use intrinsics on armcc -+// This switch determines if any GNU style inline asm is allowed -+#define USE_NEON_INLINE_ASM -+#endif -+ -+ -+static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb) -+{ -+ uint16x8_t gb, b; -+ uint8x8x4_t res; -+ -+ res.val[3] = vdup_n_u8(0); -+ gb = vshrq_n_u16(rgb, 5); -+ b = vshrq_n_u16(rgb, 5+6); -+ res.val[0] = vmovn_u16(rgb); // get low 5 bits -+ res.val[1] = vmovn_u16(gb); // get mid 6 bits -+ res.val[2] = vmovn_u16(b); // get top 5 bits -+ -+ res.val[0] = vshl_n_u8(res.val[0], 3); // shift to top -+ res.val[1] = vshl_n_u8(res.val[1], 2); // shift to top -+ res.val[2] = vshl_n_u8(res.val[2], 3); // shift to top -+ -+ res.val[0] = vsri_n_u8(res.val[0], res.val[0], 5); -+ res.val[1] = vsri_n_u8(res.val[1], res.val[1], 6); -+ res.val[2] = vsri_n_u8(res.val[2], res.val[2], 5); -+ -+ return res; -+} -+ -+static force_inline uint16x8_t pack0565(uint8x8x4_t s) -+{ -+ uint16x8_t rgb, val_g, val_r; -+ -+ rgb = vshll_n_u8(s.val[2],8); -+ val_g = vshll_n_u8(s.val[1],8); -+ val_r = vshll_n_u8(s.val[0],8); -+ rgb = vsriq_n_u16(rgb, val_g, 5); -+ rgb = vsriq_n_u16(rgb, val_r, 5+6); -+ -+ return rgb; -+} -+ -+static force_inline uint8x8_t neon2mul(uint8x8_t x, uint8x8_t alpha) -+{ -+ uint16x8_t tmp,tmp2; -+ uint8x8_t res; -+ -+ tmp = vmull_u8(x,alpha); -+ tmp2 = vrshrq_n_u16(tmp,8); -+ res = vraddhn_u16(tmp,tmp2); -+ -+ return res; -+} -+ -+static force_inline uint8x8x4_t neon8mul(uint8x8x4_t x, uint8x8_t alpha) -+{ -+ uint16x8x4_t tmp; -+ uint8x8x4_t res; -+ uint16x8_t qtmp1,qtmp2; -+ -+ tmp.val[0] = vmull_u8(x.val[0],alpha); -+ tmp.val[1] = vmull_u8(x.val[1],alpha); -+ tmp.val[2] = vmull_u8(x.val[2],alpha); -+ tmp.val[3] = vmull_u8(x.val[3],alpha); -+ -+ qtmp1 = vrshrq_n_u16(tmp.val[0],8); -+ qtmp2 = vrshrq_n_u16(tmp.val[1],8); -+ res.val[0] = vraddhn_u16(tmp.val[0],qtmp1); -+ qtmp1 = vrshrq_n_u16(tmp.val[2],8); -+ res.val[1] = vraddhn_u16(tmp.val[1],qtmp2); -+ qtmp2 = vrshrq_n_u16(tmp.val[3],8); -+ res.val[2] = vraddhn_u16(tmp.val[2],qtmp1); -+ res.val[3] = vraddhn_u16(tmp.val[3],qtmp2); -+ -+ return res; -+} -+ -+static force_inline uint8x8x4_t neon8qadd(uint8x8x4_t x, uint8x8x4_t y) -+{ -+ uint8x8x4_t res; -+ -+ res.val[0] = vqadd_u8(x.val[0],y.val[0]); -+ res.val[1] = vqadd_u8(x.val[1],y.val[1]); -+ res.val[2] = vqadd_u8(x.val[2],y.val[2]); -+ res.val[3] = vqadd_u8(x.val[3],y.val[3]); -+ -+ return res; -+} -+ -+ -+void -+fbCompositeSrcAdd_8000x8000neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint8_t *dstLine, *dst; -+ uint8_t *srcLine, *src; -+ int dstStride, srcStride; -+ uint16_t w; -+ -+ fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); -+ fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); -+ -+ if (width>=8) -+ { -+ // Use overlapping 8-pixel method -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+ uint8_t *keep_dst; -+ -+#ifndef USE_NEON_INLINE_ASM -+ uint8x8_t sval,dval,temp; -+ -+ sval = vld1_u8((void*)src); -+ dval = vld1_u8((void*)dst); -+ keep_dst = dst; -+ -+ temp = vqadd_u8(dval,sval); -+ -+ src += (w & 7); -+ dst += (w & 7); -+ w -= (w & 7); -+ -+ while (w) -+ { -+ sval = vld1_u8((void*)src); -+ dval = vld1_u8((void*)dst); -+ -+ vst1_u8((void*)keep_dst,temp); -+ keep_dst = dst; -+ -+ temp = vqadd_u8(dval,sval); -+ -+ src+=8; -+ dst+=8; -+ w-=8; -+ } -+ vst1_u8((void*)keep_dst,temp); -+#else -+ asm volatile ( -+// avoid using d8-d15 (q4-q7) aapcs callee-save registers -+ "vld1.8 {d0}, [%[src]]\n\t" -+ "vld1.8 {d4}, [%[dst]]\n\t" -+ "mov %[keep_dst], %[dst]\n\t" -+ -+ "and ip, %[w], #7\n\t" -+ "add %[src], %[src], ip\n\t" -+ "add %[dst], %[dst], ip\n\t" -+ "subs %[w], %[w], ip\n\t" -+ "b 9f\n\t" -+// LOOP -+ "2:\n\t" -+ "vld1.8 {d0}, [%[src]]!\n\t" -+ "vld1.8 {d4}, [%[dst]]!\n\t" -+ "vst1.8 {d20}, [%[keep_dst]]\n\t" -+ "sub %[keep_dst], %[dst], #8\n\t" -+ "subs %[w], %[w], #8\n\t" -+ "9:\n\t" -+ "vqadd.u8 d20, d0, d4\n\t" -+ -+ "bne 2b\n\t" -+ -+ "1:\n\t" -+ "vst1.8 {d20}, [%[keep_dst]]\n\t" -+ -+ : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "+r" (keep_dst) -+ : -+ : "ip", "cc", "memory", "d0","d4", -+ "d20" -+ ); -+#endif -+ } -+ } -+ else -+ { -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ uint8x8_t sval, dval; -+ uint8_t *dst4, *dst2; -+ -+ if (w&4) -+ { -+ sval = vreinterpret_u8_u32(vld1_lane_u32((void*)src,vreinterpret_u32_u8(sval),1)); -+ dval = vreinterpret_u8_u32(vld1_lane_u32((void*)dst,vreinterpret_u32_u8(dval),1)); -+ dst4=dst; -+ src+=4; -+ dst+=4; -+ } -+ if (w&2) -+ { -+ sval = vreinterpret_u8_u16(vld1_lane_u16((void*)src,vreinterpret_u16_u8(sval),1)); -+ dval = vreinterpret_u8_u16(vld1_lane_u16((void*)dst,vreinterpret_u16_u8(dval),1)); -+ dst2=dst; -+ src+=2; -+ dst+=2; -+ } -+ if (w&1) -+ { -+ sval = vld1_lane_u8((void*)src,sval,1); -+ dval = vld1_lane_u8((void*)dst,dval,1); -+ } -+ -+ dval = vqadd_u8(dval,sval); -+ -+ if (w&1) -+ vst1_lane_u8((void*)dst,dval,1); -+ if (w&2) -+ vst1_lane_u16((void*)dst2,vreinterpret_u16_u8(dval),1); -+ if (w&4) -+ vst1_lane_u32((void*)dst4,vreinterpret_u32_u8(dval),1); -+ } -+ } -+} -+ -+ -+void -+fbCompositeSrc_8888x8888neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint32_t *dstLine, *dst; -+ uint32_t *srcLine, *src; -+ int dstStride, srcStride; -+ uint32_t w; -+ -+ fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); -+ -+ if (width>=8) -+ { -+ // Use overlapping 8-pixel method -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+ uint32_t *keep_dst; -+ -+#ifndef USE_NEON_INLINE_ASM -+ uint8x8x4_t sval,dval,temp; -+ -+ sval = vld4_u8((void*)src); -+ dval = vld4_u8((void*)dst); -+ keep_dst = dst; -+ -+ temp = neon8mul(dval,vmvn_u8(sval.val[3])); -+ temp = neon8qadd(sval,temp); -+ -+ src += (w & 7); -+ dst += (w & 7); -+ w -= (w & 7); -+ -+ while (w) -+ { -+ sval = vld4_u8((void*)src); -+ dval = vld4_u8((void*)dst); -+ -+ vst4_u8((void*)keep_dst,temp); -+ keep_dst = dst; -+ -+ temp = neon8mul(dval,vmvn_u8(sval.val[3])); -+ temp = neon8qadd(sval,temp); -+ -+ src+=8; -+ dst+=8; -+ w-=8; -+ } -+ vst4_u8((void*)keep_dst,temp); -+#else -+ asm volatile ( -+// avoid using d8-d15 (q4-q7) aapcs callee-save registers -+ "vld4.8 {d0-d3}, [%[src]]\n\t" -+ "vld4.8 {d4-d7}, [%[dst]]\n\t" -+ "mov %[keep_dst], %[dst]\n\t" -+ -+ "and ip, %[w], #7\n\t" -+ "add %[src], %[src], ip, LSL#2\n\t" -+ "add %[dst], %[dst], ip, LSL#2\n\t" -+ "subs %[w], %[w], ip\n\t" -+ "b 9f\n\t" -+// LOOP -+ "2:\n\t" -+ "vld4.8 {d0-d3}, [%[src]]!\n\t" -+ "vld4.8 {d4-d7}, [%[dst]]!\n\t" -+ "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" -+ "sub %[keep_dst], %[dst], #8*4\n\t" -+ "subs %[w], %[w], #8\n\t" -+ "9:\n\t" -+ "vmvn.8 d31, d3\n\t" -+ "vmull.u8 q10, d31, d4\n\t" -+ "vmull.u8 q11, d31, d5\n\t" -+ "vmull.u8 q12, d31, d6\n\t" -+ "vmull.u8 q13, d31, d7\n\t" -+ "vrshr.u16 q8, q10, #8\n\t" -+ "vrshr.u16 q9, q11, #8\n\t" -+ "vraddhn.u16 d20, q10, q8\n\t" -+ "vraddhn.u16 d21, q11, q9\n\t" -+ "vrshr.u16 q8, q12, #8\n\t" -+ "vrshr.u16 q9, q13, #8\n\t" -+ "vraddhn.u16 d22, q12, q8\n\t" -+ "vraddhn.u16 d23, q13, q9\n\t" -+// result in d20-d23 -+ "vqadd.u8 d20, d0, d20\n\t" -+ "vqadd.u8 d21, d1, d21\n\t" -+ "vqadd.u8 d22, d2, d22\n\t" -+ "vqadd.u8 d23, d3, d23\n\t" -+ -+ "bne 2b\n\t" -+ -+ "1:\n\t" -+ "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" -+ -+ : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "+r" (keep_dst) -+ : -+ : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", -+ "d16","d17","d18","d19","d20","d21","d22","d23" -+ ); -+#endif -+ } -+ } -+ else -+ { -+ uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL)); -+ -+ // Handle width<8 -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+ while (w>=2) -+ { -+ uint8x8_t sval,dval; -+ -+ /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */ -+ sval = vreinterpret_u8_u32(vld1_u32((void*)src)); -+ dval = vreinterpret_u8_u32(vld1_u32((void*)dst)); -+ dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector)); -+ vst1_u8((void*)dst,vqadd_u8(sval,dval)); -+ -+ src+=2; -+ dst+=2; -+ w-=2; -+ } -+ -+ if (w) -+ { -+ uint8x8_t sval,dval; -+ -+ /* single 32-bit pixel in lane 0 */ -+ sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src)); // only interested in lane 0 -+ dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); // only interested in lane 0 -+ dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector)); -+ vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0); -+ } -+ } -+ } -+} -+ -+ -+ -+void -+fbCompositeSrc_x888x0565neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint16_t *dstLine, *dst; -+ uint32_t *srcLine, *src; -+ int dstStride, srcStride; -+ uint32_t w; -+ -+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); -+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); -+ -+ if (width>=8) -+ { -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+ do { -+ while (w>=8) -+ { -+#ifndef USE_NEON_INLINE_ASM -+ vst1q_u16(dst, pack0565(vld4_u8((void*)src))); -+#else -+ asm volatile ( -+ "vld4.8 {d4-d7}, [%[src]]\n\t" -+ "vshll.u8 q0, d6, #8\n\t" -+ "vshll.u8 q1, d5, #8\n\t" -+ "vsriq.u16 q0, q1, #5\t\n" -+ "vshll.u8 q1, d4, #8\n\t" -+ "vsriq.u16 q0, q1, #11\t\n" -+ "vst1.16 {q0}, [%[dst]]\n\t" -+ : -+ : [dst] "r" (dst), [src] "r" (src) -+ : "memory", "d0","d1","d2","d3","d4","d5","d6","d7" -+ ); -+#endif -+ src+=8; -+ dst+=8; -+ w-=8; -+ } -+ if (w != 0) -+ { -+ src -= (8-w); -+ dst -= (8-w); -+ w = 8; // do another vector -+ } -+ } while (w!=0); -+ } -+ } -+ else -+ { -+ // Handle width<8 -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+ while (w>=2) -+ { -+ uint32x2_t sval, rgb, g, b; -+ sval = vld1_u32(src); -+ rgb = vshr_n_u32(sval,8-5); // r (5 bits) -+ g = vshr_n_u32(sval,8+8-6); // g to bottom byte -+ rgb = vsli_n_u32(rgb, g, 5); -+ b = vshr_n_u32(sval,8+8+8-5); // b to bottom byte -+ rgb = vsli_n_u32(rgb, b, 11); -+ vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0); -+ vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),2); -+ src+=2; -+ w-=2; -+ } -+ if (w) -+ { -+ uint32x2_t sval, rgb, g, b; -+ sval = vld1_dup_u32(src); -+ rgb = vshr_n_u32(sval,8-5); // r (5 bits) -+ g = vshr_n_u32(sval,8+8-6); // g to bottom byte -+ rgb = vsli_n_u32(rgb, g, 5); -+ b = vshr_n_u32(sval,8+8+8-5); // b to bottom byte -+ rgb = vsli_n_u32(rgb, b, 11); -+ vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0); -+ } -+ } -+ } -+} -+ -+ -+void -+fbCompositeSrc_8888x8x8888neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint32_t *dstLine, *dst; -+ uint32_t *srcLine, *src; -+ uint32_t mask; -+ int dstStride, srcStride; -+ uint32_t w; -+ uint8x8_t mask_alpha; -+ -+ fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); -+ -+ fbComposeGetSolid (pMask, mask, pDst->bits.format); -+ mask_alpha = vdup_n_u8((mask) >> 24); -+ -+ if (width>=8) -+ { -+ // Use overlapping 8-pixel method -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+ uint32_t *keep_dst; -+ -+#ifndef USE_NEON_INLINE_ASM -+ uint8x8x4_t sval,dval,temp; -+ -+ sval = vld4_u8((void*)src); -+ dval = vld4_u8((void*)dst); -+ keep_dst = dst; -+ -+ sval = neon8mul(sval,mask_alpha); -+ temp = neon8mul(dval,vmvn_u8(sval.val[3])); -+ temp = neon8qadd(sval,temp); -+ -+ src += (w & 7); -+ dst += (w & 7); -+ w -= (w & 7); -+ -+ while (w) -+ { -+ sval = vld4_u8((void*)src); -+ dval = vld4_u8((void*)dst); -+ -+ vst4_u8((void*)keep_dst,temp); -+ keep_dst = dst; -+ -+ sval = neon8mul(sval,mask_alpha); -+ temp = neon8mul(dval,vmvn_u8(sval.val[3])); -+ temp = neon8qadd(sval,temp); -+ -+ src+=8; -+ dst+=8; -+ w-=8; -+ } -+ vst4_u8((void*)keep_dst,temp); -+#else -+ asm volatile ( -+// avoid using d8-d15 (q4-q7) aapcs callee-save registers -+ "vdup.32 d30, %[mask]\n\t" -+ "vdup.8 d30, d30[3]\n\t" -+ -+ "vld4.8 {d0-d3}, [%[src]]\n\t" -+ "vld4.8 {d4-d7}, [%[dst]]\n\t" -+ "mov %[keep_dst], %[dst]\n\t" -+ -+ "and ip, %[w], #7\n\t" -+ "add %[src], %[src], ip, LSL#2\n\t" -+ "add %[dst], %[dst], ip, LSL#2\n\t" -+ "subs %[w], %[w], ip\n\t" -+ "b 9f\n\t" -+// LOOP -+ "2:\n\t" -+ "vld4.8 {d0-d3}, [%[src]]!\n\t" -+ "vld4.8 {d4-d7}, [%[dst]]!\n\t" -+ "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" -+ "sub %[keep_dst], %[dst], #8*4\n\t" -+ "subs %[w], %[w], #8\n\t" -+ -+ "9:\n\t" -+ "vmull.u8 q10, d30, d0\n\t" -+ "vmull.u8 q11, d30, d1\n\t" -+ "vmull.u8 q12, d30, d2\n\t" -+ "vmull.u8 q13, d30, d3\n\t" -+ "vrshr.u16 q8, q10, #8\n\t" -+ "vrshr.u16 q9, q11, #8\n\t" -+ "vraddhn.u16 d0, q10, q8\n\t" -+ "vraddhn.u16 d1, q11, q9\n\t" -+ "vrshr.u16 q9, q13, #8\n\t" -+ "vrshr.u16 q8, q12, #8\n\t" -+ "vraddhn.u16 d3, q13, q9\n\t" -+ "vraddhn.u16 d2, q12, q8\n\t" -+ -+ "vmvn.8 d31, d3\n\t" -+ "vmull.u8 q10, d31, d4\n\t" -+ "vmull.u8 q11, d31, d5\n\t" -+ "vmull.u8 q12, d31, d6\n\t" -+ "vmull.u8 q13, d31, d7\n\t" -+ "vrshr.u16 q8, q10, #8\n\t" -+ "vrshr.u16 q9, q11, #8\n\t" -+ "vraddhn.u16 d20, q10, q8\n\t" -+ "vrshr.u16 q8, q12, #8\n\t" -+ "vraddhn.u16 d21, q11, q9\n\t" -+ "vrshr.u16 q9, q13, #8\n\t" -+ "vraddhn.u16 d22, q12, q8\n\t" -+ "vraddhn.u16 d23, q13, q9\n\t" -+// result in d20-d23 -+ "vqadd.u8 d20, d0, d20\n\t" -+ "vqadd.u8 d21, d1, d21\n\t" -+ "vqadd.u8 d22, d2, d22\n\t" -+ "vqadd.u8 d23, d3, d23\n\t" -+ -+ "bne 2b\n\t" -+ -+ "1:\n\t" -+ "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" -+ -+ : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "+r" (keep_dst) -+ : [mask] "r" (mask) -+ : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", -+ "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27", -+ "d30","d31" -+ ); -+#endif -+ } -+ } -+ else -+ { -+ uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL)); -+ -+ // Handle width<8 -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+ while (w>=2) -+ { -+ uint8x8_t sval,dval; -+ -+ sval = vreinterpret_u8_u32(vld1_u32((void*)src)); -+ dval = vreinterpret_u8_u32(vld1_u32((void*)dst)); -+ -+ /* sval * const alpha_mul */ -+ sval = neon2mul(sval,mask_alpha); -+ -+ /* dval * 255-(src alpha) */ -+ dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector)); -+ -+ vst1_u8((void*)dst,vqadd_u8(sval,dval)); -+ -+ src+=2; -+ dst+=2; -+ w-=2; -+ } -+ -+ if (w) -+ { -+ uint8x8_t sval,dval; -+ -+ sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src)); -+ dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); -+ -+ /* sval * const alpha_mul */ -+ sval = neon2mul(sval,mask_alpha); -+ -+ /* dval * 255-(src alpha) */ -+ dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector)); -+ -+ vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0); -+ } -+ } -+ } -+} -+ -+ -+ -+void -+fbCompositeSolidMask_nx8x0565neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint32_t src, srca; -+ uint16_t *dstLine, *dst; -+ uint8_t *maskLine, *mask; -+ int dstStride, maskStride; -+ uint32_t w; -+ uint8x8_t sval2; -+ uint8x8x4_t sval8; -+ -+ fbComposeGetSolid(pSrc, src, pDst->bits.format); -+ -+ srca = src >> 24; -+ if (src == 0) -+ return; -+ -+ sval2=vreinterpret_u8_u32(vdup_n_u32(src)); -+ sval8.val[0]=vdup_lane_u8(sval2,0); -+ sval8.val[1]=vdup_lane_u8(sval2,1); -+ sval8.val[2]=vdup_lane_u8(sval2,2); -+ sval8.val[3]=vdup_lane_u8(sval2,3); -+ -+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); -+ fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); -+ -+ if (width>=8) -+ { -+ // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused -+ while (height--) -+ { -+ uint16_t *keep_dst; -+ -+ dst = dstLine; -+ dstLine += dstStride; -+ mask = maskLine; -+ maskLine += maskStride; -+ w = width; -+ -+#ifndef USE_NEON_INLINE_ASM -+ uint8x8_t alpha; -+ uint16x8_t dval, temp; -+ uint8x8x4_t sval8temp; -+ -+ alpha = vld1_u8((void*)mask); -+ dval = vld1q_u16((void*)dst); -+ keep_dst = dst; -+ -+ sval8temp = neon8mul(sval8,alpha); -+ temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); -+ -+ mask += (w & 7); -+ dst += (w & 7); -+ w -= (w & 7); -+ -+ while (w) -+ { -+ dval = vld1q_u16((void*)dst); -+ alpha = vld1_u8((void*)mask); -+ -+ vst1q_u16((void*)keep_dst,temp); -+ keep_dst = dst; -+ -+ sval8temp = neon8mul(sval8,alpha); -+ temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); -+ -+ mask+=8; -+ dst+=8; -+ w-=8; -+ } -+ vst1q_u16((void*)keep_dst,temp); -+#else -+ asm volatile ( -+ "vdup.32 d0, %[src]\n\t" -+ "vdup.8 d1, d0[1]\n\t" -+ "vdup.8 d2, d0[2]\n\t" -+ "vdup.8 d3, d0[3]\n\t" -+ "vdup.8 d0, d0[0]\n\t" -+ -+ "vld1.8 {q12}, [%[dst]]\n\t" -+ "vld1.8 {d31}, [%[mask]]\n\t" -+ "mov %[keep_dst], %[dst]\n\t" -+ -+ "and ip, %[w], #7\n\t" -+ "add %[mask], %[mask], ip\n\t" -+ "add %[dst], %[dst], ip, LSL#1\n\t" -+ "subs %[w], %[w], ip\n\t" -+ "b 9f\n\t" -+// LOOP -+ "2:\n\t" -+ -+ "vld1.16 {q12}, [%[dst]]!\n\t" -+ "vld1.8 {d31}, [%[mask]]!\n\t" -+ "vst1.16 {q10}, [%[keep_dst]]\n\t" -+ "sub %[keep_dst], %[dst], #8*2\n\t" -+ "subs %[w], %[w], #8\n\t" -+ "9:\n\t" -+// expand 0565 q12 to 8888 {d4-d7} -+ "vmovn.u16 d4, q12\t\n" -+ "vshr.u16 q11, q12, #5\t\n" -+ "vshr.u16 q10, q12, #6+5\t\n" -+ "vmovn.u16 d5, q11\t\n" -+ "vmovn.u16 d6, q10\t\n" -+ "vshl.u8 d4, d4, #3\t\n" -+ "vshl.u8 d5, d5, #2\t\n" -+ "vshl.u8 d6, d6, #3\t\n" -+ "vsri.u8 d4, d4, #5\t\n" -+ "vsri.u8 d5, d5, #6\t\n" -+ "vsri.u8 d6, d6, #5\t\n" -+ -+ "vmull.u8 q10, d31, d0\n\t" -+ "vmull.u8 q11, d31, d1\n\t" -+ "vmull.u8 q12, d31, d2\n\t" -+ "vmull.u8 q13, d31, d3\n\t" -+ "vrshr.u16 q8, q10, #8\n\t" -+ "vrshr.u16 q9, q11, #8\n\t" -+ "vraddhn.u16 d20, q10, q8\n\t" -+ "vraddhn.u16 d21, q11, q9\n\t" -+ "vrshr.u16 q9, q13, #8\n\t" -+ "vrshr.u16 q8, q12, #8\n\t" -+ "vraddhn.u16 d23, q13, q9\n\t" -+ "vraddhn.u16 d22, q12, q8\n\t" -+ -+// duplicate in 4/2/1 & 8pix vsns -+ "vmvn.8 d30, d23\n\t" -+ "vmull.u8 q14, d30, d6\n\t" -+ "vmull.u8 q13, d30, d5\n\t" -+ "vmull.u8 q12, d30, d4\n\t" -+ "vrshr.u16 q8, q14, #8\n\t" -+ "vrshr.u16 q9, q13, #8\n\t" -+ "vraddhn.u16 d6, q14, q8\n\t" -+ "vrshr.u16 q8, q12, #8\n\t" -+ "vraddhn.u16 d5, q13, q9\n\t" -+ "vqadd.u8 d6, d6, d22\n\t" // moved up -+ "vraddhn.u16 d4, q12, q8\n\t" -+// intentionally don't calculate alpha -+// result in d4-d6 -+ -+// "vqadd.u8 d6, d6, d22\n\t" ** moved up -+ "vqadd.u8 d5, d5, d21\n\t" -+ "vqadd.u8 d4, d4, d20\n\t" -+ -+// pack 8888 {d20-d23} to 0565 q10 -+ "vshll.u8 q10, d6, #8\n\t" -+ "vshll.u8 q3, d5, #8\n\t" -+ "vshll.u8 q2, d4, #8\n\t" -+ "vsri.u16 q10, q3, #5\t\n" -+ "vsri.u16 q10, q2, #11\t\n" -+ -+ "bne 2b\n\t" -+ -+ "1:\n\t" -+ "vst1.16 {q10}, [%[keep_dst]]\n\t" -+ -+ : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "+r" (keep_dst) -+ : [src] "r" (src) -+ : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", -+ "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", -+ "d30","d31" -+ ); -+#endif -+ } -+ } -+ else -+ { -+ while (height--) -+ { -+ void *dst4, *dst2; -+ -+ dst = dstLine; -+ dstLine += dstStride; -+ mask = maskLine; -+ maskLine += maskStride; -+ w = width; -+ -+ -+#ifndef USE_NEON_INLINE_ASM -+ uint8x8_t alpha; -+ uint16x8_t dval, temp; -+ uint8x8x4_t sval8temp; -+ -+ if (w&4) -+ { -+ alpha = vreinterpret_u8_u32(vld1_lane_u32((void*)mask,vreinterpret_u32_u8(alpha),1)); -+ dval = vreinterpretq_u16_u64(vld1q_lane_u64((void*)dst,vreinterpretq_u64_u16(dval),1)); -+ dst4=dst; -+ mask+=4; -+ dst+=4; -+ } -+ if (w&2) -+ { -+ alpha = vreinterpret_u8_u16(vld1_lane_u16((void*)mask,vreinterpret_u16_u8(alpha),1)); -+ dval = vreinterpretq_u16_u32(vld1q_lane_u32((void*)dst,vreinterpretq_u32_u16(dval),1)); -+ dst2=dst; -+ mask+=2; -+ dst+=2; -+ } -+ if (w&1) -+ { -+ alpha = vld1_lane_u8((void*)mask,alpha,1); -+ dval = vld1q_lane_u16((void*)dst,dval,1); -+ } -+ -+ sval8temp = neon8mul(sval8,alpha); -+ temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); -+ -+ if (w&1) -+ vst1q_lane_u16((void*)dst,temp,1); -+ if (w&2) -+ vst1q_lane_u32((void*)dst2,vreinterpretq_u32_u16(temp),1); -+ if (w&4) -+ vst1q_lane_u64((void*)dst4,vreinterpretq_u64_u16(temp),1); -+#else -+ asm volatile ( -+ "vdup.32 d0, %[src]\n\t" -+ "vdup.8 d1, d0[1]\n\t" -+ "vdup.8 d2, d0[2]\n\t" -+ "vdup.8 d3, d0[3]\n\t" -+ "vdup.8 d0, d0[0]\n\t" -+ -+ "tst %[w], #4\t\n" -+ "beq skip_load4\t\n" -+ -+ "vld1.64 {d25}, [%[dst]]\n\t" -+ "vld1.32 {d31[1]}, [%[mask]]\n\t" -+ "mov %[dst4], %[dst]\t\n" -+ "add %[mask], %[mask], #4\t\n" -+ "add %[dst], %[dst], #4*2\t\n" -+ -+ "skip_load4:\t\n" -+ "tst %[w], #2\t\n" -+ "beq skip_load2\t\n" -+ "vld1.32 {d24[1]}, [%[dst]]\n\t" -+ "vld1.16 {d31[1]}, [%[mask]]\n\t" -+ "mov %[dst2], %[dst]\t\n" -+ "add %[mask], %[mask], #2\t\n" -+ "add %[dst], %[dst], #2*2\t\n" -+ -+ "skip_load2:\t\n" -+ "tst %[w], #1\t\n" -+ "beq skip_load1\t\n" -+ "vld1.16 {d24[1]}, [%[dst]]\n\t" -+ "vld1.8 {d31[1]}, [%[mask]]\n\t" -+ -+ "skip_load1:\t\n" -+// expand 0565 q12 to 8888 {d4-d7} -+ "vmovn.u16 d4, q12\t\n" -+ "vshr.u16 q11, q12, #5\t\n" -+ "vshr.u16 q10, q12, #6+5\t\n" -+ "vmovn.u16 d5, q11\t\n" -+ "vmovn.u16 d6, q10\t\n" -+ "vshl.u8 d4, d4, #3\t\n" -+ "vshl.u8 d5, d5, #2\t\n" -+ "vshl.u8 d6, d6, #3\t\n" -+ "vsri.u8 d4, d4, #5\t\n" -+ "vsri.u8 d5, d5, #6\t\n" -+ "vsri.u8 d6, d6, #5\t\n" -+ -+ "vmull.u8 q10, d31, d0\n\t" -+ "vmull.u8 q11, d31, d1\n\t" -+ "vmull.u8 q12, d31, d2\n\t" -+ "vmull.u8 q13, d31, d3\n\t" -+ "vrshr.u16 q8, q10, #8\n\t" -+ "vrshr.u16 q9, q11, #8\n\t" -+ "vraddhn.u16 d20, q10, q8\n\t" -+ "vraddhn.u16 d21, q11, q9\n\t" -+ "vrshr.u16 q9, q13, #8\n\t" -+ "vrshr.u16 q8, q12, #8\n\t" -+ "vraddhn.u16 d23, q13, q9\n\t" -+ "vraddhn.u16 d22, q12, q8\n\t" -+ -+// duplicate in 4/2/1 & 8pix vsns -+ "vmvn.8 d30, d23\n\t" -+ "vmull.u8 q14, d30, d6\n\t" -+ "vmull.u8 q13, d30, d5\n\t" -+ "vmull.u8 q12, d30, d4\n\t" -+ "vrshr.u16 q8, q14, #8\n\t" -+ "vrshr.u16 q9, q13, #8\n\t" -+ "vraddhn.u16 d6, q14, q8\n\t" -+ "vrshr.u16 q8, q12, #8\n\t" -+ "vraddhn.u16 d5, q13, q9\n\t" -+ "vqadd.u8 d6, d6, d22\n\t" // moved up -+ "vraddhn.u16 d4, q12, q8\n\t" -+// intentionally don't calculate alpha -+// result in d4-d6 -+ -+// "vqadd.u8 d6, d6, d22\n\t" ** moved up -+ "vqadd.u8 d5, d5, d21\n\t" -+ "vqadd.u8 d4, d4, d20\n\t" -+ -+// pack 8888 {d20-d23} to 0565 q10 -+ "vshll.u8 q10, d6, #8\n\t" -+ "vshll.u8 q3, d5, #8\n\t" -+ "vshll.u8 q2, d4, #8\n\t" -+ "vsri.u16 q10, q3, #5\t\n" -+ "vsri.u16 q10, q2, #11\t\n" -+ -+ "tst %[w], #1\n\t" -+ "beq skip_store1\t\n" -+ "vst1.16 {d20[1]}, [%[dst]]\t\n" -+ "skip_store1:\t\n" -+ "tst %[w], #2\n\t" -+ "beq skip_store2\t\n" -+ "vst1.32 {d20[1]}, [%[dst2]]\t\n" -+ "skip_store2:\t\n" -+ "tst %[w], #4\n\t" -+ "beq skip_store4\t\n" -+ "vst1.16 {d21}, [%[dst4]]\t\n" -+ "skip_store4:\t\n" -+ -+ : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2) -+ : [src] "r" (src) -+ : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", -+ "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", -+ "d30","d31" -+ ); -+#endif -+ } -+ } -+} -+ -+ -+void -+fbCompositeSolidMask_nx8x8888neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint32_t src, srca; -+ uint32_t *dstLine, *dst; -+ uint8_t *maskLine, *mask; -+ int dstStride, maskStride; -+ uint32_t w; -+ uint8x8_t sval2; -+ uint8x8x4_t sval8; -+ uint8x8_t mask_selector=vreinterpret_u8_u64(vcreate_u64(0x0101010100000000ULL)); -+ uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL)); -+ -+ fbComposeGetSolid(pSrc, src, pDst->bits.format); -+ -+ srca = src >> 24; -+ if (src == 0) -+ return; -+ -+ sval2=vreinterpret_u8_u32(vdup_n_u32(src)); -+ sval8.val[0]=vdup_lane_u8(sval2,0); -+ sval8.val[1]=vdup_lane_u8(sval2,1); -+ sval8.val[2]=vdup_lane_u8(sval2,2); -+ sval8.val[3]=vdup_lane_u8(sval2,3); -+ -+ fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); -+ -+ if (width>=8) -+ { -+ // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused -+ while (height--) -+ { -+ uint32_t *keep_dst; -+ -+ dst = dstLine; -+ dstLine += dstStride; -+ mask = maskLine; -+ maskLine += maskStride; -+ w = width; -+ -+#ifndef USE_NEON_INLINE_ASM -+ uint8x8_t alpha; -+ uint8x8x4_t dval, temp; -+ -+ alpha = vld1_u8((void*)mask); -+ dval = vld4_u8((void*)dst); -+ keep_dst = dst; -+ -+ temp = neon8mul(sval8,alpha); -+ dval = neon8mul(dval,vmvn_u8(temp.val[3])); -+ temp = neon8qadd(temp,dval); -+ -+ mask += (w & 7); -+ dst += (w & 7); -+ w -= (w & 7); -+ -+ while (w) -+ { -+ alpha = vld1_u8((void*)mask); -+ dval = vld4_u8((void*)dst); -+ -+ vst4_u8((void*)keep_dst,temp); -+ keep_dst = dst; -+ -+ temp = neon8mul(sval8,alpha); -+ dval = neon8mul(dval,vmvn_u8(temp.val[3])); -+ temp = neon8qadd(temp,dval); -+ -+ mask+=8; -+ dst+=8; -+ w-=8; -+ } -+ vst4_u8((void*)keep_dst,temp); -+#else -+ asm volatile ( -+ "vdup.32 d0, %[src]\n\t" -+ "vdup.8 d1, d0[1]\n\t" -+ "vdup.8 d2, d0[2]\n\t" -+ "vdup.8 d3, d0[3]\n\t" -+ "vdup.8 d0, d0[0]\n\t" -+ -+ "vld4.8 {d4-d7}, [%[dst]]\n\t" -+ "vld1.8 {d31}, [%[mask]]\n\t" -+ "mov %[keep_dst], %[dst]\n\t" -+ -+ "and ip, %[w], #7\n\t" -+ "add %[mask], %[mask], ip\n\t" -+ "add %[dst], %[dst], ip, LSL#2\n\t" -+ "subs %[w], %[w], ip\n\t" -+ "b 9f\n\t" -+// LOOP -+ "2:\n\t" -+ "vld4.8 {d4-d7}, [%[dst]]!\n\t" -+ "vld1.8 {d31}, [%[mask]]!\n\t" -+ "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" -+ "sub %[keep_dst], %[dst], #8*4\n\t" -+ "subs %[w], %[w], #8\n\t" -+ "9:\n\t" -+ -+ "vmull.u8 q10, d31, d0\n\t" -+ "vmull.u8 q11, d31, d1\n\t" -+ "vmull.u8 q12, d31, d2\n\t" -+ "vmull.u8 q13, d31, d3\n\t" -+ "vrshr.u16 q8, q10, #8\n\t" -+ "vrshr.u16 q9, q11, #8\n\t" -+ "vraddhn.u16 d20, q10, q8\n\t" -+ "vraddhn.u16 d21, q11, q9\n\t" -+ "vrshr.u16 q9, q13, #8\n\t" -+ "vrshr.u16 q8, q12, #8\n\t" -+ "vraddhn.u16 d23, q13, q9\n\t" -+ "vraddhn.u16 d22, q12, q8\n\t" -+ -+ "vmvn.8 d30, d23\n\t" -+ "vmull.u8 q12, d30, d4\n\t" -+ "vmull.u8 q13, d30, d5\n\t" -+ "vmull.u8 q14, d30, d6\n\t" -+ "vmull.u8 q15, d30, d7\n\t" -+ -+ "vrshr.u16 q8, q12, #8\n\t" -+ "vrshr.u16 q9, q13, #8\n\t" -+ "vraddhn.u16 d4, q12, q8\n\t" -+ "vrshr.u16 q8, q14, #8\n\t" -+ "vraddhn.u16 d5, q13, q9\n\t" -+ "vrshr.u16 q9, q15, #8\n\t" -+ "vraddhn.u16 d6, q14, q8\n\t" -+ "vraddhn.u16 d7, q15, q9\n\t" -+// result in d4-d7 -+ -+ "vqadd.u8 d20, d4, d20\n\t" -+ "vqadd.u8 d21, d5, d21\n\t" -+ "vqadd.u8 d22, d6, d22\n\t" -+ "vqadd.u8 d23, d7, d23\n\t" -+ -+ "bne 2b\n\t" -+ -+ "1:\n\t" -+ "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" -+ -+ : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "+r" (keep_dst) -+ : [src] "r" (src) -+ : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", -+ "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", -+ "d30","d31" -+ ); -+#endif -+ } -+ } -+ else -+ { -+ while (height--) -+ { -+ uint8x8_t alpha; -+ -+ dst = dstLine; -+ dstLine += dstStride; -+ mask = maskLine; -+ maskLine += maskStride; -+ w = width; -+ -+ while (w>=2) -+ { -+ uint8x8_t dval, temp, res; -+ -+ alpha = vtbl1_u8(vreinterpret_u8_u16(vld1_dup_u16((void*)mask)), mask_selector); -+ dval = vld1_u8((void*)dst); -+ -+ temp = neon2mul(sval2,alpha); -+ res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector))); -+ -+ vst1_u8((void*)dst,res); -+ -+ mask+=2; -+ dst+=2; -+ w-=2; -+ } -+ if (w) -+ { -+ uint8x8_t dval, temp, res; -+ -+ alpha = vtbl1_u8(vld1_dup_u8((void*)mask), mask_selector); -+ dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); -+ -+ temp = neon2mul(sval2,alpha); -+ res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector))); -+ -+ vst1_lane_u32((void*)dst,vreinterpret_u32_u8(res),0); -+ } -+ } -+ } -+} -+ -+ -+void -+fbCompositeSrcAdd_8888x8x8neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint8_t *dstLine, *dst; -+ uint8_t *maskLine, *mask; -+ int dstStride, maskStride; -+ uint32_t w; -+ uint32_t src; -+ uint8x8_t sa; -+ -+ fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); -+ fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); -+ fbComposeGetSolid (pSrc, src, pDst->bits.format); -+ sa = vdup_n_u8((src) >> 24); -+ -+ if (width>=8) -+ { -+ // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ mask = maskLine; -+ maskLine += maskStride; -+ w = width; -+ -+ uint8x8_t mval, dval, res; -+ uint8_t *keep_dst; -+ -+ mval = vld1_u8((void *)mask); -+ dval = vld1_u8((void *)dst); -+ keep_dst = dst; -+ -+ res = vqadd_u8(neon2mul(mval,sa),dval); -+ -+ mask += (w & 7); -+ dst += (w & 7); -+ w -= w & 7; -+ -+ while (w) -+ { -+ mval = vld1_u8((void *)mask); -+ dval = vld1_u8((void *)dst); -+ vst1_u8((void *)keep_dst, res); -+ keep_dst = dst; -+ -+ res = vqadd_u8(neon2mul(mval,sa),dval); -+ -+ mask += 8; -+ dst += 8; -+ w -= 8; -+ } -+ vst1_u8((void *)keep_dst, res); -+ } -+ } -+ else -+ { -+ // Use 4/2/1 load/store method to handle 1-7 pixels -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ mask = maskLine; -+ maskLine += maskStride; -+ w = width; -+ -+ uint8x8_t mval, dval, res; -+ uint8_t *dst4, *dst2; -+ -+ if (w&4) -+ { -+ mval = vreinterpret_u8_u32(vld1_lane_u32((void *)mask, vreinterpret_u32_u8(mval), 1)); -+ dval = vreinterpret_u8_u32(vld1_lane_u32((void *)dst, vreinterpret_u32_u8(dval), 1)); -+ -+ dst4 = dst; -+ mask += 4; -+ dst += 4; -+ } -+ if (w&2) -+ { -+ mval = vreinterpret_u8_u16(vld1_lane_u16((void *)mask, vreinterpret_u16_u8(mval), 1)); -+ dval = vreinterpret_u8_u16(vld1_lane_u16((void *)dst, vreinterpret_u16_u8(dval), 1)); -+ dst2 = dst; -+ mask += 2; -+ dst += 2; -+ } -+ if (w&1) -+ { -+ mval = vld1_lane_u8((void *)mask, mval, 1); -+ dval = vld1_lane_u8((void *)dst, dval, 1); -+ } -+ -+ res = vqadd_u8(neon2mul(mval,sa),dval); -+ -+ if (w&1) -+ vst1_lane_u8((void *)dst, res, 1); -+ if (w&2) -+ vst1_lane_u16((void *)dst2, vreinterpret_u16_u8(res), 1); -+ if (w&4) -+ vst1_lane_u32((void *)dst4, vreinterpret_u32_u8(res), 1); -+ } -+ } -+} -+ -diff --git a/pixman/pixman-arm-neon.h b/pixman/pixman-arm-neon.h -new file mode 100644 -index 0000000..bab4dee ---- /dev/null -+++ b/pixman/pixman-arm-neon.h -@@ -0,0 +1,137 @@ -+/* -+ * Copyright © 2009 Mozilla Corporation -+ * -+ * Permission to use, copy, modify, distribute, and sell this software and its -+ * documentation for any purpose is hereby granted without fee, provided that -+ * the above copyright notice appear in all copies and that both that -+ * copyright notice and this permission notice appear in supporting -+ * documentation, and that the name of Mozilla Corporation not be used in -+ * advertising or publicity pertaining to distribution of the software without -+ * specific, written prior permission. Mozilla Corporation makes no -+ * representations about the suitability of this software for any purpose. It -+ * is provided "as is" without express or implied warranty. -+ * -+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS -+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY -+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN -+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING -+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS -+ * SOFTWARE. -+ * -+ * Author: Ian Rickards (ian.rickards@arm.com) -+ * -+ */ -+ -+#include "pixman-private.h" -+ -+#ifdef USE_ARM_NEON -+ -+static inline pixman_bool_t pixman_have_arm_neon(void) { return TRUE; } -+ -+#else -+#define pixman_have_arm_neon() FALSE -+#endif -+ -+#ifdef USE_ARM_NEON -+ -+void -+fbCompositeSrcAdd_8000x8000neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+ -+void -+fbCompositeSrc_8888x8888neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+ -+void -+fbCompositeSrc_8888x8x8888neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+ -+void -+fbCompositeSolidMask_nx8x0565neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+ -+void -+fbCompositeSolidMask_nx8x8888neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+ -+void -+fbCompositeSrc_x888x0565neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+ -+void -+fbCompositeSrcAdd_8888x8x8neon (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+ -+#endif /* USE_ARM_NEON */ -diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c -index 1388517..b13947a 100644 ---- a/pixman/pixman-pict.c -+++ b/pixman/pixman-pict.c -@@ -34,6 +34,7 @@ - #include "pixman-mmx.h" - #include "pixman-vmx.h" - #include "pixman-sse2.h" -+#include "pixman-arm-neon.h" - #include "pixman-arm-simd.h" - #include "pixman-combine32.h" - -@@ -1518,6 +1519,31 @@ static const FastPathInfo vmx_fast_paths[] = - }; - #endif - -+#ifdef USE_ARM_NEON -+static const FastPathInfo arm_neon_fast_paths[] = -+{ -+ { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 }, -+ { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000neon, 0 }, -+ { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565neon, 0 }, -+ { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565neon, 0 }, -+ { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565neon, 0 }, -+ { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888neon, NEED_SOLID_MASK }, -+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888neon, NEED_SOLID_MASK }, -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8x0565neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 }, -+ { PIXMAN_OP_NONE }, -+}; -+#endif -+ - #ifdef USE_ARM_SIMD - static const FastPathInfo arm_simd_fast_paths[] = - { -@@ -1893,6 +1919,11 @@ pixman_image_composite (pixman_op_t op, - info = get_fast_path (vmx_fast_paths, op, pSrc, pMask, pDst, pixbuf); - #endif - -+#ifdef USE_ARM_NEON -+ if (!info && pixman_have_arm_neon()) -+ info = get_fast_path (arm_neon_fast_paths, op, pSrc, pMask, pDst, pixbuf); -+#endif -+ - #ifdef USE_ARM_SIMD - if (!info && pixman_have_arm_simd()) - info = get_fast_path (arm_simd_fast_paths, op, pSrc, pMask, pDst, pixbuf); diff --git a/recipes/xorg-lib/pixman/pixman-28986.patch b/recipes/xorg-lib/pixman/pixman-28986.patch deleted file mode 100644 index f5ba4c302e..0000000000 --- a/recipes/xorg-lib/pixman/pixman-28986.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 7b7860d61fb1526acdf010dd8fd644bbf1396b9e Mon Sep 17 00:00:00 2001 -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Fri, 28 Aug 2009 22:34:21 +0300 -Subject: [PATCH] ARM: workaround for gcc bug in vshll_n_u8 intrinsic - -Some versions of gcc (cs2009q1, 4.4.1) incorrectly reject -shift operand having value >= 8, claiming that it is out of -range. So inline assembly is used as a workaround. ---- - pixman/pixman-arm-neon.c | 6 ++++++ - 1 files changed, 6 insertions(+), 0 deletions(-) - -diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c -index 4125d1b..3e7f566 100644 ---- a/pixman/pixman-arm-neon.c -+++ b/pixman/pixman-arm-neon.c -@@ -64,6 +64,12 @@ unpack0565 (uint16x8_t rgb) - return res; - } - -+#ifdef USE_GCC_INLINE_ASM -+/* Some versions of gcc have problems with vshll_n_u8 intrinsic (Bug 23576) */ -+#define vshll_n_u8(a, n) ({ uint16x8_t r; \ -+ asm ("vshll.u8 %q0, %P1, %2\n" : "=w" (r) : "w" (a), "i" (n)); r; }) -+#endif -+ - static force_inline uint16x8_t - pack0565 (uint8x8x4_t s) - { --- -1.5.4.3 - diff --git a/recipes/xorg-lib/pixman/pixman-arm.patch b/recipes/xorg-lib/pixman/pixman-arm.patch deleted file mode 100644 index 91dda03b7c..0000000000 --- a/recipes/xorg-lib/pixman/pixman-arm.patch +++ /dev/null @@ -1,632 +0,0 @@ -From: Jeff Muizelaar <jmuizelaar@mozilla.com> -Date: Wed, 17 Sep 2008 19:53:20 +0000 (-0400) -Subject: Add support for ARMv6 SIMD fastpaths. -X-Git-Url: http://gitweb.freedesktop.org/?p=pixman.git;a=commitdiff;h=d0b181f347ef4720d130beee3f03196afbd28aba - -Add support for ARMv6 SIMD fastpaths. ---- - ---- a/configure.ac -+++ b/configure.ac -@@ -277,6 +277,44 @@ AC_SUBST(VMX_CFLAGS) - - AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes) - -+dnl Check for ARM -+ -+have_armv6_simd=no -+AC_MSG_CHECKING(whether to use ARM assembler) -+xserver_save_CFLAGS=$CFLAGS -+CFLAGS="$CFLAGS $ARM_CFLAGS" -+AC_COMPILE_IFELSE([ -+int main () { -+ asm("uqadd8 r1, r1, r2"); -+ return 0; -+}], have_armv6_simd=yes) -+CFLAGS=$xserver_save_CFLAGS -+ -+AC_ARG_ENABLE(arm, -+ [AC_HELP_STRING([--disable-arm], -+ [disable ARM fast paths])], -+ [enable_arm=$enableval], [enable_arm=auto]) -+ -+if test $enable_arm = no ; then -+ have_armv6_simd=disabled -+fi -+ -+if test $have_armv6_simd = yes ; then -+ AC_DEFINE(USE_ARM, 1, [use ARM compiler intrinsics]) -+else -+ ARM_CFLAGS= -+fi -+ -+AC_MSG_RESULT($have_armv6_simd) -+if test $enable_arm = yes && test $have_armv6_simd = no ; then -+ AC_MSG_ERROR([ARM intrinsics not detected]) -+fi -+ -+AC_SUBST(ARM_CFLAGS) -+ -+AM_CONDITIONAL(USE_ARM, test $have_armv6_simd = yes) -+ -+ - AC_ARG_ENABLE(gtk, - [AC_HELP_STRING([--enable-gtk], - [enable tests using GTK+ [default=auto]])], ---- a/pixman/Makefile.am -+++ b/pixman/Makefile.am -@@ -79,3 +79,15 @@ libpixman_sse2_la_LIBADD = $(DEP_LIBS) - libpixman_1_la_LIBADD += libpixman-sse2.la - endif - -+# arm code -+if USE_ARM -+noinst_LTLIBRARIES += libpixman-arm.la -+libpixman_arm_la_SOURCES = \ -+ pixman-arm.c \ -+ pixman-arm.h -+libpixman_arm_la_CFLAGS = $(DEP_CFLAGS) $(ARM_CFLAGS) -+libpixman_arm_la_LIBADD = $(DEP_LIBS) -+libpixman_1_la_LIBADD += libpixman-arm.la -+endif -+ -+ ---- /dev/null -+++ b/pixman/pixman-arm.c -@@ -0,0 +1,409 @@ -+/* -+ * Copyright © 2008 Mozilla Corporation -+ * -+ * Permission to use, copy, modify, distribute, and sell this software and its -+ * documentation for any purpose is hereby granted without fee, provided that -+ * the above copyright notice appear in all copies and that both that -+ * copyright notice and this permission notice appear in supporting -+ * documentation, and that the name of Mozilla Corporation not be used in -+ * advertising or publicity pertaining to distribution of the software without -+ * specific, written prior permission. Mozilla Corporation makes no -+ * representations about the suitability of this software for any purpose. It -+ * is provided "as is" without express or implied warranty. -+ * -+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS -+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY -+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN -+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING -+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS -+ * SOFTWARE. -+ * -+ * Author: Jeff Muizelaar (jeff@infidigm.net) -+ * -+ */ -+#ifdef HAVE_CONFIG_H -+#include <config.h> -+#endif -+ -+#include "pixman-arm.h" -+ -+void -+fbCompositeSrcAdd_8000x8000arm (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint8_t *dstLine, *dst; -+ uint8_t *srcLine, *src; -+ int dstStride, srcStride; -+ uint16_t w; -+ uint8_t s, d; -+ -+ fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); -+ fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); -+ -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+ while (w && (unsigned long)dst & 3) -+ { -+ s = *src; -+ d = *dst; -+ asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s)); -+ *dst = d; -+ -+ dst++; -+ src++; -+ w--; -+ } -+ -+ while (w >= 4) -+ { -+ asm("uqadd8 %0, %1, %2" : "=r"(*(uint32_t*)dst) : "r"(*(uint32_t*)src), "r"(*(uint32_t*)dst)); -+ dst += 4; -+ src += 4; -+ w -= 4; -+ } -+ -+ while (w) -+ { -+ s = *src; -+ d = *dst; -+ asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s)); -+ *dst = d; -+ -+ dst++; -+ src++; -+ w--; -+ } -+ } -+ -+} -+ -+void -+fbCompositeSrc_8888x8888arm (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint32_t *dstLine, *dst; -+ uint32_t *srcLine, *src; -+ int dstStride, srcStride; -+ uint16_t w; -+ uint32_t component_half = 0x800080; -+ uint32_t upper_component_mask = 0xff00ff00; -+ uint32_t alpha_mask = 0xff; -+ -+ fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); -+ -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+//#define inner_branch -+ asm volatile ( -+ "cmp %[w], #0\n\t" -+ "beq 2f\n\t" -+ "1:\n\t" -+ /* load dest */ -+ "ldr r5, [%[src]], #4\n\t" -+#ifdef inner_branch -+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. -+ * The 0x0 case also allows us to avoid doing an unecessary data -+ * write which is more valuable so we only check for that */ -+ "cmp r5, #0x1000000\n\t" -+ "blt 3f\n\t" -+ -+ /* = 255 - alpha */ -+ "sub r8, %[alpha_mask], r5, lsr #24\n\t" -+ -+ "ldr r4, [%[dest]] \n\t" -+ -+#else -+ "ldr r4, [%[dest]] \n\t" -+ -+ /* = 255 - alpha */ -+ "sub r8, %[alpha_mask], r5, lsr #24\n\t" -+#endif -+ "uxtb16 r6, r4\n\t" -+ "uxtb16 r7, r4, ror #8\n\t" -+ -+ /* multiply by 257 and divide by 65536 */ -+ "mla r6, r6, r8, %[component_half]\n\t" -+ "mla r7, r7, r8, %[component_half]\n\t" -+ -+ "uxtab16 r6, r6, r6, ror #8\n\t" -+ "uxtab16 r7, r7, r7, ror #8\n\t" -+ -+ /* recombine the 0xff00ff00 bytes of r6 and r7 */ -+ "and r7, %[upper_component_mask]\n\t" -+ "uxtab16 r6, r7, r6, ror #8\n\t" -+ -+ "uqadd8 r5, r6, r5\n\t" -+ -+#ifdef inner_branch -+ "3:\n\t" -+ -+#endif -+ "str r5, [%[dest]], #4\n\t" -+ /* increment counter and jmp to top */ -+ "subs %[w], %[w], #1\n\t" -+ "bne 1b\n\t" -+ "2:\n\t" -+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) -+ : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask), -+ [alpha_mask] "r" (alpha_mask) -+ : "r4", "r5", "r6", "r7", "r8", "cc", "memory" -+ ); -+ } -+} -+ -+void -+fbCompositeSrc_8888x8x8888arm (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint32_t *dstLine, *dst; -+ uint32_t *srcLine, *src; -+ uint32_t mask; -+ int dstStride, srcStride; -+ uint16_t w; -+ uint32_t component_half = 0x800080; -+ uint32_t alpha_mask = 0xff; -+ -+ fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); -+ -+ fbComposeGetSolid (pMask, mask, pDst->bits.format); -+ mask = (mask) >> 24; -+ -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+//#define inner_branch -+ asm volatile ( -+ "cmp %[w], #0\n\t" -+ "beq 2f\n\t" -+ "1:\n\t" -+ /* load dest */ -+ "ldr r5, [%[src]], #4\n\t" -+#ifdef inner_branch -+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. -+ * The 0x0 case also allows us to avoid doing an unecessary data -+ * write which is more valuable so we only check for that */ -+ "cmp r5, #0x1000000\n\t" -+ "blt 3f\n\t" -+ -+#endif -+ "ldr r4, [%[dest]] \n\t" -+ -+ "uxtb16 r6, r5\n\t" -+ "uxtb16 r7, r5, ror #8\n\t" -+ -+ /* multiply by alpha (r8) then by 257 and divide by 65536 */ -+ "mla r6, r6, %[mask_alpha], %[component_half]\n\t" -+ "mla r7, r7, %[mask_alpha], %[component_half]\n\t" -+ -+ "uxtab16 r6, r6, r6, ror #8\n\t" -+ "uxtab16 r7, r7, r7, ror #8\n\t" -+ -+ "uxtb16 r6, r6, ror #8\n\t" -+ "uxtb16 r7, r7, ror #8\n\t" -+ -+ /* recombine */ -+ "orr r5, r6, r7, lsl #8\n\t" -+ -+ "uxtb16 r6, r4\n\t" -+ "uxtb16 r7, r4, ror #8\n\t" -+ -+ /* 255 - alpha */ -+ "sub r8, %[alpha_mask], r5, lsr #24\n\t" -+ -+ /* multiply by alpha (r8) then by 257 and divide by 65536 */ -+ "mla r6, r6, r8, %[component_half]\n\t" -+ "mla r7, r7, r8, %[component_half]\n\t" -+ -+ "uxtab16 r6, r6, r6, ror #8\n\t" -+ "uxtab16 r7, r7, r7, ror #8\n\t" -+ -+ "uxtb16 r6, r6, ror #8\n\t" -+ "uxtb16 r7, r7, ror #8\n\t" -+ -+ /* recombine */ -+ "orr r6, r6, r7, lsl #8\n\t" -+ -+ "uqadd8 r5, r6, r5\n\t" -+ -+#ifdef inner_branch -+ "3:\n\t" -+ -+#endif -+ "str r5, [%[dest]], #4\n\t" -+ /* increment counter and jmp to top */ -+ "subs %[w], %[w], #1\n\t" -+ "bne 1b\n\t" -+ "2:\n\t" -+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) -+ : [component_half] "r" (component_half), [mask_alpha] "r" (mask), -+ [alpha_mask] "r" (alpha_mask) -+ : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" -+ ); -+ } -+} -+ -+void -+fbCompositeSolidMask_nx8x8888arm (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint32_t src, srca; -+ uint32_t *dstLine, *dst; -+ uint8_t *maskLine, *mask; -+ int dstStride, maskStride; -+ uint16_t w; -+ -+ fbComposeGetSolid(pSrc, src, pDst->bits.format); -+ -+ srca = src >> 24; -+ if (src == 0) -+ return; -+ -+ uint32_t component_mask = 0xff00ff; -+ uint32_t component_half = 0x800080; -+ -+ uint32_t src_hi = (src >> 8) & component_mask; -+ uint32_t src_lo = src & component_mask; -+ -+ fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); -+ fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); -+ -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ mask = maskLine; -+ maskLine += maskStride; -+ w = width; -+ -+//#define inner_branch -+ asm volatile ( -+ "cmp %[w], #0\n\t" -+ "beq 2f\n\t" -+ "1:\n\t" -+ /* load mask */ -+ "ldrb r5, [%[mask]], #1\n\t" -+#ifdef inner_branch -+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. -+ * The 0x0 case also allows us to avoid doing an unecessary data -+ * write which is more valuable so we only check for that */ -+ /* 0x1000000 is the least value that contains alpha all values -+ * less than it have a 0 alpha value */ -+ "cmp r5, #0x0\n\t" -+ "beq 3f\n\t" -+ -+#endif -+ "ldr r4, [%[dest]] \n\t" -+ -+ /* multiply by alpha (r8) then by 257 and divide by 65536 */ -+ "mla r6, %[src_lo], r5, %[component_half]\n\t" -+ "mla r7, %[src_hi], r5, %[component_half]\n\t" -+ -+ "uxtab16 r6, r6, r6, ror #8\n\t" -+ "uxtab16 r7, r7, r7, ror #8\n\t" -+ -+ "uxtb16 r6, r6, ror #8\n\t" -+ "uxtb16 r7, r7, ror #8\n\t" -+ -+ /* recombine */ -+ "orr r5, r6, r7, lsl #8\n\t" -+ -+ "uxtb16 r6, r4\n\t" -+ "uxtb16 r7, r4, ror #8\n\t" -+ -+ /* we could simplify this to use 'sub' if we were -+ * willing to give up a register for alpha_mask */ -+ "mvn r8, r5\n\t" -+ "mov r8, r8, lsr #24\n\t" -+ -+ /* multiply by alpha (r8) then by 257 and divide by 65536 */ -+ "mla r6, r6, r8, %[component_half]\n\t" -+ "mla r7, r7, r8, %[component_half]\n\t" -+ -+ "uxtab16 r6, r6, r6, ror #8\n\t" -+ "uxtab16 r7, r7, r7, ror #8\n\t" -+ -+ "uxtb16 r6, r6, ror #8\n\t" -+ "uxtb16 r7, r7, ror #8\n\t" -+ -+ /* recombine */ -+ "orr r6, r6, r7, lsl #8\n\t" -+ -+ "uqadd8 r5, r6, r5\n\t" -+ -+#ifdef inner_branch -+ "3:\n\t" -+ -+#endif -+ "str r5, [%[dest]], #4\n\t" -+ /* increment counter and jmp to top */ -+ "subs %[w], %[w], #1\n\t" -+ "bne 1b\n\t" -+ "2:\n\t" -+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask) -+ : [component_half] "r" (component_half), -+ [src_hi] "r" (src_hi), [src_lo] "r" (src_lo) -+ : "r4", "r5", "r6", "r7", "r8", "cc", "memory" -+ ); -+ } -+} ---- /dev/null -+++ b/pixman/pixman-arm.h -@@ -0,0 +1,94 @@ -+/* -+ * Copyright © 2008 Mozilla Corporation -+ * -+ * Permission to use, copy, modify, distribute, and sell this software and its -+ * documentation for any purpose is hereby granted without fee, provided that -+ * the above copyright notice appear in all copies and that both that -+ * copyright notice and this permission notice appear in supporting -+ * documentation, and that the name of Mozilla Corporation not be used in -+ * advertising or publicity pertaining to distribution of the software without -+ * specific, written prior permission. Mozilla Corporation makes no -+ * representations about the suitability of this software for any purpose. It -+ * is provided "as is" without express or implied warranty. -+ * -+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS -+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY -+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN -+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING -+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS -+ * SOFTWARE. -+ * -+ * Author: Jeff Muizelaar (jeff@infidigm.net) -+ * -+ */ -+ -+#include "pixman-private.h" -+ -+#ifdef USE_ARM -+ -+static inline pixman_bool_t pixman_have_arm(void) { return TRUE; } -+ -+#else -+#define pixman_have_arm() FALSE -+#endif -+ -+#ifdef USE_ARM -+ -+void -+fbCompositeSrcAdd_8000x8000arm (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+void -+fbCompositeSrc_8888x8888arm (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+ -+void -+fbCompositeSrc_8888x8x8888arm (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+void -+fbCompositeSolidMask_nx8x8888arm (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height); -+ -+ -+#endif /* USE_ARM */ ---- a/pixman/pixman-pict.c -+++ b/pixman/pixman-pict.c -@@ -34,6 +34,7 @@ - #include "pixman-mmx.h" - #include "pixman-vmx.h" - #include "pixman-sse2.h" -+#include "pixman-arm.h" - #include "pixman-combine32.h" - - #ifdef __GNUC__ -@@ -1479,6 +1480,26 @@ static const FastPathInfo vmx_fast_paths - }; - #endif - -+#ifdef USE_ARM -+static const FastPathInfo arm_fast_paths[] = -+{ -+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888arm, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888arm, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888arm, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888arm, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK }, -+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK }, -+ -+ { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000arm, 0 }, -+ -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 }, -+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 }, -+ -+ { PIXMAN_OP_NONE }, -+}; -+#endif - - static const FastPathInfo c_fast_paths[] = - { -@@ -1829,6 +1850,12 @@ pixman_image_composite (pixman_op_t - if (!info && pixman_have_vmx()) - info = get_fast_path (vmx_fast_paths, op, pSrc, pMask, pDst, pixbuf); - #endif -+ -+#ifdef USE_ARM -+ if (!info && pixman_have_arm()) -+ info = get_fast_path (arm_fast_paths, op, pSrc, pMask, pDst, pixbuf); -+#endif -+ - if (!info) - info = get_fast_path (c_fast_paths, op, pSrc, pMask, pDst, pixbuf); - diff --git a/recipes/xorg-lib/pixman/pixman-x888-565.patch b/recipes/xorg-lib/pixman/pixman-x888-565.patch deleted file mode 100644 index a3fa331710..0000000000 --- a/recipes/xorg-lib/pixman/pixman-x888-565.patch +++ /dev/null @@ -1,68 +0,0 @@ -From: Vladimir Vukicevic <vladimir@slide.(none)> -Date: Wed, 17 Sep 2008 20:01:31 +0000 (-0400) -Subject: Add SRC x888x0565 C fast path -X-Git-Url: http://gitweb.freedesktop.org/?p=pixman.git;a=commitdiff;h=7180230d4d87c55dfef1e17a0cc3b125d45aa3a0 - -Add SRC x888x0565 C fast path ---- - ---- a/pixman/pixman-pict.c -+++ b/pixman/pixman-pict.c -@@ -759,6 +759,46 @@ fbCompositeSrc_8888x0565 (pixman_op_t op - } - } - -+ -+void -+fbCompositeSrc_x888x0565 (pixman_op_t op, -+ pixman_image_t * pSrc, -+ pixman_image_t * pMask, -+ pixman_image_t * pDst, -+ int16_t xSrc, -+ int16_t ySrc, -+ int16_t xMask, -+ int16_t yMask, -+ int16_t xDst, -+ int16_t yDst, -+ uint16_t width, -+ uint16_t height) -+{ -+ uint16_t *dstLine, *dst; -+ uint32_t *srcLine, *src, s; -+ int dstStride, srcStride; -+ uint16_t w; -+ -+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); -+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); -+ -+ while (height--) -+ { -+ dst = dstLine; -+ dstLine += dstStride; -+ src = srcLine; -+ srcLine += srcStride; -+ w = width; -+ -+ while (w--) -+ { -+ s = READ(pSrc, src++); -+ WRITE(pDst, dst, cvt8888to0565(s)); -+ dst++; -+ } -+ } -+} -+ - void - fbCompositeSrcAdd_8000x8000 (pixman_op_t op, - pixman_image_t * pSrc, -@@ -1568,6 +1608,10 @@ static const FastPathInfo c_fast_paths[] - { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrcSrc_nxn, 0 }, - { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrcSrc_nxn, 0 }, - #endif -+ { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565, 0 }, -+ { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565, 0 }, -+ { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565, 0 }, -+ { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565, 0 }, - { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcIn_8x8, 0 }, - { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSolidMaskIn_nx8x8, 0 }, - { PIXMAN_OP_NONE }, diff --git a/recipes/xorg-lib/pixman/prefetch.patch b/recipes/xorg-lib/pixman/prefetch.patch deleted file mode 100644 index c2e856ec25..0000000000 --- a/recipes/xorg-lib/pixman/prefetch.patch +++ /dev/null @@ -1,298 +0,0 @@ -From d0044bfbd596f22ed1560579ea6537b39f3dc1af Mon Sep 17 00:00:00 2001 -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Thu, 29 Oct 2009 19:06:42 +0000 -Subject: ARM: Don't emit prefetch code if prefetch distance is set to 0 - -Also it is now possible to disable prefetch globally with -a configuration macro ---- -diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S -index bca499a..35e6a7e 100644 ---- a/pixman/pixman-arm-neon-asm.S -+++ b/pixman/pixman-arm-neon-asm.S -@@ -219,33 +219,33 @@ - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vshll.u8 q14, d16, #8 -- add PF_X, PF_X, #8 -+ PF add PF_X, PF_X, #8 - vshll.u8 q8, d19, #8 -- tst PF_CTL, #0xF -+ PF tst PF_CTL, #0xF - vsri.u8 d6, d6, #5 -- addne PF_X, PF_X, #8 -+ PF addne PF_X, PF_X, #8 - vmvn.8 d3, d3 -- subne PF_CTL, PF_CTL, #1 -+ PF subne PF_CTL, PF_CTL, #1 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - vmull.u8 q10, d3, d6 -- pld [PF_SRC, PF_X, lsl #src_bpp_shift] -+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 -- pld [PF_DST, PF_X, lsl #dst_bpp_shift] -+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vsri.u16 q14, q8, #5 -- cmp PF_X, ORIG_W -+ PF cmp PF_X, ORIG_W - vshll.u8 q9, d18, #8 - vrshr.u16 q13, q10, #8 -- subge PF_X, PF_X, ORIG_W -+ PF subge PF_X, PF_X, ORIG_W - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 -- subges PF_CTL, PF_CTL, #0x10 -+ PF subges PF_CTL, PF_CTL, #0x10 - vsri.u16 q14, q9, #11 -- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 -- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vraddhn.u16 d22, q12, q15 - vst1.16 {d28, d29}, [DST_W, :128]! - .endm -@@ -323,20 +323,20 @@ generate_composite_function \ - - .macro pixman_composite_src_8888_0565_process_pixblock_tail_head - vsri.u16 q14, q8, #5 -- add PF_X, PF_X, #8 -- tst PF_CTL, #0xF -+ PF add PF_X, PF_X, #8 -+ PF tst PF_CTL, #0xF - vld4.8 {d0, d1, d2, d3}, [SRC]! -- addne PF_X, PF_X, #8 -- subne PF_CTL, PF_CTL, #1 -+ PF addne PF_X, PF_X, #8 -+ PF subne PF_CTL, PF_CTL, #1 - vsri.u16 q14, q9, #11 -- cmp PF_X, ORIG_W -- pld [PF_SRC, PF_X, lsl #src_bpp_shift] -+ PF cmp PF_X, ORIG_W -+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! -- subge PF_X, PF_X, ORIG_W -- subges PF_CTL, PF_CTL, #0x10 -+ PF subge PF_X, PF_X, ORIG_W -+ PF subges PF_CTL, PF_CTL, #0x10 - vshll.u8 q14, d2, #8 -- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vshll.u8 q9, d0, #8 - .endm - -@@ -363,20 +363,20 @@ generate_composite_function \ - - .macro pixman_composite_add_8000_8000_process_pixblock_tail_head - vld1.8 {d0, d1, d2, d3}, [SRC]! -- add PF_X, PF_X, #32 -- tst PF_CTL, #0xF -+ PF add PF_X, PF_X, #32 -+ PF tst PF_CTL, #0xF - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! -- addne PF_X, PF_X, #32 -- subne PF_CTL, PF_CTL, #1 -+ PF addne PF_X, PF_X, #32 -+ PF subne PF_CTL, PF_CTL, #1 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! -- cmp PF_X, ORIG_W -- pld [PF_SRC, PF_X, lsl #src_bpp_shift] -- pld [PF_DST, PF_X, lsl #dst_bpp_shift] -- subge PF_X, PF_X, ORIG_W -- subges PF_CTL, PF_CTL, #0x10 -+ PF cmp PF_X, ORIG_W -+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] -+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] -+ PF subge PF_X, PF_X, ORIG_W -+ PF subges PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 -- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 - .endm - -@@ -418,32 +418,32 @@ generate_composite_function \ - .macro pixman_composite_over_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 -- add PF_X, PF_X, #8 -- tst PF_CTL, #0xF -+ PF add PF_X, PF_X, #8 -+ PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 -- addne PF_X, PF_X, #8 -- subne PF_CTL, PF_CTL, #1 -+ PF addne PF_X, PF_X, #8 -+ PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 -- cmp PF_X, ORIG_W -+ PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - vld4.8 {d0, d1, d2, d3}, [SRC]! -- pld [PF_SRC, PF_X, lsl #src_bpp_shift] -+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 -- pld [PF_DST, PF_X, lsl #dst_bpp_shift] -+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -- subge PF_X, PF_X, ORIG_W -+ PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 -- subges PF_CTL, PF_CTL, #0x10 -+ PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 -- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 -- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 - .endm - -diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h -index d276ab9..a2941ae 100644 ---- a/pixman/pixman-arm-neon-asm.h -+++ b/pixman/pixman-arm-neon-asm.h -@@ -58,6 +58,11 @@ - #define RESPECT_STRICT_ALIGNMENT 1 - - /* -+ * If set to nonzero value, prefetch is globally disabled -+ */ -+#define PREFETCH_GLOBALLY_DISABLED 0 -+ -+/* - * Definitions of supplementary pixld/pixst macros (for partial load/store of - * pixel data) - */ -@@ -218,37 +223,43 @@ - * pixels processing like simple copy. Anyway, having prefetch is a must - * when working with graphics data. - */ -+.macro PF a, x:vararg -+.if (ADVANCED_PREFETCH_ENABLED != 0) && (PREFETCH_GLOBALLY_DISABLED == 0) -+ a x -+.endif -+.endm -+ - .macro cache_preload std_increment, boost_increment - .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) - .if regs_shortage -- ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ -+ PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ - .endif - .if std_increment != 0 -- add PF_X, PF_X, #std_increment -+ PF add PF_X, PF_X, #std_increment - .endif -- tst PF_CTL, #0xF -- addne PF_X, PF_X, #boost_increment -- subne PF_CTL, PF_CTL, #1 -- cmp PF_X, ORIG_W -+ PF tst PF_CTL, #0xF -+ PF addne PF_X, PF_X, #boost_increment -+ PF subne PF_CTL, PF_CTL, #1 -+ PF cmp PF_X, ORIG_W - .if src_bpp_shift >= 0 -- pld [PF_SRC, PF_X, lsl #src_bpp_shift] -+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - .endif - .if dst_r_bpp != 0 -- pld [PF_DST, PF_X, lsl #dst_bpp_shift] -+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - .endif - .if mask_bpp_shift >= 0 -- pld [PF_MASK, PF_X, lsl #mask_bpp_shift] -+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] - .endif -- subge PF_X, PF_X, ORIG_W -- subges PF_CTL, PF_CTL, #0x10 -+ PF subge PF_X, PF_X, ORIG_W -+ PF subges PF_CTL, PF_CTL, #0x10 - .if src_bpp_shift >= 0 -- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - .endif - .if dst_r_bpp != 0 -- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - .endif - .if mask_bpp_shift >= 0 -- ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! -+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - .endif - .endif - .endm -@@ -297,6 +308,12 @@ fname: - PF_DST .req r12 - PF_MASK .req r14 - -+.if prefetch_distance == 0 -+ .set ADVANCED_PREFETCH_ENABLED, 0 -+.else -+ .set ADVANCED_PREFETCH_ENABLED, 1 -+.endif -+ - .if mask_bpp == 0 - ORIG_W .req r7 /* saved original width */ - DUMMY .req r8 /* temporary register */ -@@ -374,12 +391,12 @@ fname: - ldr MASK_STRIDE, [sp, #52] - .endif - mov DST_R, DST_W -- mov PF_SRC, SRC -- mov PF_DST, DST_R -- mov PF_MASK, MASK -- mov PF_CTL, H, lsl #4 -- /* pf_ctl = 10 | ((h - 1) << 4) */ -- add PF_CTL, #(prefetch_distance - 0x10) -+ PF mov PF_SRC, SRC -+ PF mov PF_DST, DST_R -+ PF mov PF_MASK, MASK -+ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ -+ PF mov PF_CTL, H, lsl #4 -+ PF add PF_CTL, #(prefetch_distance - 0x10) - - init - .if regs_shortage -@@ -412,7 +429,7 @@ fname: - .else - add DST_R, DST_R, #lowbit - .endif -- add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) -+ PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) - sub W, W, #(lowbit * 8 / dst_w_bpp) - 1: - .endif -@@ -444,7 +461,7 @@ fname: - (src_basereg - pixblock_size * src_bpp / 64), SRC - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK -- add PF_X, PF_X, #pixblock_size -+ PF add PF_X, PF_X, #pixblock_size - process_pixblock_head - cache_preload 0, pixblock_size - subs W, W, #(pixblock_size * 2) -@@ -468,7 +485,7 @@ fname: - pixld chunk_size, src_bpp, src_basereg, SRC - pixld chunk_size, mask_bpp, mask_basereg, MASK - pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R -- add PF_X, PF_X, #chunk_size -+ PF add PF_X, PF_X, #chunk_size - 1: - .endif - .endr --- -cgit v0.8.2 diff --git a/recipes/xorg-lib/pixman/remove-broken.patch b/recipes/xorg-lib/pixman/remove-broken.patch deleted file mode 100644 index fd025b4bbd..0000000000 --- a/recipes/xorg-lib/pixman/remove-broken.patch +++ /dev/null @@ -1,826 +0,0 @@ -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Sun, 26 Jul 2009 22:21:26 +0000 (+0300) -Subject: ARM: Removal of unused/broken NEON code -X-Git-Url: http://siarhei.siamashka.name/gitweb/?p=pixman.git;a=commitdiff_plain;h=7ef2322eefcccc28a2d45c0da22c0fee88b8f464 - -ARM: Removal of unused/broken NEON code ---- - -diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c -index 4125d1b..9404c70 100644 ---- a/pixman/pixman-arm-neon.c -+++ b/pixman/pixman-arm-neon.c -@@ -1895,710 +1895,6 @@ pixman_fill_neon (uint32_t *bits, - #endif - } - --/* TODO: is there a more generic way of doing this being introduced? */ --#define NEON_SCANLINE_BUFFER_PIXELS (1024) -- --static inline void --neon_quadword_copy (void * dst, -- void * src, -- uint32_t count, /* of quadwords */ -- uint32_t trailer_count /* of bytes */) --{ -- uint8_t *t_dst = dst, *t_src = src; -- -- /* Uses aligned multi-register loads to maximise read bandwidth -- * on uncached memory such as framebuffers -- * The accesses do not have the aligned qualifiers, so that the copy -- * may convert between aligned-uncached and unaligned-cached memory. -- * It is assumed that the CPU can infer alignedness from the address. -- */ -- --#ifdef USE_GCC_INLINE_ASM -- -- asm volatile ( -- " cmp %[count], #8 \n" -- " blt 1f @ skip oversized fragments \n" -- "0: @ start with eight quadwords at a time \n" -- " sub %[count], %[count], #8 \n" -- " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n" -- " vld1.8 {d20, d21, d22, d23}, [%[src]]! \n" -- " vld1.8 {d24, d25, d26, d27}, [%[src]]! \n" -- " vld1.8 {d28, d29, d30, d31}, [%[src]]! \n" -- " cmp %[count], #8 \n" -- " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n" -- " vst1.8 {d20, d21, d22, d23}, [%[dst]]! \n" -- " vst1.8 {d24, d25, d26, d27}, [%[dst]]! \n" -- " vst1.8 {d28, d29, d30, d31}, [%[dst]]! \n" -- " bge 0b \n" -- "1: @ four quadwords \n" -- " tst %[count], #4 \n" -- " beq 2f @ skip oversized fragment \n" -- " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n" -- " vld1.8 {d20, d21, d22, d23}, [%[src]]! \n" -- " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n" -- " vst1.8 {d20, d21, d22, d23}, [%[dst]]! \n" -- "2: @ two quadwords \n" -- " tst %[count], #2 \n" -- " beq 3f @ skip oversized fragment \n" -- " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n" -- " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n" -- "3: @ one quadword \n" -- " tst %[count], #1 \n" -- " beq 4f @ skip oversized fragment \n" -- " vld1.8 {d16, d17}, [%[src]]! \n" -- " vst1.8 {d16, d17}, [%[dst]]! \n" -- "4: @ end \n" -- -- /* Clobbered input registers marked as input/outputs */ -- : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count) -- -- /* No unclobbered inputs */ -- : -- -- /* Clobbered vector registers */ -- : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", -- "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"); -- --#else -- -- while (count >= 8) -- { -- uint8x16x4_t t1 = vld4q_u8 (t_src); -- uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t)); -- -- t_src += sizeof(uint8x16x4_t) * 2; -- vst4q_u8 (t_dst, t1); -- vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2); -- t_dst += sizeof(uint8x16x4_t) * 2; -- count -= 8; -- } -- -- if (count & 4) -- { -- uint8x16x4_t t1 = vld4q_u8 (t_src); -- -- t_src += sizeof(uint8x16x4_t); -- vst4q_u8 (t_dst, t1); -- t_dst += sizeof(uint8x16x4_t); -- } -- -- if (count & 2) -- { -- uint8x8x4_t t1 = vld4_u8 (t_src); -- -- t_src += sizeof(uint8x8x4_t); -- vst4_u8 (t_dst, t1); -- t_dst += sizeof(uint8x8x4_t); -- } -- -- if (count & 1) -- { -- uint8x16_t t1 = vld1q_u8 (t_src); -- -- t_src += sizeof(uint8x16_t); -- vst1q_u8 (t_dst, t1); -- t_dst += sizeof(uint8x16_t); -- } -- --#endif /* !USE_GCC_INLINE_ASM */ -- -- if (trailer_count) -- { -- if (trailer_count & 8) -- { -- uint8x8_t t1 = vld1_u8 (t_src); -- -- t_src += sizeof(uint8x8_t); -- vst1_u8 (t_dst, t1); -- t_dst += sizeof(uint8x8_t); -- } -- -- if (trailer_count & 4) -- { -- *((uint32_t*) t_dst) = *((uint32_t*) t_src); -- -- t_dst += 4; -- t_src += 4; -- } -- -- if (trailer_count & 2) -- { -- *((uint16_t*) t_dst) = *((uint16_t*) t_src); -- -- t_dst += 2; -- t_src += 2; -- } -- -- if (trailer_count & 1) -- { -- *t_dst++ = *t_src++; -- } -- } --} -- --static inline void --solid_over_565_8_pix_neon (uint32_t glyph_colour, -- uint16_t *dest, -- uint8_t * in_mask, -- uint32_t dest_stride, /* bytes, not elements */ -- uint32_t mask_stride, -- uint32_t count /* 8-pixel groups */) --{ -- /* Inner loop of glyph blitter (solid colour, alpha mask) */ -- --#ifdef USE_GCC_INLINE_ASM -- -- asm volatile ( -- " vld4.8 {d20[], d21[], d22[], d23[]}, [%[glyph_colour]] @ splat solid colour components \n" -- "0: @ loop \n" -- " vld1.16 {d0, d1}, [%[dest]] @ load first pixels from framebuffer \n" -- " vld1.8 {d17}, [%[in_mask]] @ load alpha mask of glyph \n" -- " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n" -- " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n" -- " vmvn d18, d17 @ we need the inverse mask for the background \n" -- " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n" -- " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n" -- " vshrn.u16 d4, q0, #3 @ unpack green \n" -- " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n" -- " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n" -- " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n" -- " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n" -- " vmull.u8 q2, d4, d18 @ ...green... \n" -- " vmull.u8 q3, d6, d18 @ ...blue \n" -- " subs %[count], %[count], #1 @ decrement/test loop counter \n" -- " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n" -- " vmlal.u8 q2, d17, d21 @ ...green... \n" -- " vmlal.u8 q3, d17, d20 @ ...blue \n" -- " add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait \n" -- " vsri.16 q1, q2, #5 @ pack green behind red \n" -- " vsri.16 q1, q3, #11 @ pack blue into pixels \n" -- " vst1.16 {d2, d3}, [%[dest]] @ store composited pixels \n" -- " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n" -- " bne 0b @ next please \n" -- -- /* Clobbered registers marked as input/outputs */ -- : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count) -- -- /* Inputs */ -- : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour) -- -- /* Clobbers, including the inputs we modify, and potentially lots of memory */ -- : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d19", -- "d20", "d21", "d22", "d23", "d24", "d25", "cc", "memory" -- ); -- --#else -- -- uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour); -- -- while (count--) -- { -- uint16x8_t pixels = vld1q_u16 (dest); -- uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8); -- uint8x8_t mask_image = vmvn_u8 (mask); -- -- uint8x8_t t_red = vshrn_n_u16 (pixels, 8); -- uint8x8_t t_green = vshrn_n_u16 (pixels, 3); -- uint8x8_t t_blue = vshrn_n_u16 (vsli_n_u8 (pixels, pixels, 5), 2); -- -- uint16x8_t s_red = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image); -- uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image); -- uint16x8_t s_blue = vmull_u8 (t_blue, mask_image); -- -- s_red = vmlal (s_red, mask, solid_colour.val[2]); -- s_green = vmlal (s_green, mask, solid_colour.val[1]); -- s_blue = vmlal (s_blue, mask, solid_colour.val[0]); -- -- pixels = vsri_n_u16 (s_red, s_green, 5); -- pixels = vsri_n_u16 (pixels, s_blue, 11); -- vst1q_u16 (dest, pixels); -- -- dest += dest_stride; -- mask += mask_stride; -- } -- --#endif --} -- --#if 0 /* this is broken currently */ --static void --neon_composite_over_n_8_0565 (pixman_implementation_t * impl, -- pixman_op_t op, -- pixman_image_t * src_image, -- pixman_image_t * mask_image, -- pixman_image_t * dst_image, -- int32_t src_x, -- int32_t src_y, -- int32_t mask_x, -- int32_t mask_y, -- int32_t dest_x, -- int32_t dest_y, -- int32_t width, -- int32_t height) --{ -- uint32_t src, srca; -- uint16_t *dst_line, *aligned_line; -- uint8_t *mask_line; -- uint32_t dst_stride, mask_stride; -- uint32_t kernel_count, copy_count, copy_tail; -- uint8_t kernel_offset, copy_offset; -- -- src = _pixman_image_get_solid (src_image, dst_image->bits.format); -- -- /* bail out if fully transparent or degenerate */ -- srca = src >> 24; -- if (src == 0) -- return; -- -- if (width == 0 || height == 0) -- return; -- -- if (width > NEON_SCANLINE_BUFFER_PIXELS) -- { -- /* split the blit, so we can use a fixed-size scanline buffer -- * TODO: there must be a more elegant way of doing this. -- */ -- int x; -- for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) -- { -- neon_composite_over_n_8_0565 ( -- impl, op, -- src_image, mask_image, dst_image, -- src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y, -- (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height); -- } -- -- return; -- } -- -- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); -- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); -- -- /* keep within minimum number of aligned quadwords on width -- * while also keeping the minimum number of columns to process -- */ -- { -- unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF; -- unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF; -- unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF; -- -- /* the fast copy should be quadword aligned */ -- copy_offset = dst_line - ((uint16_t*) aligned_left); -- aligned_line = dst_line - copy_offset; -- copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4); -- copy_tail = 0; -- -- if (aligned_right - aligned_left > ceiling_length) -- { -- /* unaligned routine is tightest */ -- kernel_count = (uint32_t) (ceiling_length >> 4); -- kernel_offset = copy_offset; -- } -- else -- { -- /* aligned routine is equally tight, so it is safer to align */ -- kernel_count = copy_count; -- kernel_offset = 0; -- } -- -- /* We should avoid reading beyond scanline ends for safety */ -- if (aligned_line < (dst_line - dest_x) || -- (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width)) -- { -- /* switch to precise read */ -- copy_offset = kernel_offset = 0; -- aligned_line = dst_line; -- kernel_count = (uint32_t) (ceiling_length >> 4); -- copy_count = (width * sizeof(*dst_line)) >> 4; -- copy_tail = (width * sizeof(*dst_line)) & 0xF; -- } -- } -- -- { -- uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */ -- uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; -- int y = height; -- -- /* row-major order */ -- /* left edge, middle block, right edge */ -- for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride) -- { -- /* We don't want to overrun the edges of the glyph, -- * so realign the edge data into known buffers -- */ -- neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF); -- -- /* Uncached framebuffer access is really, really slow -- * if we do it piecemeal. It should be much faster if we -- * grab it all at once. One scanline should easily fit in -- * L1 cache, so this should not waste RAM bandwidth. -- */ -- neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail); -- -- /* Apply the actual filter */ -- solid_over_565_8_pix_neon ( -- src, scan_line + kernel_offset, -- glyph_line + kernel_offset, 8 * sizeof(*dst_line), -- 8, kernel_count); -- -- /* Copy the modified scanline back */ -- neon_quadword_copy (dst_line, scan_line + copy_offset, -- width >> 3, (width & 7) * 2); -- } -- } --} --#endif -- --#ifdef USE_GCC_INLINE_ASM -- --static inline void --plain_over_565_8_pix_neon (uint32_t colour, -- uint16_t *dest, -- uint32_t dest_stride, /* bytes, not elements */ -- uint32_t count /* 8-pixel groups */) --{ -- /* Inner loop for plain translucent rects -- * (solid colour without alpha mask) -- */ -- asm volatile ( -- " vld4.8 {d20[], d21[], d22[], d23[]}, [%[colour]] @ solid colour load/splat \n" -- " vmull.u8 q12, d23, d22 @ premultiply alpha red \n" -- " vmull.u8 q13, d23, d21 @ premultiply alpha green \n" -- " vmull.u8 q14, d23, d20 @ premultiply alpha blue \n" -- " vmvn d18, d23 @ inverse alpha for background \n" -- "0: @ loop\n" -- " vld1.16 {d0, d1}, [%[dest]] @ load first pixels from framebuffer \n" -- " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n" -- " vshrn.u16 d4, q0, #3 @ unpack green \n" -- " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n" -- " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n" -- " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n" -- " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n" -- " vmov q0, q12 @ retrieve foreground red \n" -- " vmlal.u8 q0, d2, d18 @ blend red - my kingdom for a four-operand MLA \n" -- " vmov q1, q13 @ retrieve foreground green \n" -- " vmlal.u8 q1, d4, d18 @ blend green \n" -- " vmov q2, q14 @ retrieve foreground blue \n" -- " vmlal.u8 q2, d6, d18 @ blend blue \n" -- " subs %[count], %[count], #1 @ decrement/test loop counter \n" -- " vsri.16 q0, q1, #5 @ pack green behind red \n" -- " vsri.16 q0, q2, #11 @ pack blue into pixels \n" -- " vst1.16 {d0, d1}, [%[dest]] @ store composited pixels \n" -- " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n" -- " bne 0b @ next please \n" -- -- /* Clobbered registers marked as input/outputs */ -- : [dest] "+r" (dest), [count] "+r" (count) -- -- /* Inputs */ -- : [dest_stride] "r" (dest_stride), [colour] "r" (&colour) -- -- /* Clobbers, including the inputs we modify, and -- * potentially lots of memory -- */ -- : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d18", "d19", -- "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", -- "cc", "memory" -- ); --} -- --static void --neon_composite_over_n_0565 (pixman_implementation_t * impl, -- pixman_op_t op, -- pixman_image_t * src_image, -- pixman_image_t * mask_image, -- pixman_image_t * dst_image, -- int32_t src_x, -- int32_t src_y, -- int32_t mask_x, -- int32_t mask_y, -- int32_t dest_x, -- int32_t dest_y, -- int32_t width, -- int32_t height) --{ -- uint32_t src, srca; -- uint16_t *dst_line, *aligned_line; -- uint32_t dst_stride; -- uint32_t kernel_count, copy_count, copy_tail; -- uint8_t kernel_offset, copy_offset; -- -- src = _pixman_image_get_solid (src_image, dst_image->bits.format); -- -- /* bail out if fully transparent */ -- srca = src >> 24; -- if (src == 0) -- return; -- -- if (width == 0 || height == 0) -- return; -- -- if (width > NEON_SCANLINE_BUFFER_PIXELS) -- { -- /* split the blit, so we can use a fixed-size scanline buffer * -- * TODO: there must be a more elegant way of doing this. -- */ -- int x; -- -- for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) -- { -- neon_composite_over_n_0565 ( -- impl, op, -- src_image, mask_image, dst_image, -- src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y, -- (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height); -- } -- return; -- } -- -- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); -- -- /* keep within minimum number of aligned quadwords on width -- * while also keeping the minimum number of columns to process -- */ -- { -- unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF; -- unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF; -- unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF; -- -- /* the fast copy should be quadword aligned */ -- copy_offset = dst_line - ((uint16_t*) aligned_left); -- aligned_line = dst_line - copy_offset; -- copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4); -- copy_tail = 0; -- -- if (aligned_right - aligned_left > ceiling_length) -- { -- /* unaligned routine is tightest */ -- kernel_count = (uint32_t) (ceiling_length >> 4); -- kernel_offset = copy_offset; -- } -- else -- { -- /* aligned routine is equally tight, so it is safer to align */ -- kernel_count = copy_count; -- kernel_offset = 0; -- } -- -- /* We should avoid reading beyond scanline ends for safety */ -- if (aligned_line < (dst_line - dest_x) || -- (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width)) -- { -- /* switch to precise read */ -- copy_offset = kernel_offset = 0; -- aligned_line = dst_line; -- kernel_count = (uint32_t) (ceiling_length >> 4); -- copy_count = (width * sizeof(*dst_line)) >> 4; -- copy_tail = (width * sizeof(*dst_line)) & 0xF; -- } -- } -- -- { -- uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */ -- -- /* row-major order */ -- /* left edge, middle block, right edge */ -- for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride) -- { -- /* Uncached framebuffer access is really, really slow if we do it piecemeal. -- * It should be much faster if we grab it all at once. -- * One scanline should easily fit in L1 cache, so this should -- * not waste RAM bandwidth. -- */ -- neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail); -- -- /* Apply the actual filter */ -- plain_over_565_8_pix_neon ( -- src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count); -- -- /* Copy the modified scanline back */ -- neon_quadword_copy ( -- dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2); -- } -- } --} -- --static inline void --ARGB8_over_565_8_pix_neon (uint32_t *src, -- uint16_t *dest, -- uint32_t src_stride, /* bytes, not elements */ -- uint32_t count /* 8-pixel groups */) --{ -- asm volatile ( -- "0: @ loop\n" -- " pld [%[src], %[src_stride]] @ preload from next scanline \n" -- " vld1.16 {d0, d1}, [%[dest]] @ load pixels from framebuffer \n" -- " vld4.8 {d20, d21, d22, d23},[%[src]]! @ load source image pixels \n" -- " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n" -- " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n" -- " vshrn.u16 d4, q0, #3 @ unpack green \n" -- " vmvn d18, d23 @ we need the inverse alpha for the background \n" -- " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n" -- " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n" -- " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n" -- " vmull.u8 q1, d2, d18 @ apply inverse alpha to background red... \n" -- " vmull.u8 q2, d4, d18 @ ...green... \n" -- " vmull.u8 q3, d6, d18 @ ...blue \n" -- " subs %[count], %[count], #1 @ decrement/test loop counter \n" -- " vmlal.u8 q1, d23, d22 @ add blended foreground red... \n" -- " vmlal.u8 q2, d23, d21 @ ...green... \n" -- " vmlal.u8 q3, d23, d20 @ ...blue \n" -- " vsri.16 q1, q2, #5 @ pack green behind red \n" -- " vsri.16 q1, q3, #11 @ pack blue into pixels \n" -- " vst1.16 {d2, d3}, [%[dest]]! @ store composited pixels \n" -- " bne 0b @ next please \n" -- -- /* Clobbered registers marked as input/outputs */ -- : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count) -- -- /* Inputs */ -- : [src_stride] "r" (src_stride) -- -- /* Clobbers, including the inputs we modify, and potentially lots of memory */ -- : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d20", -- "d21", "d22", "d23", "cc", "memory" -- ); --} -- --static void --neon_composite_over_8888_0565 (pixman_implementation_t * impl, -- pixman_op_t op, -- pixman_image_t * src_image, -- pixman_image_t * mask_image, -- pixman_image_t * dst_image, -- int32_t src_x, -- int32_t src_y, -- int32_t mask_x, -- int32_t mask_y, -- int32_t dest_x, -- int32_t dest_y, -- int32_t width, -- int32_t height) --{ -- uint32_t *src_line; -- uint16_t *dst_line, *aligned_line; -- uint32_t dst_stride, src_stride; -- uint32_t kernel_count, copy_count, copy_tail; -- uint8_t kernel_offset, copy_offset; -- -- /* we assume mask is opaque -- * so the only alpha to deal with is embedded in src -- */ -- if (width > NEON_SCANLINE_BUFFER_PIXELS) -- { -- /* split the blit, so we can use a fixed-size scanline buffer */ -- int x; -- for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) -- { -- neon_composite_over_8888_0565 ( -- impl, op, -- src_image, mask_image, dst_image, -- src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y, -- (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height); -- } -- return; -- } -- -- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); -- PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); -- -- /* keep within minimum number of aligned quadwords on width -- * while also keeping the minimum number of columns to process -- */ -- { -- unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF; -- unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF; -- unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF; -- -- /* the fast copy should be quadword aligned */ -- copy_offset = dst_line - ((uint16_t*) aligned_left); -- aligned_line = dst_line - copy_offset; -- copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4); -- copy_tail = 0; -- -- if (aligned_right - aligned_left > ceiling_length) -- { -- /* unaligned routine is tightest */ -- kernel_count = (uint32_t) (ceiling_length >> 4); -- kernel_offset = copy_offset; -- } -- else -- { -- /* aligned routine is equally tight, so it is safer to align */ -- kernel_count = copy_count; -- kernel_offset = 0; -- } -- -- /* We should avoid reading beyond scanline ends for safety */ -- if (aligned_line < (dst_line - dest_x) || -- (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width)) -- { -- /* switch to precise read */ -- copy_offset = kernel_offset = 0; -- aligned_line = dst_line; -- kernel_count = (uint32_t) (ceiling_length >> 4); -- copy_count = (width * sizeof(*dst_line)) >> 4; -- copy_tail = (width * sizeof(*dst_line)) & 0xF; -- } -- } -- -- /* Preload the first input scanline */ -- { -- uint8_t *src_ptr = (uint8_t*) src_line; -- uint32_t count = (width + 15) / 16; -- --#ifdef USE_GCC_INLINE_ASM -- asm volatile ( -- "0: @ loop \n" -- " subs %[count], %[count], #1 \n" -- " pld [%[src]] \n" -- " add %[src], %[src], #64 \n" -- " bgt 0b \n" -- -- /* Clobbered input registers marked as input/outputs */ -- : [src] "+r" (src_ptr), [count] "+r" (count) -- : /* no unclobbered inputs */ -- : "cc" -- ); --#else -- do -- { -- __pld (src_ptr); -- src_ptr += 64; -- } -- while (--count); --#endif -- } -- -- { -- uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */ -- -- /* row-major order */ -- /* left edge, middle block, right edge */ -- for ( ; height--; src_line += src_stride, aligned_line += dst_stride) -- { -- /* Uncached framebuffer access is really, really slow if we do -- * it piecemeal. It should be much faster if we grab it all at -- * once. One scanline should easily fit in L1 cache, so this -- * should not waste RAM bandwidth. -- */ -- neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail); -- -- /* Apply the actual filter */ -- ARGB8_over_565_8_pix_neon ( -- src_line, scan_line + kernel_offset, -- src_stride * sizeof(*src_line), kernel_count); -- -- /* Copy the modified scanline back */ -- neon_quadword_copy (dst_line, -- scan_line + copy_offset, -- width >> 3, (width & 7) * 2); -- } -- } --} -- --#endif /* USE_GCC_INLINE_ASM */ -- - static const pixman_fast_path_t arm_neon_fast_path_array[] = - { - { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_8888_8_8, 0 }, -@@ -2612,12 +1908,6 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] = - #ifdef USE_GCC_INLINE_ASM - { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 }, - { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 }, --#if 0 /* this code has some bugs */ -- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_n_0565, 0 }, -- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_n_0565, 0 }, -- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 }, -- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 }, --#endif - #endif - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 }, -@@ -2668,79 +1958,6 @@ arm_neon_composite (pixman_implementation_t *imp, - } - - static pixman_bool_t --pixman_blt_neon (void *src_bits, -- void *dst_bits, -- int src_stride, -- int dst_stride, -- int src_bpp, -- int dst_bpp, -- int src_x, -- int src_y, -- int dst_x, -- int dst_y, -- int width, -- int height) --{ -- if (!width || !height) -- return TRUE; -- -- /* accelerate only straight copies involving complete bytes */ -- if (src_bpp != dst_bpp || (src_bpp & 7)) -- return FALSE; -- -- { -- uint32_t bytes_per_pixel = src_bpp >> 3; -- uint32_t byte_width = width * bytes_per_pixel; -- /* parameter is in words for some reason */ -- int32_t src_stride_bytes = src_stride * 4; -- int32_t dst_stride_bytes = dst_stride * 4; -- uint8_t *src_bytes = ((uint8_t*) src_bits) + -- src_y * src_stride_bytes + src_x * bytes_per_pixel; -- uint8_t *dst_bytes = ((uint8_t*) dst_bits) + -- dst_y * dst_stride_bytes + dst_x * bytes_per_pixel; -- uint32_t quadword_count = byte_width / 16; -- uint32_t offset = byte_width % 16; -- -- while (height--) -- { -- neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset); -- src_bytes += src_stride_bytes; -- dst_bytes += dst_stride_bytes; -- } -- } -- -- return TRUE; --} -- --static pixman_bool_t --arm_neon_blt (pixman_implementation_t *imp, -- uint32_t * src_bits, -- uint32_t * dst_bits, -- int src_stride, -- int dst_stride, -- int src_bpp, -- int dst_bpp, -- int src_x, -- int src_y, -- int dst_x, -- int dst_y, -- int width, -- int height) --{ -- if (pixman_blt_neon ( -- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, -- src_x, src_y, dst_x, dst_y, width, height)) -- { -- return TRUE; -- } -- -- return _pixman_implementation_blt ( -- imp->delegate, -- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, -- src_x, src_y, dst_x, dst_y, width, height); --} -- --static pixman_bool_t - arm_neon_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, -@@ -2765,9 +1982,6 @@ _pixman_implementation_create_arm_neon (void) - pixman_implementation_t *imp = _pixman_implementation_create (simd); - - imp->composite = arm_neon_composite; --#if 0 /* this code has some bugs */ -- imp->blt = arm_neon_blt; --#endif - imp->fill = arm_neon_fill; - - return imp; diff --git a/recipes/xorg-lib/pixman/src-8888-0565.patch b/recipes/xorg-lib/pixman/src-8888-0565.patch deleted file mode 100644 index c544225f65..0000000000 --- a/recipes/xorg-lib/pixman/src-8888-0565.patch +++ /dev/null @@ -1,324 +0,0 @@ -From 6494f9ae8820078d0e6109bf8f294156f7a5da4c Mon Sep 17 00:00:00 2001 -From: Siarhei Siamashka <siarhei.siamashka@nokia.com> -Date: Fri, 05 Mar 2010 00:40:34 +0000 -Subject: ARM: added 'armv6_composite_src_8888_0565' fast path - -Provides ~3x performance improvement when working with -data in L1 cache, and ~80% performace improvement when working -with memory. This fast path is important for 32bpp -> 16bpp -color format conversion and is commonly used with 16bpp desktop. - -Microbenchmark from N800 (ARM11 @ 400MHz), measured in MPix/s: - -before: - - src_8888_0565 = L1: 21.54 M: 15.62 - -after (armv4): - - src_8888_0565 = L1: 45.26 M: 23.29 - -after (armv6): - - src_8888_0565 = L1: 60.62 M: 28.37 ---- -diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c -index c375c01..69243c1 100644 ---- a/pixman/pixman-arm-simd.c -+++ b/pixman/pixman-arm-simd.c -@@ -604,6 +604,282 @@ armv6_composite_over_n_8_0565 (pixman_implementation_t * impl, - dst_stride - width, mask_stride - width, height); - } - -+static inline void -+armv4_composite_src_8888_0565_asm ( -+ uint16_t *dst, uint32_t *src, int w, int dst_stride, -+ int src_stride, int h) -+{ -+ uint32_t a, x, y, c1F001F = 0x1F001F, cFFFF = 0xFFFF; -+ int backup_w = w; -+ while (h--) -+ { -+ w = backup_w; -+ if (w > 0 && (uintptr_t)dst & 2) -+ { -+ x = *src++; -+ -+ a = (x >> 3) & c1F001F; -+ x &= 0xFC00; -+ a |= a >> 5; -+ a |= x >> 5; -+ -+ *dst++ = a; -+ w--; -+ } -+ -+ asm volatile( -+ "subs %[w], %[w], #2\n" -+ "blt 2f\n" -+ "1:\n" -+ "ldr %[x], [%[src]], #4\n" -+ "ldr %[y], [%[src]], #4\n" -+ "subs %[w], %[w], #2\n" -+ -+ "and %[a], %[c1F001F], %[x], lsr #3\n" -+ "and %[x], %[x], #0xFC00\n\n" -+ "orr %[a], %[a], %[a], lsr #5\n" -+ "orr %[x], %[a], %[x], lsr #5\n" -+ -+ "and %[a], %[c1F001F], %[y], lsr #3\n" -+ "and %[y], %[y], #0xFC00\n\n" -+ "orr %[a], %[a], %[a], lsr #5\n" -+ "orr %[y], %[a], %[y], lsr #5\n" -+ /* -+ * Writing single 32-bit value is much faster than two -+ * separate 16-bit values for older CPUs without (efficient) -+ * write combining, even though it costs an extra instruction. -+ */ -+ "and %[x], %[x], %[cFFFF]\n" -+ "orr %[x], %[x], %[y], lsl #16\n" -+ "str %[x], [%[dst]], #4\n" -+ "bge 1b\n" -+ "2:\n" -+ : [c1F001F] "+&r" (c1F001F), [cFFFF] "+&r" (cFFFF), -+ [src] "+&r" (src), [dst] "+&r" (dst), [a] "=&r" (a), -+ [x] "=&r" (x), [y] "=&r" (y), [w] "+&r" (w) -+ ); -+ -+ if (w & 1) -+ { -+ x = *src++; -+ -+ a = (x >> 3) & c1F001F; -+ x = x & 0xFC00; -+ a |= a >> 5; -+ a |= x >> 5; -+ -+ *dst++ = a; -+ } -+ -+ src += src_stride - backup_w; -+ dst += dst_stride - backup_w; -+ } -+} -+ -+/* -+ * Conversion x8r8g8b8 -> r5g6b5 -+ * -+ * Note: 'w' must be >= 7 here -+ */ -+static void __attribute__((naked)) -+armv6_composite_src_8888_0565_asm ( -+ uint16_t *dst, uint32_t *src, int w, int dst_stride, -+ int src_stride, int h) -+{ -+ asm volatile( -+ /* define supplementary macros */ -+ ".macro cvt8888to565 PIX\n" -+ "and A, C1F001F, \\PIX, lsr #3\n" -+ "and \\PIX, \\PIX, #0xFC00\n\n" -+ "orr A, A, A, lsr #5\n" -+ "orr \\PIX, A, \\PIX, lsr #5\n" -+ ".endm\n" -+ -+ ".macro combine_pixels_pair PIX1, PIX2\n" -+ /* Note: assume little endian byte order */ -+ "pkhbt \\PIX1, \\PIX1, \\PIX2, lsl #16\n" -+ ".endm\n" -+ -+ /* function entry, save all registers (10 words) to stack */ -+ "stmdb sp!, {r4-r11, ip, lr}\n" -+ -+ /* define some aliases */ -+ "DST .req r0\n" -+ "SRC .req r1\n" -+ "W .req r2\n" -+ "H .req r3\n" -+ -+ "TMP1 .req r4\n" -+ "TMP2 .req r5\n" -+ "TMP3 .req r6\n" -+ "TMP4 .req r7\n" -+ "TMP5 .req r8\n" -+ "TMP6 .req r9\n" -+ "TMP7 .req r10\n" -+ "TMP8 .req r11\n" -+ -+ "C1F001F .req ip\n" -+ "A .req lr\n" -+ -+ "ldr TMP1, [sp, #(10*4+0)]\n" /* load src_stride */ -+ "ldr C1F001F, =0x1F001F\n" -+ "sub r3, r3, W\n" -+ "str r3, [sp, #(10*4+0)]\n" /* store (dst_stride-w) */ -+ "ldr r3, [sp, #(10*4+4)]\n" /* load h */ -+ "sub TMP1, TMP1, W\n" -+ "str TMP1, [sp, #(10*4+4)]\n" /* store (src_stride-w) */ -+ -+ "str W, [sp, #(8*4)]\n" /* saved ip = W */ -+ -+ "0:\n" -+ "subs H, H, #1\n" -+ "blt 6f\n" -+ "1:\n" -+ /* align DST at 4 byte boundary */ -+ "tst DST, #2\n" -+ "beq 2f\n" -+ "ldr TMP1, [SRC], #4\n" -+ "sub W, W, #1\n" -+ "cvt8888to565 TMP1\n" -+ "strh TMP1, [DST], #2\n" -+ "2:" -+ /* align DST at 8 byte boundary */ -+ "tst DST, #4\n" -+ "beq 2f\n" -+ "ldmia SRC!, {TMP1, TMP2}\n" -+ "sub W, W, #2\n" -+ "cvt8888to565 TMP1\n" -+ "cvt8888to565 TMP2\n" -+ "combine_pixels_pair TMP1, TMP2\n" -+ "str TMP1, [DST], #4\n" -+ "2:" -+ /* align DST at 16 byte boundary */ -+ "tst DST, #8\n" -+ "beq 2f\n" -+ "ldmia SRC!, {TMP1, TMP2, TMP3, TMP4}\n" -+ "sub W, W, #4\n" -+ "cvt8888to565 TMP1\n" -+ "cvt8888to565 TMP2\n" -+ "cvt8888to565 TMP3\n" -+ "cvt8888to565 TMP4\n" -+ "combine_pixels_pair TMP1, TMP2\n" -+ "combine_pixels_pair TMP3, TMP4\n" -+ "stmia DST!, {TMP1, TMP3}\n" -+ "2:" -+ /* inner loop, process 8 pixels per iteration */ -+ "subs W, W, #8\n" -+ "blt 4f\n" -+ "3:\n" -+ "ldmia SRC!, {TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8}\n" -+ "subs W, W, #8\n" -+ "cvt8888to565 TMP1\n" -+ "cvt8888to565 TMP2\n" -+ "cvt8888to565 TMP3\n" -+ "cvt8888to565 TMP4\n" -+ "cvt8888to565 TMP5\n" -+ "cvt8888to565 TMP6\n" -+ "cvt8888to565 TMP7\n" -+ "cvt8888to565 TMP8\n" -+ "combine_pixels_pair TMP1, TMP2\n" -+ "combine_pixels_pair TMP3, TMP4\n" -+ "combine_pixels_pair TMP5, TMP6\n" -+ "combine_pixels_pair TMP7, TMP8\n" -+ "stmia DST!, {TMP1, TMP3, TMP5, TMP7}\n" -+ "bge 3b\n" -+ "4:\n" -+ -+ /* process the remaining pixels */ -+ "tst W, #4\n" -+ "beq 4f\n" -+ "ldmia SRC!, {TMP1, TMP2, TMP3, TMP4}\n" -+ "cvt8888to565 TMP1\n" -+ "cvt8888to565 TMP2\n" -+ "cvt8888to565 TMP3\n" -+ "cvt8888to565 TMP4\n" -+ "combine_pixels_pair TMP1, TMP2\n" -+ "combine_pixels_pair TMP3, TMP4\n" -+ "stmia DST!, {TMP1, TMP3}\n" -+ "4:\n" -+ "tst W, #2\n" -+ "beq 4f\n" -+ "ldmia SRC!, {TMP1, TMP2}\n" -+ "cvt8888to565 TMP1\n" -+ "cvt8888to565 TMP2\n" -+ "combine_pixels_pair TMP1, TMP2\n" -+ "str TMP1, [DST], #4\n" -+ "4:\n" -+ "tst W, #1\n" -+ "beq 4f\n" -+ "ldr TMP1, [SRC], #4\n" -+ "cvt8888to565 TMP1\n" -+ "strh TMP1, [DST], #2\n" -+ "4:\n" -+ "ldr TMP1, [sp, #(10*4+0)]\n" /* (dst_stride-w) */ -+ "ldr TMP2, [sp, #(10*4+4)]\n" /* (src_stride-w) */ -+ "ldr W, [sp, #(8*4)]\n" -+ "subs H, H, #1\n" -+ "add DST, DST, TMP1, lsl #1\n" -+ "add SRC, SRC, TMP2, lsl #2\n" -+ "bge 1b\n" -+ "6:\n" -+ /* restore all registers and return */ -+ "ldmia sp!, {r4-r11, ip, pc}\n" -+ ".ltorg\n" -+ -+ ".unreq DST\n" -+ ".unreq SRC\n" -+ ".unreq W\n" -+ ".unreq H\n" -+ -+ ".unreq TMP1\n" -+ ".unreq TMP2\n" -+ ".unreq TMP3\n" -+ ".unreq TMP4\n" -+ ".unreq TMP5\n" -+ ".unreq TMP6\n" -+ ".unreq TMP7\n" -+ ".unreq TMP8\n" -+ -+ ".unreq C1F001F\n" -+ ".unreq A\n" -+ -+ ".purgem cvt8888to565\n" -+ ".purgem combine_pixels_pair\n" -+ ); -+} -+ -+static void -+armv6_composite_src_8888_0565 (pixman_implementation_t * impl, -+ pixman_op_t op, -+ pixman_image_t * src_image, -+ pixman_image_t * mask_image, -+ pixman_image_t * dst_image, -+ int32_t src_x, -+ int32_t src_y, -+ int32_t mask_x, -+ int32_t mask_y, -+ int32_t dest_x, -+ int32_t dest_y, -+ int32_t width, -+ int32_t height) -+{ -+ uint32_t *src; -+ uint16_t *dst; -+ int src_stride, dst_stride; -+ -+ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, -+ dst_stride, dst, 1); -+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, -+ src_stride, src, 1); -+ -+ if (width < 7) -+ armv4_composite_src_8888_0565_asm (dst, src, width, -+ dst_stride, src_stride, height); -+ else -+ armv6_composite_src_8888_0565_asm (dst, src, width, -+ dst_stride, src_stride, height); -+} -+ - #endif - - static const pixman_fast_path_t arm_simd_fast_paths[] = -@@ -624,6 +900,10 @@ static const pixman_fast_path_t arm_simd_fast_paths[] = - #if defined(__ARM_EABI__) && defined(__linux__) - PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, armv6_composite_over_n_8_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, armv6_composite_over_n_8_0565), -+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, armv6_composite_src_8888_0565), -+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, armv6_composite_src_8888_0565), -+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, armv6_composite_src_8888_0565), -+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, armv6_composite_src_8888_0565), - #endif - { PIXMAN_OP_NONE }, - }; --- -cgit v0.8.3-6-g21f6 diff --git a/recipes/xorg-lib/pixman/tls.patch b/recipes/xorg-lib/pixman/tls.patch deleted file mode 100644 index 316caed65f..0000000000 --- a/recipes/xorg-lib/pixman/tls.patch +++ /dev/null @@ -1,59 +0,0 @@ -From 714559dccda3165a72f0a9935c1edc3aef535f30 Mon Sep 17 00:00:00 2001 -From: Søren Sandmann Pedersen <ssp@redhat.com> -Date: Wed, 07 Apr 2010 05:44:12 +0000 -Subject: Fixes for pthread thread local storage. - -The tls_name_key variable is passed to tls_name_get(), and the first -time this happens it isn't initialized. tls_name_get() then passes it -on to tls_name_alloc() which passes it on to pthread_setspecific() -leading to undefined behavior. - -None of this is actually necessary at all because there is only one -such variable per thread local variable, so it doesn't need to passed -as a parameter at all. - -All of this was pointed out by Tor Lillqvist on the cairo mailing -list. ---- -diff --git a/pixman/pixman-compiler.h b/pixman/pixman-compiler.h -index cdac0d8..531c8c9 100644 ---- a/pixman/pixman-compiler.h -+++ b/pixman/pixman-compiler.h -@@ -99,16 +99,16 @@ - } \ - \ - static type * \ -- tls_ ## name ## _alloc (key) \ -+ tls_ ## name ## _alloc (void) \ - { \ - type *value = calloc (1, sizeof (type)); \ - if (value) \ -- pthread_setspecific (key, value); \ -+ pthread_setspecific (tls_ ## name ## _key, value); \ - return value; \ - } \ - \ - static force_inline type * \ -- tls_ ## name ## _get (key) \ -+ tls_ ## name ## _get (void) \ - { \ - type *value = NULL; \ - if (pthread_once (&tls_ ## name ## _once_control, \ -@@ -116,13 +116,13 @@ - { \ - value = pthread_getspecific (tls_ ## name ## _key); \ - if (!value) \ -- value = tls_ ## name ## _alloc (key); \ -+ value = tls_ ## name ## _alloc (); \ - } \ - return value; \ - } - - # define PIXMAN_GET_THREAD_LOCAL(name) \ -- tls_ ## name ## _get (tls_ ## name ## _key) -+ tls_ ## name ## _get () - - #else - --- -cgit v0.8.3-6-g21f6 |