| author | Frans Meulenbroeks <fransmeulenbroeks@gmail.com> | 2010-10-10 13:08:14 +0200 |
|---|---|---|
| committer | Frans Meulenbroeks <fransmeulenbroeks@gmail.com> | 2010-10-10 13:08:14 +0200 |
| commit | 067e6a9f69e91cf1d1fe46c972189791da1a8a7a (patch) | |
| tree | 5055c1acdf03f0eb1a26a2344ab0cb00f9cc475f /recipes/xorg-driver | |
| parent | 65abd0ee8631cad3fcd984a56bbcafb084cbbb54 (diff) | |
| download | openembedded-067e6a9f69e91cf1d1fe46c972189791da1a8a7a.tar.gz | |
xorg-driver : moved unused files to obsolete dir
Signed-off-by: Frans Meulenbroeks <fransmeulenbroeks@gmail.com>
Diffstat (limited to 'recipes/xorg-driver')
4 files changed, 0 insertions, 3075 deletions
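For context on the first deleted file below: judging by its name and the hunk it carries, dontfloodevents006.patch kept xf86-input-tslib from posting a button event for every raw pressure fluctuation by comparing boolean press state instead of raw pressure values. A minimal standalone sketch of that comparison idiom (the struct and function names here are hypothetical, not the driver's):

```c
#include <stdio.h>

/* Hypothetical stand-in for the driver's per-device record. */
struct tslib_priv { int lastp; /* last reported pressure */ };

static void handle_sample(struct tslib_priv *priv, int pressure)
{
    /* !!x collapses any non-zero pressure to 1, so only genuine
     * press/release transitions compare as different; 80 -> 95 is
     * "still pressed" and posts no event. */
    if (!!priv->lastp != !!pressure) {
        priv->lastp = pressure;
        printf("button %s\n", pressure ? "press" : "release");
    }
}

int main(void)
{
    struct tslib_priv priv = { 0 };
    handle_sample(&priv, 80);  /* press */
    handle_sample(&priv, 95);  /* no event: still pressed */
    handle_sample(&priv, 0);   /* release */
    return 0;
}
```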
diff --git a/recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch b/recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch
deleted file mode 100644
index e989717d3b..0000000000
--- a/recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-Index: xf86-input-tslib-0.0.6/src/tslib.c
-===================================================================
---- xf86-input-tslib-0.0.6.orig/src/tslib.c	2009-11-29 20:03:29.734794324 +0000
-+++ xf86-input-tslib-0.0.6/src/tslib.c	2009-11-29 20:29:24.066794215 +0000
-@@ -205,7 +205,7 @@
- 	 */
- 	switch (priv->state) {
- 	case BUTTON_EMULATION_OFF :
--		if(priv->lastp != samp.pressure) {
-+		if(!!priv->lastp != !!samp.pressure) {
- 			priv->lastp = samp.pressure;
- 			xf86PostButtonEvent(local->dev, TRUE,
- 					1, !!samp.pressure, 0, 2,
-@@ -512,7 +512,7 @@
- 	s = xf86CheckStrOption(dev->commonOptions, "path", NULL);
- 	if (!s)
- 		s = xf86CheckStrOption(dev->commonOptions, "Device", NULL);
--
-+
- 	priv->ts = ts_open(s, 1);
- 	xfree(s);
-
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
deleted file mode 100644
index c0aa92e76a..0000000000
--- a/recipes/xorg-driver/xf86-video-msm/no_neon.patch
+++ /dev/null
@@ -1,2901 +0,0 @@
-commit d8910bf773fbecf7cdea359d4b530a3672e27180
-Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
-Date:   Wed Feb 10 16:18:39 2010 +0100
-
-    Removed neon because its not available in our kernel
-    and so its causing trubble (Illegal instruction)
-
-diff --git git/src/msm-swblits.h git/src/msm-swblits.h
-index f89f00e..a40b24b 100755
---- git/src/msm-swblits.h
-+++ git/src/msm-swblits.h
-@@ -38,16 +38,6 @@
- #include <stdint.h>
- #include <stdlib.h>
-
--/* Neon intrinsics are part of the ARM or GCC compiler used. */
--/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */
--#include <arm_neon.h>
--
--/* These are NEON-optimized functions linked to by various tests. */
--extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes);
--extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes);
--extern void memset16(uint16_t *dst, uint16_t value, int count);
--extern void memset32(uint32_t *dst, uint32_t value, int count);
--
- /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */
- #define BITS_PER_BYTE (8)
- #define BYTES_PER_16BPP_PIXEL (2)
-diff --git git/src/msm-swfill.c git/src/msm-swfill.c
-index 108fd94..3dd1ef2 100755
---- git/src/msm-swfill.c
-+++ git/src/msm-swfill.c
-@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou
- 	}
- }
-
--
-+/*
- static inline void
- memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
- {
-@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
- 	// Quickly fill remaining pixels (up to 7).
- 	memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
- }
--
-+*/
-
- static inline void
- memset16_Test(uint16_t *dst, uint16_t src, int count)
-@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count)
-
- 		// Copy remaining pixels using Neon and non-Neon instructions.
- 		// NOTE: This assumes that dst is aligned optimally for Neon instructions.
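A note on the substitution this hunk continues into (memset16_AssumesNeonAlignment replaced by plain memset; memset32 gets the same treatment further down): C's memset fills bytes, while the removed helpers filled 16- and 32-bit words, so the two only coincide when every byte of the fill value is identical (0x0000, 0xFFFF, ...), and the count semantics may differ as well. A small standalone illustration, with fill16 as a hypothetical portable stand-in for the removed NEON memset16:

```c
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Portable equivalent of the removed memset16(): writes `count`
 * 16-bit values rather than `count` bytes. */
static void fill16(uint16_t *dst, uint16_t value, int count)
{
    for (int i = 0; i < count; i++)
        dst[i] = value;
}

int main(void)
{
    uint16_t a[4], b[4];
    int v = 0xF800;              /* red in RGB565 */

    fill16(a, (uint16_t)v, 4);   /* four red pixels: f800 f800 ... */
    memset(b, v, sizeof b);      /* memset truncates the value to one
                                    byte (0x00) and fills bytewise */

    printf("fill16: %04x  memset: %04x\n", a[0], b[0]); /* f800 vs 0000 */
    return 0;
}
```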
-- memset16_AssumesNeonAlignment((void *) dst, src, count); -+ //memset16_AssumesNeonAlignment((void *) dst, src, count); -+ memset((void *) dst, src, count); - } - } - -@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp - if (w < 32) { - // For narrow rectangles, block signals only once for the entire rectangles. - BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); -- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); -+ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); -+ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); - UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); - } - else { - // For wider rectangles, block and unblock signals for every row. -- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); - } - } - -diff --git git/src/msm-swrender.c git/src/msm-swrender.c -index a7a9abc..835dc03 100755 ---- git/src/msm-swrender.c -+++ git/src/msm-swrender.c -@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src - } - } - break; -- case 7: if (xdir >= 0) { -- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); -- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); -- return TRUE; -- } else { -- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); -- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); -- return TRUE; -- } -- break; -- case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { -- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); -- return TRUE; -- } -- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { -- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); -- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); -- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; -- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; -- return TRUE; -- } -- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { -- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); -- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); -- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); -- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); -- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; -- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; -- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; -- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; -- return TRUE; -- } -- else { -- uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); -- uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T); -- uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T); -- uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T); -- uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T); -- uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T); -- uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T); -- uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T); -- *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; -- *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2; -- 
*(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3; -- *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4; -- *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5; -- *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6; -- *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7; -- *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8; -- return TRUE; -- } -- break; -- case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { -- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); -- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); -- return TRUE; -- } -- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { -- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); -- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); -- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); -- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); -- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; -- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; -- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; -- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; -- return TRUE; -- } -- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { -- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); -- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); -- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); -- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); -- uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T); -- uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T); -- uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T); -- uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T); -- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; -- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; -- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; -- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; -- *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5; -- *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6; -- *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7; -- *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8; -- return TRUE; -- } -- else { -- // Don't bother unrolling loops here, since that won't help for more than around 8 operations. -- // Instead, just call multiple fixed functions. 
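The comment closing this span describes the pattern used throughout these deleted fast paths: a width with no dedicated code path is decomposed into widths that have one, and xdir fixes the order of the sub-copies so overlapping screen-to-screen blits read source pixels before overwriting them. A minimal sketch of that idea under those assumptions (copy8/copy16 are hypothetical stand-ins for the driver's fixed-width routines):

```c
#include <stdint.h>
#include <string.h>

/* Fixed-width fast path (hypothetical): copies 8 16bpp pixels. */
static void copy8(uint16_t *dst, const uint16_t *src)
{
    memmove(dst, src, 8 * sizeof *dst);
}

/* Width 16 = two width-8 copies; xdir (the blit direction) decides
 * which half goes first so overlapping copies stay correct. */
static void copy16(uint16_t *dst, const uint16_t *src, int xdir)
{
    if (xdir >= 0) {            /* walking forward: low half first */
        copy8(dst, src);
        copy8(dst + 8, src + 8);
    } else {                    /* walking backward: high half first */
        copy8(dst + 8, src + 8);
        copy8(dst, src);
    }
}

int main(void)
{
    uint16_t src[16], dst[16];
    for (int i = 0; i < 16; i++) src[i] = (uint16_t)i;
    copy16(dst, src, +1);
    return dst[15] == 15 ? 0 : 1;
}
```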
-- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); -- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); -- } else { -- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); -- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); -- } -- return TRUE; -- } -- break; -- case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { -- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); -- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); -- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); -- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); -- return TRUE; -- } -- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { -- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); -- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); -- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); -- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); -- uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T); -- uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T); -- uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T); -- uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T); -- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; -- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; -- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; -- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; -- *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5; -- *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6; -- *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7; -- *(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8; -- return TRUE; -- } -- break; -- case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { -- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); -- uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T)); -- uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T)); -- uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T)); -- uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); -- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); -- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); -- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); -- vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5); -- vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6); -- vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7); -- vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8); -- return TRUE; -- } -- break; - } - - return FALSE; -@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr - } - return TRUE; - break; -- case 7: if (xdir >= 0) { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); -- 
swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); -- } -- return TRUE; -- break; -- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); -- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); -- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; -- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; -- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; -- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); -- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; -- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; -- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; -- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; -- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; -- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; -- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; -- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { -- uint16_t src1a = *(uint16_t *) (src+0*spitch+0); -- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- uint16_t src1b = *(uint16_t *) (src+1*spitch+0); -- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint16_t src5b = *(uint16_t *) 
(src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- *(uint16_t *) (dst+0*dpitch+0) = src1a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; -- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; -- *(uint16_t *) (dst+1*dpitch+0) = src1b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; -- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; -- return TRUE; -- } -- else { -- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); -- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); -- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); -- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); -- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); -- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); -- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); -- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); -- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); -- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); -- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); -- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); -- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); -- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); -- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); -- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); -- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; -- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; -- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; -- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; -- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; -- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; -- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; -- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; -- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; -- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; -- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; -- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; -- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; -- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; -- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; -- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; -- return TRUE; -- } -- break; -- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); -- vst1q_u32((uint32_t 
*)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); -- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); -- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); -- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); -- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); -- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; -- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; -- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; -- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; -- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; -- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; -- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; -- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) { -- uint32_t src1a = *(uint32_t *) (src+0*spitch+0); -- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); -- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); -- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); -- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); -- uint32_t src1b = *(uint32_t *) (src+1*spitch+0); -- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); -- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); -- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); -- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); -- *(uint32_t *) (dst+0*dpitch+0) = src1a; -- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; -- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; -- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; -- *(uint32_t *) (dst+1*dpitch+0) = src1b; -- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; -- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; -- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); -- uint32_t src6a 
= *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); -- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); -- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); -- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); -- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); -- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); -- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); -- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; -- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; -- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; -- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; -- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; -- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; -- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; -- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; -- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; -- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; -- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; -- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; -- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; -- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; -- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; -- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; -- return TRUE; -- } -- else { -- // Don't bother unrolling loops, since that won't help for more than around 8 operations. -- // Instead, just call multiple fixed functions. 
-- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); -- } -- return TRUE; -- } -- break; -- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); -- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { -- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); -- } -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); -- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); -- uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T); -- uint64_t src6a = *(uint64_t *) 
(src+0*spitch+5*BYTES_PER_UINT64_T); -- uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T); -- uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T); -- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); -- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); -- uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T); -- uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T); -- uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T); -- uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T); -- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; -- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; -- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; -- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; -- *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a; -- *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a; -- *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a; -- *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a; -- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; -- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; -- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; -- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; -- *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b; -- *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b; -- *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b; -- *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { -- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); -- } -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { -- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + 
(0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); -- } -- return TRUE; -- } -- else { -- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- } -- return TRUE; -- } -- break; -- case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); -- uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T)); -- uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T)); -- uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T)); -- uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); -- uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T)); -- uint32x4_t src6b = vld1q_u32((uint32_t 
*)(src+1*spitch+5*BYTES_PER_UINT32X4_T)); -- uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T)); -- uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a); -- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b); -- return TRUE; -- }//HERE -- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { -- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); -- } -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { -- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- 
swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); -- } -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { -- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, 
xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); -- } -- return TRUE; -- } -- break; - } - - return FALSE; -@@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr - } - return TRUE; - break; -- case 7: if (xdir >= 0) { -- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); -- } -- return TRUE; -- break; -- // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons? -- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases... 
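The fixed-width cases above (and the TODO about half-aligned variants) are all instances of one dispatch: test what alignment dst and src share, use the widest load/store that qualifies, and fall back to narrower accesses otherwise. A condensed sketch of the idea for one 16-pixel row at 16bpp, with aligned_to() as a hypothetical analogue of the driver's SW_CHECK_ALIGNMENT macro:

```c
#include <stdint.h>
#include <stddef.h>

/* True when both pointers are aligned to `bytes` (a power of two). */
static int aligned_to(size_t bytes, const void *dst, const void *src)
{
    return (((uintptr_t)dst | (uintptr_t)src) & (bytes - 1)) == 0;
}

/* Copy 16 16bpp pixels (32 bytes) using the widest access both
 * pointers permit, mirroring the deleted driver code's style. */
static void copy_row_16px(uint8_t *dst, const uint8_t *src)
{
    if (aligned_to(8, dst, src)) {          /* double-word path */
        for (int i = 0; i < 4; i++)
            ((uint64_t *)dst)[i] = ((const uint64_t *)src)[i];
    } else if (aligned_to(4, dst, src)) {   /* word path */
        for (int i = 0; i < 8; i++)
            ((uint32_t *)dst)[i] = ((const uint32_t *)src)[i];
    } else {                                /* 16-bit fallback */
        for (int i = 0; i < 16; i++)
            ((uint16_t *)dst)[i] = ((const uint16_t *)src)[i];
    }
}
```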
-- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); -- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); -- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); -- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T); -- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; -- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; -- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; -- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; -- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; -- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c; -- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; -- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T); -- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; -- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; -- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; -- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; -- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; -- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; -- *(uint32_t *) 
(dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; -- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; -- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; -- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; -- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c; -- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c; -- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; -- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; -- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d; -- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { -- uint16_t src1a = *(uint16_t *) (src+0*spitch+0); -- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- uint16_t src1b = *(uint16_t *) (src+1*spitch+0); -- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- uint16_t src1c = *(uint16_t *) (src+2*spitch+0); -- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- uint16_t src1d = *(uint16_t *) (src+3*spitch+0); -- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- *(uint16_t *) (dst+0*dpitch+0) = src1a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; -- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; -- *(uint16_t *) (dst+1*dpitch+0) = src1b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; -- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; -- *(uint16_t *) (dst+2*dpitch+0) = src1c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c; -- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c; -- *(uint16_t *) (dst+3*dpitch+0) = src1d; -- *(uint32_t *) 
(dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d; -- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d; -- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d; -- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d; -- return TRUE; -- } -- else { -- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); -- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); -- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); -- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); -- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); -- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); -- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); -- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); -- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); -- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); -- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); -- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); -- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); -- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); -- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); -- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); -- uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T); -- uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T); -- uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T); -- uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T); -- uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T); -- uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T); -- uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T); -- uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T); -- uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T); -- uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T); -- uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T); -- uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T); -- uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T); -- uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T); -- uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T); -- uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T); -- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; -- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; -- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; -- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; -- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; -- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; -- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; -- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; -- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; -- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; -- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; -- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; -- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; -- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; -- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; -- *(uint16_t 
*) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; -- *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c; -- *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c; -- *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c; -- *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T) = src4c; -- *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T) = src5c; -- *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T) = src6c; -- *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T) = src7c; -- *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T) = src8c; -- *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d; -- *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d; -- *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d; -- *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T) = src4d; -- *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T) = src5d; -- *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T) = src6d; -- *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T) = src7d; -- *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T) = src8d; -- return TRUE; -- } -- break; -- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); -- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); -- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); -- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c); -- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); -- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d); -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); -- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); -- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); -- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); -- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T); -- uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T); -- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); -- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T); -- uint64_t src3d = 
*(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T); -- uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T); -- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; -- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; -- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; -- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; -- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; -- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; -- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; -- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; -- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; -- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c; -- *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T) = src3c; -- *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T) = src4c; -- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; -- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d; -- *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T) = src3d; -- *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T) = src4d; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { -- uint32_t src1a = *(uint32_t *) (src+0*spitch+0); -- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); -- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); -- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); -- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); -- uint32_t src1b = *(uint32_t *) (src+1*spitch+0); -- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); -- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); -- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); -- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); -- uint32_t src1c = *(uint32_t *) (src+2*spitch+0); -- uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); -- uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); -- uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); -- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); -- uint32_t src1d = *(uint32_t *) (src+3*spitch+0); -- uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); -- uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); -- uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); -- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); -- *(uint32_t *) (dst+0*dpitch+0) = src1a; -- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; -- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; -- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; -- *(uint32_t *) (dst+1*dpitch+0) = src1b; -- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; -- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; -- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; 
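The removed fast paths here all follow one pattern: probe the strongest alignment that destination, source, and both row pitches satisfy, then copy with the widest loads and stores that alignment permits, falling back from 128-bit NEON vectors to 64-bit, 32-bit, and finally 16-bit accesses. A minimal C sketch of that dispatch for a single row, with a hypothetical aligned_to() helper standing in for SW_CHECK_PITCHED_ALIGNMENT (the real macro also folds the pitches and a byte offset into the check):

#include <stdint.h>

/* Hypothetical stand-in for SW_CHECK_PITCHED_ALIGNMENT: true when both
   pointers share the given power-of-two alignment. */
static int aligned_to(uintptr_t a, const void *d, const void *s)
{
    return (((uintptr_t)d | (uintptr_t)s) & (a - 1)) == 0;
}

/* Copy one row of n 16bpp pixels with the widest access the common
   alignment allows; the removed code unrolls this over four rows. */
static void copy_row_16bpp(uint8_t *d, const uint8_t *s, int n)
{
    if (aligned_to(8, d, s)) {
        for (; n >= 4; n -= 4, d += 8, s += 8)
            *(uint64_t *)d = *(const uint64_t *)s;   /* 4 pixels */
    } else if (aligned_to(4, d, s)) {
        for (; n >= 2; n -= 2, d += 4, s += 4)
            *(uint32_t *)d = *(const uint32_t *)s;   /* 2 pixels */
    }
    for (; n > 0; n--, d += 2, s += 2)
        *(uint16_t *)d = *(const uint16_t *)s;       /* odd tail */
}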
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; -- *(uint32_t *) (dst+2*dpitch+0) = src1c; -- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2c; -- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3c; -- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5c; -- *(uint32_t *) (dst+3*dpitch+0) = src1d; -- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2d; -- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3d; -- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4d; -- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5d; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); -- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); -- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); -- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); -- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); -- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); -- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); -- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); -- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T); -- uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T); -- uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T); -- uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T); -- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); -- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); -- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T); -- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T); -- uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T); -- uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T); -- uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T); -- uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T); -- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; -- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; -- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; -- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; -- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; -- *(uint32_t *) 
(dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; -- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; -- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; -- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; -- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; -- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; -- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; -- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; -- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; -- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; -- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; -- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; -- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; -- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c; -- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c; -- *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T) = src5c; -- *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T) = src6c; -- *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T) = src7c; -- *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T) = src8c; -- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; -- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; -- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d; -- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d; -- *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T) = src5d; -- *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T) = src6d; -- *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T) = src7d; -- *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T) = src8d; -- return TRUE; -- } -- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { -- uint16_t src1a = *(uint16_t *) (src+0*spitch+0); -- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); -- uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); -- uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); -- uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); -- uint16_t src1b = *(uint16_t *) (src+1*spitch+0); -- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); -- uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); -- uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); -- uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); -- uint16_t src1c = *(uint16_t *) (src+2*spitch+0); -- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3c = *(uint32_t *) 
(src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); -- uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); -- uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); -- uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); -- uint16_t src1d = *(uint16_t *) (src+3*spitch+0); -- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); -- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); -- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); -- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); -- uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); -- uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); -- uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); -- uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); -- *(uint16_t *) (dst+0*dpitch+0) = src1a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7a; -- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8a; -- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9a; -- *(uint16_t *) (dst+1*dpitch+0) = src1b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7b; -- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8b; -- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9b; -- *(uint16_t *) (dst+2*dpitch+0) = src1c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7c; -- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8c; -- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9c; -- *(uint16_t *) (dst+3*dpitch+0) = src1d; -- *(uint32_t *) 
(dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d; -- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d; -- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d; -- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d; -- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6d; -- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7d; -- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8d; -- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9d; -- return TRUE; -- } -- else { -- // Don't bother unrolling loops, since that won't help for more than around 8 operations. -- // Instead, just call multiple fixed functions. -- if (xdir >= 0) { -- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- } else { -- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); -- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); -- } -- return TRUE; -- } -- break; -- // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons? -- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here... -- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { -- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T)); -- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); -- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T)); -- uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T)); -- uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T)); -- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); -- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); -- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); -- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); -- vst1q_u32((uint32_t 
*)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); -- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); -- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c); -- vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c); -- vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c); -- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); -- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d); -- vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d); -- vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d); -- return TRUE; -- } -- break; -- } -+ } - - return FALSE; - } -@@ -1924,10 +872,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int - if (rowsOverlap) - { - if (w > 64) { -- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); - } - else if (w == 64) { -- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); - } - else { - DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP); -@@ -1936,10 +886,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int - else - { - if (w > 64) { -- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); - } - else if (w == 64) { -- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); -+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); - } - else { - DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP); -@@ -1973,7 +925,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i - if (xdir >= 0 || !rowsOverlap) { - if (w >= 128) { - BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); -- neon_memcpy(dst, src, w); -+ //neon_memcpy(dst, src, w); -+ memcpy(dst, src, w); - UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); - } - else -@@ -1982,7 +935,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i - else { - if (w >= 128) { - BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); -- neon_memmove(dst, src, w); -+ //neon_memmove(dst, src, w); -+ memmove(dst, src, w); - UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); - } - else -@@ -2029,7 +983,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, 
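The hunks above are the substance of the change: each wide-copy path keeps its width threshold and its signal-blocking wrappers but now routes to the libc routines, with memmove chosen whenever source and destination rows may alias and memcpy otherwise. A hedged sketch of the resulting per-row dispatch, with the DRAW_* macros reduced to a plain loop and names borrowed from the surrounding driver code:

#include <string.h>

/* Sketch only: copy h rows of w_bytes each, picking the overlap-safe
   routine when rows may alias, as the patched driver code now does. */
static void copy_rows(unsigned char *dst, unsigned char *src,
                      int w_bytes, int h, int dpitch, int spitch,
                      int rowsOverlap)
{
    int y;
    for (y = 0; y < h; y++) {
        if (rowsOverlap)
            memmove(dst, src, w_bytes);  /* rows may alias */
        else
            memcpy(dst, src, w_bytes);   /* disjoint rows  */
        dst += dpitch;
        src += spitch;
    }
}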
unsigned char *src, int w, int h, - if (xdir >= 0 || !rowsOverlap) { - if (w >= 42) { - BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); -- neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); -+ //neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); -+ memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); - UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); - } - else -@@ -2038,7 +993,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, - else { - if (w >= 42) { - BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); -- neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); -+ //neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); -+ memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); - UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); - } - else -diff --git git/src/neon_memcpy.S git/src/neon_memcpy.S -deleted file mode 100644 -index 5ecc5ce..0000000 ---- git/src/neon_memcpy.S -+++ /dev/null -@@ -1,549 +0,0 @@ --/*************************************************************************** -- Copyright (c) 2009, Code Aurora Forum. All rights reserved. -- -- Redistribution and use in source and binary forms, with or without -- modification, are permitted provided that the following conditions are met: -- * Redistributions of source code must retain the above copyright -- notice, this list of conditions and the following disclaimer. -- * Redistributions in binary form must reproduce the above copyright -- notice, this list of conditions and the following disclaimer in the -- documentation and/or other materials provided with the distribution. -- * Neither the name of Code Aurora nor the names of its contributors may -- be used to endorse or promote products derived from this software -- without specific prior written permission. -- -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -- POSSIBILITY OF SUCH DAMAGE. -- ***************************************************************************/ -- --/*************************************************************************** -- Neon memcpy: Attempts to do a memcpy with Neon registers if possible, -- Inputs: -- dest: The destination buffer -- src: The source buffer -- n: The size of the buffer to transfer -- Outputs: -- --***************************************************************************/ -- --/* -- * General note: -- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP -- * However, it looks like the 2006 CodeSourcery Assembler has issues generating -- * the correct object code for VPOP, resulting in horrific stack crashes. -- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB, -- * and VPOP->VLDMIA. We can revert this back once we update our toolchain. -- * -- * Also, VSHL swaps the source register and the shift-amount register -- * around in 2006-q3. 
I've coded this incorrectly so it turns out correct -- * in the object code, but we'll need to undo that later... -- */ -- -- .code 32 -- .align 4 -- .globl neon_memcpy -- .func -- --neon_memcpy: -- /* -- * First, make sure we're not copying < 4 bytes. If so, we'll -- * just handle it here. -- */ --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- stmdb sp!, {r0} --#else -- push {r0} --#endif -- cmp r2, #4 -- bgt neon_gt_4 -- /* Copy 0-4 bytes, if needed, and return.*/ -- cmp r2, #0 --neon_smallcopy_loop: -- beq neon_smallcopy_done -- ldrb r12, [r1], #1 -- subs r2, r2, #1 -- strb r12, [r0], #1 -- b neon_smallcopy_loop --neon_smallcopy_done: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- ldmia sp!, {r0} --#else -- pop {r0} --#endif -- bx lr -- -- /* Copy 4 or more bytes*/ --neon_gt_4: -- /* Preload what we can...*/ -- pld [r0,#0] -- pld [r1,#0] --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- stmdb sp!, {r4-r5} --#else -- push {r4-r5} --#endif -- --neon_check_align: -- /* Check normal word alignment for target. */ -- ands r12, r0, #0x3 -- beq source_alignment_check -- -- /* -- * Target is not aligned. Step through until we get that -- * word-aligned. This works better than a loop, according -- * to our pipeline modeler. -- */ -- cmp r12, #2 -- ldrb r3, [r1], #1 -- ldrleb r4, [r1], #1 -- ldrltb r5, [r1], #1 -- rsb r12, r12, #4 -- sub r2, r2, r12 -- strb r3, [r0], #1 -- strleb r4, [r0], #1 -- strltb r5, [r0], #1 -- --source_alignment_check: -- ands r12, r1, #0x3 -- bne neon_memcpy_nonaligned /* Source is not word aligned.*/ --neon_try_16_align: -- cmp r2, #64 -- blt neon_align_route -- /* This is where we try 16-byte alignment. */ -- ands r12, r0, #0xf -- beq neon_align_route -- rsb r12, r12, #16 --neon_16_start: -- sub r2, r2, r12 -- lsrs r3, r12, #2 --neon_align_16_4: -- ldr r4, [r1], #4 -- subs r3, r3, #1 -- str r4, [r0], #4 -- bne neon_align_16_4 --neon_align_route: -- /* In this case, both source and target are word-aligned. */ -- cmp r2, #32768 -- bge neon_copy_128p_a -- cmp r2, #256 -- bge neon_copy_128_a -- cmp r2, #64 -- bge neon_copy_32_a -- b neon_copy_finish_a -- nop --neon_copy_128p_a: -- /* We'll copy blocks 128-bytes at a time, but try to call pld to -- * load in the next page, if possible. -- */ --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4-q7} --#else -- vpush {q4-q7} --#endif -- mov r12, r2, lsr #7 --neon_copy_128p_loop_a: -- vld1.32 {q0, q1}, [r1]! -- vld1.32 {q2, q3}, [r1]! -- vld1.32 {q4, q5}, [r1]! -- vld1.32 {q6, q7}, [r1]! -- pld [r1, #0] -- pld [r1, #1024] -- vst1.32 {q0, q1}, [r0]! -- vst1.32 {q2, q3}, [r0]! -- vst1.32 {q4, q5}, [r0]! -- vst1.32 {q6, q7}, [r0]! -- subs r12, r12, #1 -- bne neon_copy_128p_loop_a --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q4-q7} --#else -- vpop {q4-q7} --#endif -- ands r2, r2, #0x7f -- beq neon_end -- cmp r2, #32 -- blt neon_copy_finish_a -- b neon_copy_32_a -- /* Copy blocks of 128-bytes (word-aligned) at a time*/ --neon_copy_128_a: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4-q7} --#else -- vpush {q4-q7} --#endif -- /* -- * Move to a 1-s based countdown to determine when to loop. That -- * allows the subs to set the Z flag without having to explicitly -- * call cmp to a value. 
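In C terms, the aligned path above reduces to: drain bytes until the destination reaches word alignment, run the large-block loop (128 bytes per iteration in NEON registers, with pld prefetching a cache line or, in the 128p variant, a page ahead), then finish with progressively narrower tails. A portable sketch of that shape, using memcpy as a stand-in for the vld1/vst1 block:

#include <stdint.h>
#include <string.h>

/* Shape of the removed aligned copy path, in portable C. */
void *blocky_memcpy(void *dest, const void *source, size_t n)
{
    uint8_t *d = dest;
    const uint8_t *s = source;

    while (n && ((uintptr_t)d & 3)) {   /* align destination to a word */
        *d++ = *s++;
        n--;
    }
    if (((uintptr_t)s & 3) == 0) {      /* source word-aligned too */
        while (n >= 128) {              /* the NEON block loop */
            memcpy(d, s, 128);
            d += 128; s += 128; n -= 128;
        }
        while (n >= 4) {                /* word tail */
            *(uint32_t *)d = *(const uint32_t *)s;
            d += 4; s += 4; n -= 4;
        }
    }
    while (n--)                         /* byte tail, also the */
        *d++ = *s++;                    /* unaligned fallback  */
    return dest;
}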
-- */ -- mov r12, r2, lsr #7 --neon_copy_128_loop_a: -- vld1.32 {q0, q1}, [r1]! -- vld1.32 {q2, q3}, [r1]! -- vld1.32 {q4, q5}, [r1]! -- vld1.32 {q6, q7}, [r1]! -- pld [r1, #0] -- pld [r1, #128] -- vst1.32 {q0, q1}, [r0]! -- vst1.32 {q2, q3}, [r0]! -- vst1.32 {q4, q5}, [r0]! -- vst1.32 {q6, q7}, [r0]! -- subs r12, r12, #1 -- pld [r0, #0] -- pld [r0, #128] -- bne neon_copy_128_loop_a --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q4-q7} --#else -- vpop {q4-q7} --#endif -- ands r2, r2, #0x7f -- beq neon_end -- cmp r2, #32 -- blt neon_copy_finish_a -- /* Copy blocks of 32-bytes (word aligned) at a time*/ --neon_copy_32_a: -- mov r12, r2, lsr #5 --neon_copy_32_loop_a: -- vld1.32 {q0,q1}, [r1]! -- subs r12, r12, #1 -- pld [r1,#0] -- vst1.32 {q0,q1}, [r0]! -- bne neon_copy_32_loop_a -- ands r2, r2, #0x1f -- beq neon_end --neon_copy_finish_a: --neon_copy_16_a: -- movs r12, r2, lsr #4 -- beq neon_copy_8_a --neon_copy_16_a_loop: -- vld1.32 {q0}, [r1]! -- subs r12, r12, #1 -- vst1.32 {q0}, [r0]! -- bne neon_copy_16_a_loop -- ands r2, r2, #0xf -- beq neon_end --neon_copy_8_a: -- cmp r2, #8 -- blt neon_copy_4_a -- ldm r1!, {r4-r5} -- subs r2, r2, #8 -- stm r0!, {r4-r5} -- /* Copy 4-bytes of word-aligned data at a time*/ --neon_copy_4_a: -- cmp r2, #4 -- blt neon_copy_finish -- ldr r4, [r1], #4 -- subs r2, r2, #4 -- str r4, [r0], #4 -- b neon_copy_finish -- -- /* -- * Handle unaligned data. The basic concept here is that we'll -- * try to pull out enough data from the source to get that word- -- * aligned, then do our writes word-aligned, storing the difference -- * in a register, and shifting the data as needed. -- */ --neon_memcpy_nonaligned: -- /* -- * If this is <8 bytes, it makes more sense to just copy it -- * quickly instead of incurring all kinds of overhead. -- */ -- cmp r2, #8 /* Let's try this...*/ -- ble neon_copy_finish -- /* -- * This is where we'll pull out either 1, 2, or 3 bytes of data -- * from the source as needed to align it, then store off those -- * bytes in r4. When we read in the (now) aligned data from the -- * source, we'll shift the bytes and AND in the r4 data, then write -- * to the target aligned. -- * -- * The conditional ldr calls work slightly faster than the -- * previous method, confirmed by our pipeline modeler. 
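The comment above describes the classic carry-register technique for misaligned sources: pull out the one to three bytes needed to word-align the source pointer, hold them in a register, then OR each aligned load against the carried-over bits while saving the new remainder for the next iteration. A little-endian C sketch, assuming the misalignment off is nonzero (the aligned case is dispatched separately, as in the assembly, and the real code bounds these reads with its byte tails):

#include <stdint.h>

/* Copy nwords 32-bit words to an aligned destination from a source that
   sits off bytes (1..3) past word alignment, splicing each pair of
   aligned loads together; mirrors the orr/lsl/lsr sequence above. */
static void shift_or_copy(uint32_t *d, const uint8_t *s, size_t nwords)
{
    unsigned off = (uintptr_t)s & 3;         /* 1..3 by assumption   */
    unsigned lo  = 8 * off;                  /* bits to discard low  */
    unsigned hi  = 32 - lo;                  /* bits carried forward */
    const uint32_t *sw = (const uint32_t *)(s - off);
    uint32_t carry = *sw++ >> lo;            /* prime the carry      */

    while (nwords--) {
        uint32_t w = *sw++;                  /* aligned load         */
        *d++ = carry | (w << hi);            /* splice with carry    */
        carry = w >> lo;                     /* remainder for next   */
    }
}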
-- */ --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- stmdb sp!, {r6-r9} --#else -- push {r6-r9} --#endif -- cmp r12, #2 -- ldrb r4, [r1], #1 -- ldrleb r5, [r1], #1 -- ldrltb r6, [r1], #1 -- rsb r8, r12, #4 -- sub r2, r2, r8 -- lsl r8, r8, #3 -- orrle r4, r4, r5, lsl #8 -- orrlt r4, r4, r6, lsl #16 -- rsb r9, r8, #32 -- -- cmp r2, #64 -- blt neon_unaligned_route -- ands r12, r0, #0xf -- beq neon_unaligned_route -- rsb r12, r12, #16 --neon_16_start_u: -- sub r2, r2, r12 -- lsrs r6, r12, #2 --neon_align_16_4_u: -- ldr r5, [r1], #4 -- subs r6, r6, #1 -- orr r4, r4, r5, lsl r8 -- str r4, [r0], #4 -- mov r4, r5, lsr r9 -- bne neon_align_16_4_u --neon_unaligned_route: -- /* Decide which loop block to branch to.*/ -- cmp r2, #256 -- bge neon_copy_64_u -- cmp r2, #64 -- bge neon_copy_32_u -- b neon_copy_finish_u -- /* Copy data in 64-byte blocks.*/ --neon_copy_64_u: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4} -- vstmdb sp!, {q5-q8} --#else -- vpush {q4} -- vpush {q5-q8} --#endif -- /* We'll need this for the q register shift later.*/ -- vdup.u32 q8, r8 -- /* -- * As above, we determine how many times we can go through the -- * 64-byte copy loop, then countdown. -- */ -- mov r12, r2, lsr #6 -- and r2, r2, #0x3f --neon_copy_64_u_loop: -- /* Load 64-bytes into q4-q7.*/ -- vld1.32 {q4, q5}, [r1]! -- vld1.32 {q6, q7}, [r1]! -- /* -- * Shift q0-q3 right so everything but the data we need due to the -- * alignment falls off the right-hand side. The branching -- * is needed, since vshr requires the shift to be an immediate -- * value. -- */ -- lsls r5, r8, #28 -- bcc neon_copy_64_u_b8 -- bpl neon_copy_64_u_b16 -- vshr.u64 q0, q4, #40 -- vshr.u64 q1, q5, #40 -- vshr.u64 q2, q6, #40 -- vshr.u64 q3, q7, #40 -- b neon_copy_64_unify --neon_copy_64_u_b8: -- vshr.u64 q0, q4, #56 -- vshr.u64 q1, q5, #56 -- vshr.u64 q2, q6, #56 -- vshr.u64 q3, q7, #56 -- b neon_copy_64_unify --neon_copy_64_u_b16: -- vshr.u64 q0, q4, #48 -- vshr.u64 q1, q5, #48 -- vshr.u64 q2, q6, #48 -- vshr.u64 q3, q7, #48 --neon_copy_64_unify: -- /* -- * Shift q4-q7 left by r8 bits to take the alignment into -- * account. -- */ --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vshl.u64 q4, q8, q4 -- vshl.u64 q5, q8, q5 -- vshl.u64 q6, q8, q6 -- vshl.u64 q7, q8, q7 --#else -- vshl.u64 q4, q4, q8 -- vshl.u64 q5, q5, q8 -- vshl.u64 q6, q6, q8 -- vshl.u64 q7, q7, q8 --#endif -- /* -- * The data in s14 will be needed for the next loop iteration. Move -- * that to r5. -- */ -- vmov r5, s14 -- /* We'll vorr the shifted data with the data that needs to move back.*/ -- vorr d9, d9, d0 -- /* Copy the data from the previous loop into s14.*/ -- vmov s14, r4 -- vorr d10, d10, d1 -- vorr d11, d11, d2 -- vorr d12, d12, d3 -- vorr d13, d13, d4 -- vorr d14, d14, d5 -- vorr d15, d15, d6 -- vorr d8, d8, d7 -- subs r12, r12, #1 -- pld [r1, #0] -- pld [r1, #128] -- /* Save off the r5 data into r4 for the next iteration.*/ -- mov r4, r5 -- vst1.32 {q4, q5}, [r0]! -- vst1.32 {q6, q7}, [r0]! 
-- pld [r0, #0] -- pld [r0, #128] -- bne neon_copy_64_u_loop --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q5-q8} -- vldmia sp!, {q4} --#else -- vpop {q5-q8} -- vpop {q4} --#endif -- cmp r2, #32 -- bge neon_copy_32_u -- b neon_copy_finish_u --neon_copy_32_u: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4} --#else -- vpush {q4} --#endif -- vdup.u32 q4, r8 -- mov r12, r2, lsr #5 -- and r2, r2, #0x1f --neon_copy_32_u_loop: -- vld1.32 {q0, q1}, [r1]! -- lsls r5, r8, #28 -- bcc neon_copy_32_u_b8 -- bpl neon_copy_32_u_b16 -- vshr.u64 q2, q0, #40 -- vshr.u64 q3, q1, #40 -- b neon_copy_32_unify --neon_copy_32_u_b8: -- vshr.u64 q2, q0, #56 -- vshr.u64 q3, q1, #56 -- b neon_copy_32_unify --neon_copy_32_u_b16: -- vshr.u64 q2, q0, #48 -- vshr.u64 q3, q1, #48 --neon_copy_32_unify: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vshl.u64 q0, q4, q0 -- vshl.u64 q1, q4, q1 --#else -- vshl.u64 q0, q0, q4 -- vshl.u64 q1, q1, q4 --#endif -- vmov r5, s14 -- vorr d1, d1, d4 -- vmov s14, r4 -- vorr d2, d2, d5 -- vorr d3, d3, d6 -- vorr d0, d0, d7 -- subs r12, r12, #1 -- pld [r1, #0] -- mov r4, r5 -- vst1.32 {q0, q1}, [r0]! -- bne neon_copy_32_u_loop --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q4} --#else -- vpop {q4} --#endif --neon_copy_finish_u: --neon_copy_16_u: -- movs r12, r2, lsr #4 -- beq neon_copy_8_u -- vdup.u32 q2, r8 -- and r2, r2, #0xf --neon_copy_16_u_loop: -- vld1.32 {q0}, [r1]! -- lsls r5, r8, #28 -- bcc neon_copy_16_u_b8 -- bpl neon_copy_16_u_b16 -- vshr.u64 q1, q0, #40 -- b neon_copy_16_unify --neon_copy_16_u_b8: -- vshr.u64 q1, q0, #56 -- b neon_copy_16_unify --neon_copy_16_u_b16: -- vshr.u64 q1, q0, #48 --neon_copy_16_unify: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vshl.u64 q0, q2, q0 --#else -- vshl.u64 q0, q0, q2 --#endif -- vmov r5, s6 -- vorr d1, d1, d2 -- vmov s6, r4 -- vorr d0, d0, d3 -- subs r12, r12, #1 -- mov r4, r5 -- vst1.32 {q0}, [r0]! -- bne neon_copy_16_u_loop --neon_copy_8_u: -- cmp r2, #8 -- blt neon_copy_4_u -- ldm r1!, {r6-r7} -- subs r2, r2, #8 -- orr r4, r4, r6, lsl r8 -- mov r5, r6, lsr r9 -- orr r5, r5, r7, lsl r8 -- stm r0!, {r4-r5} -- mov r4, r7, lsr r9 --neon_copy_4_u: -- cmp r2, #4 -- blt neon_copy_last_bits_u -- ldr r5, [r1], #4 -- subs r2, r2, #4 -- orr r4, r4, r5, lsl r8 -- str r4, [r0], #4 -- mov r4, r5, lsr r9 --neon_copy_last_bits_u: -- /* -- * Remember, r8 contains the size of the data in r4 in bits, -- * so to get to bytes we'll need to shift 3 places -- */ -- lsr r8, r8, #0x3 -- /* Write out the bytes stored in r4.*/ --neon_copy_last_bits_u_loop: -- strb r4, [r0], #1 -- subs r8, r8, #1 -- lsrne r4, r4, #8 -- bne neon_copy_last_bits_u_loop --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- ldmia sp!, {r6-r9} --#else -- pop {r6-r9} --#endif --neon_copy_finish: -- cmp r2, #0 -- beq neon_end -- /* -- * This just copies the data from source to target one byte -- * at a time. For some small values, this makes more sense. -- * Note that since this code copies data a byte at a time, -- * both the aligned and unaligned paths can use it. 
-- */ --neon_copy_finish_loop: -- ldrb r4, [r1], #1 -- subs r2, r2, #1 -- strb r4, [r0], #1 -- bne neon_copy_finish_loop --neon_end: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- ldmia sp!, {r4-r5} -- ldmia sp!, {r0} --#else -- pop {r4-r5} -- pop {r0} --#endif -- bx lr -- -- .endfunc -- .end -diff --git git/src/neon_memmove.S git/src/neon_memmove.S -deleted file mode 100644 -index 1bfe597..0000000 ---- git/src/neon_memmove.S -+++ /dev/null -@@ -1,939 +0,0 @@ --/*************************************************************************** -- Copyright (c) 2009, Code Aurora Forum. All rights reserved. -- -- Redistribution and use in source and binary forms, with or without -- modification, are permitted provided that the following conditions are met: -- * Redistributions of source code must retain the above copyright -- notice, this list of conditions and the following disclaimer. -- * Redistributions in binary form must reproduce the above copyright -- notice, this list of conditions and the following disclaimer in the -- documentation and/or other materials provided with the distribution. -- * Neither the name of Code Aurora nor the names of its contributors may -- be used to endorse or promote products derived from this software -- without specific prior written permission. -- -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -- POSSIBILITY OF SUCH DAMAGE. -- ***************************************************************************/ -- --/*************************************************************************** -- * Neon memmove: Attempts to do a memmove with Neon registers if possible, -- * Inputs: -- * dest: The destination buffer -- * src: The source buffer -- * n: The size of the buffer to transfer -- * Outputs: -- * -- ***************************************************************************/ -- --/* -- * General note: -- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP -- * However, it looks like the 2006 CodeSourcery Assembler has issues generating -- * the correct object code for VPOP, resulting in horrific stack crashes. -- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB, -- * and VPOP->VLDMIA. We can revert this back once we update our toolchain. -- * -- * Also, VSHL swaps the source register and the shift-amount register -- * around in 2006-q3. I've coded this incorrectly so it turns out correct -- * in the object code, but we'll need to undo that later... 
-- */ -- .code 32 -- .align 4 -- .globl neon_memmove -- .func -- --neon_memmove: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- stmdb sp!, {r0} --#else -- push {r0} --#endif -- -- /* -- * The requirements for memmove state that the function should -- * operate as if data were being copied from the source to a -- * buffer, then to the destination. This is to allow a user -- * to copy data from a source and target that overlap. -- * -- * We can't just do byte copies front-to-back automatically, since -- * there's a good chance we may have an overlap (why else would someone -- * intentionally use memmove then?). -- * -- * We'll break this into two parts. Front-to-back, or back-to-front -- * copies. -- */ --neon_memmove_cmf: -- cmp r0, r1 -- blt neon_front_to_back_copy -- bgt neon_back_to_front_copy -- b neon_memmove_done -- -- /* ############################################################# -- * Front to Back copy -- */ --neon_front_to_back_copy: -- /* -- * For small copies, just do a quick memcpy. We can do this for -- * front-to-back copies, aligned or unaligned, since we're only -- * doing 1 byte at a time... -- */ -- cmp r2, #4 -- bgt neon_f2b_gt4 -- cmp r2, #0 --neon_f2b_smallcopy_loop: -- beq neon_memmove_done -- ldrb r12, [r1], #1 -- subs r2, r2, #1 -- strb r12, [r0], #1 -- b neon_f2b_smallcopy_loop --neon_f2b_gt4: -- /* Preload what we can...*/ -- pld [r0,#0] -- pld [r1,#0] -- /* The window size is in r3. */ -- sub r3, r1, r0 --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- stmdb sp!, {r4-r6} --#else -- push {r4-r6} --#endif -- --neon_f2b_check_align: -- /* Check alignment. */ -- ands r12, r0, #0x3 -- beq neon_f2b_source_align_check -- cmp r12, #2 -- ldrb r4, [r1], #1 -- ldrleb r5, [r1], #1 -- ldrltb r6, [r1], #1 -- rsb r12, r12, #4 -- sub r2, r2, r12 -- strb r4, [r0], #1 -- strleb r5, [r0], #1 -- strltb r6, [r0], #1 -- --neon_f2b_source_align_check: -- ands r12, r1, #0x3 -- bne neon_f2b_nonaligned -- --neon_f2b_try_16_align: -- /* If we're >64, attempt to align on 16-bytes. Smaller amounts -- * don't seem to be worth handling. */ -- cmp r2, #64 -- blt neon_f2b_align_route -- /* This is where we try 16-byte alignment. */ -- ands r12, r0, #0xf -- beq neon_f2b_align_route -- rsb r12, r12, #16 --neon_f2b_16_start: -- sub r2, r2, r12 -- lsrs r5, r12, #2 --neon_f2b_align_16_4: -- ldr r4, [r1], #4 -- subs r5, r5, #1 -- str r4, [r0], #4 -- bne neon_f2b_align_16_4 --neon_f2b_align_route: -- /* ############################################################# -- * Front to Back copy - aligned -- */ -- /* -- * Note that we can't just route based on the size in r2. If that's -- * larger than the overlap window in r3, we could potentially -- * (and likely!) destroy data we're copying. -- */ -- cmp r2, r3 -- movle r12, r2 -- movgt r12, r3 -- cmp r12, #256 -- bge neon_f2b_copy_128_a -- cmp r12, #64 -- bge neon_f2b_copy_32_a -- cmp r12, #16 -- bge neon_f2b_copy_16_a -- cmp r12, #8 -- bge neon_f2b_copy_8_a -- cmp r12, #4 -- bge neon_f2b_copy_4_a -- b neon_f2b_copy_1_a --neon_f2b_copy_128_a: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4-q7} --#else -- vpush {q4-q7} --#endif -- mov r12, r2, lsr #7 --neon_f2b_copy_128_a_loop: -- vld1.32 {q0,q1}, [r1]! -- vld1.32 {q2,q3}, [r1]! -- vld1.32 {q4,q5}, [r1]! -- vld1.32 {q6,q7}, [r1]! -- pld [r1, #0] -- pld [r1, #128] -- vst1.32 {q0,q1}, [r0]! -- vst1.32 {q2,q3}, [r0]! -- vst1.32 {q4,q5}, [r0]! 
-- vst1.32 {q6,q7}, [r0]! -- subs r12, r12, #1 -- pld [r0, #0] -- pld [r0, #128] -- bne neon_f2b_copy_128_a_loop --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q4-q7} --#else -- vpop {q4-q7} --#endif -- ands r2, r2, #0x7f -- beq neon_f2b_finish -- cmp r2, #32 -- bge neon_f2b_copy_32_a -- b neon_f2b_copy_finish_a --neon_f2b_copy_32_a: -- mov r12, r2, lsr #5 --neon_f2b_copy_32_a_loop: -- vld1.32 {q0,q1}, [r1]! -- subs r12, r12, #1 -- pld [r1, #0] -- vst1.32 {q0,q1}, [r0]! -- bne neon_f2b_copy_32_a_loop -- ands r2, r2, #0x1f -- beq neon_f2b_finish --neon_f2b_copy_finish_a: --neon_f2b_copy_16_a: -- movs r12, r2, lsr #4 -- beq neon_f2b_copy_8_a --neon_f2b_copy_16_a_loop: -- vld1.32 {q0}, [r1]! -- subs r12, r12, #1 -- vst1.32 {q0}, [r0]! -- bne neon_f2b_copy_16_a_loop -- ands r2, r2, #0xf -- beq neon_f2b_finish --neon_f2b_copy_8_a: -- cmp r2, #8 -- blt neon_f2b_copy_4_a -- ldm r1!, {r4-r5} -- subs r2, r2, #8 -- stm r0!, {r4-r5} --neon_f2b_copy_4_a: -- cmp r2, #4 -- blt neon_f2b_copy_1_a -- ldr r4, [r1], #4 -- subs r2, r2, #4 -- str r4, [r0], #4 --neon_f2b_copy_1_a: -- cmp r2, #0 -- beq neon_f2b_finish --neon_f2b_copy_1_a_loop: -- ldrb r12, [r1], #1 -- subs r2, r2, #1 -- strb r12, [r0], #1 -- bne neon_f2b_copy_1_a_loop -- b neon_f2b_finish -- -- /* ############################################################# -- * Front to Back copy - unaligned -- */ --neon_f2b_nonaligned: -- /* -- * For sizes < 8, does it really make sense to do the whole shift -- * party? Note that we DON'T want to call neon_f2b_copy_1_u, -- * since we'll end up trying to pop r8-r11, and we DON'T want -- * to do that... -- */ -- cmp r2, #8 -- ble neon_f2b_copy_1_a -- --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- stmdb sp!, {r7-r9} --#else -- push {r7-r9} --#endif -- cmp r12, #2 -- ldrb r4, [r1], #1 -- ldrleb r5, [r1], #1 -- ldrltb r6, [r1], #1 -- rsb r8, r12, #4 -- sub r2, r2, r8 -- lsl r8, r8, #3 -- orrle r4, r4, r5, lsl #8 -- orrlt r4, r4, r6, lsl #16 -- rsb r9, r8, #32 -- /* -- * r4 = overflow bits -- * r8 = # of bits we copied into the r4 register to align source. -- * r9 = 32 - r8 -- * r12 = Index counter for each size, so we determine how many times -- * the given size will go into r2, then count down that # of -- * times in r12. -- */ -- cmp r2, #64 -- blt neon_f2b_unaligned_route -- ands r12, r0, #0xf -- beq neon_f2b_unaligned_route -- cmp r3, #4 -- blt neon_f2b_unaligned_route -- rsb r12, r12, #16 --neon_f2b_16_start_u: -- sub r2, r2, r12 -- lsrs r6, r12, #2 --neon_f2b_align_16_4_u: -- ldr r5, [r1], #4 -- subs r6, r6, #1 -- orr r4, r4, r5, lsl r8 -- str r4, [r0], #4 -- mov r4, r5, lsr r9 -- bne neon_f2b_align_16_4_u --neon_f2b_unaligned_route: -- cmp r2, r3 -- movle r12, r2 -- movgt r12, r3 -- cmp r12, #256 -- bge neon_f2b_copy_64_u -- cmp r12, #64 -- bge neon_f2b_copy_32_u -- cmp r12, #16 -- bge neon_f2b_copy_16_u -- cmp r12, #8 -- bge neon_f2b_copy_8_u -- cmp r12, #4 -- bge neon_f2b_copy_4_u -- b neon_f2b_last_bits_u --neon_f2b_copy_64_u: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4} -- vstmdb sp!, {q5-q8} --#else -- vpush {q4} -- vpush {q5-q8} --#endif -- vdup.u32 q8, r8 -- mov r12, r2, lsr #6 -- and r2, r2, #0x3f --neon_f2b_copy_64_u_loop: -- vld1.32 {q4, q5}, [r1]! -- vld1.32 {q6, q7}, [r1]! 
-- lsls r5, r8, #28 -- bcc neon_f2b_copy_64_u_b8 -- bpl neon_f2b_copy_64_u_b16 -- vshr.u64 q0, q4, #40 -- vshr.u64 q1, q5, #40 -- vshr.u64 q2, q6, #40 -- vshr.u64 q3, q7, #40 -- b neon_f2b_copy_64_unify --neon_f2b_copy_64_u_b8: -- vshr.u64 q0, q4, #56 -- vshr.u64 q1, q5, #56 -- vshr.u64 q2, q6, #56 -- vshr.u64 q3, q7, #56 -- b neon_f2b_copy_64_unify --neon_f2b_copy_64_u_b16: -- vshr.u64 q0, q4, #48 -- vshr.u64 q1, q5, #48 -- vshr.u64 q2, q6, #48 -- vshr.u64 q3, q7, #48 --neon_f2b_copy_64_unify: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vshl.u64 q4, q8, q4 -- vshl.u64 q5, q8, q5 -- vshl.u64 q6, q8, q6 -- vshl.u64 q7, q8, q7 --#else -- vshl.u64 q4, q4, q8 -- vshl.u64 q5, q5, q8 -- vshl.u64 q6, q6, q8 -- vshl.u64 q7, q7, q8 --#endif -- vmov r5, s14 -- vorr d9, d9, d0 -- vmov s14, r4 -- vorr d10, d10, d1 -- vorr d11, d11, d2 -- vorr d12, d12, d3 -- vorr d13, d13, d4 -- vorr d14, d14, d5 -- vorr d15, d15, d6 -- vorr d8, d8, d7 -- subs r12, r12, #1 -- pld [r1, #0] -- pld [r1, #128] -- mov r4, r5 -- vst1.32 {q4, q5}, [r0]! -- vst1.32 {q6, q7}, [r0]! -- pld [r0, #0] -- pld [r0, #128] -- bne neon_f2b_copy_64_u_loop --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q5-q8} -- vldmia sp!, {q4} --#else -- vpop {q5-q8} -- vpop {q4} --#endif -- cmp r2, #32 -- bge neon_f2b_copy_32_u -- b neon_f2b_copy_finish_u --neon_f2b_copy_32_u: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4} --#else -- vpush {q4} --#endif -- vdup.u32 q4, r8 -- mov r12, r2, lsr #5 -- and r2, r2, #0x1f --neon_f2b_copy_32_u_loop: -- vld1.32 {q0, q1}, [r1]! -- lsls r5, r8, #28 -- bcc neon_f2b_copy_32_u_b8 -- bpl neon_f2b_copy_32_u_b16 -- vshr.u64 q2, q0, #40 -- vshr.u64 q3, q1, #40 -- b neon_f2b_copy_32_unify --neon_f2b_copy_32_u_b8: -- vshr.u64 q2, q0, #56 -- vshr.u64 q3, q1, #56 -- b neon_f2b_copy_32_unify --neon_f2b_copy_32_u_b16: -- vshr.u64 q2, q0, #48 -- vshr.u64 q3, q1, #48 --neon_f2b_copy_32_unify: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vshl.u64 q0, q4, q0 -- vshl.u64 q1, q4, q1 --#else -- vshl.u64 q0, q0, q4 -- vshl.u64 q1, q1, q4 --#endif -- vmov r5, s14 -- vorr d1, d1, d4 -- vmov s14, r4 -- vorr d2, d2, d5 -- vorr d3, d3, d6 -- vorr d0, d0, d7 -- subs r12, r12, #1 -- pld [r1, #0] -- mov r4, r5 -- vst1.32 {q0, q1}, [r0]! -- bne neon_f2b_copy_32_u_loop --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q4} --#else -- vpop {q4} --#endif --neon_f2b_copy_finish_u: --neon_f2b_copy_16_u: -- movs r12, r2, lsr #4 -- beq neon_f2b_copy_8_u -- vdup.u32 q2, r8 -- and r2, r2, #0xf --neon_f2b_copy_16_u_loop: -- vld1.32 {q0}, [r1]! -- lsls r5, r8, #28 -- bcc neon_f2b_copy_16_u_b8 -- bpl neon_f2b_copy_16_u_b16 -- vshr.u64 q1, q0, #40 -- b neon_f2b_copy_16_unify --neon_f2b_copy_16_u_b8: -- vshr.u64 q1, q0, #56 -- b neon_f2b_copy_16_unify --neon_f2b_copy_16_u_b16: -- vshr.u64 q1, q0, #48 --neon_f2b_copy_16_unify: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vshl.u64 q0, q2, q0 --#else -- vshl.u64 q0, q0, q2 --#endif -- vmov r5, s6 -- vorr d1, d1, d2 -- vmov s6, r4 -- vorr d0, d0, d3 -- subs r12, r12, #1 -- mov r4, r5 -- vst1.32 {q0}, [r0]! 
-- bne neon_f2b_copy_16_u_loop --neon_f2b_copy_8_u: -- cmp r2, #8 -- blt neon_f2b_copy_4_u -- ldm r1!, {r6-r7} -- subs r2, r2, #8 -- orr r4, r4, r6, lsl r8 -- mov r5, r6, lsr r9 -- orr r5, r5, r7, lsl r8 -- stm r0!, {r4-r5} -- mov r4, r7, lsr r9 --neon_f2b_copy_4_u: -- cmp r2, #4 -- blt neon_f2b_last_bits_u -- ldr r5, [r1], #4 -- subs r2, r2, #4 -- orr r4, r4, r5, lsl r8 -- str r4, [r0], #4 -- mov r4, r5, lsr r9 --neon_f2b_last_bits_u: -- lsr r8, r8, #0x3 --neon_f2b_last_bits_u_loop: -- strb r4, [r0], #1 -- subs r8, r8, #1 -- lsr r4, r4, #8 -- bne neon_f2b_last_bits_u_loop --neon_f2b_copy_1_u: -- cmp r2, #0 -- beq neon_f2b_finish_u --neon_f2b_copy_1_u_loop: -- ldrb r12, [r1], #1 -- subs r2, r2, #1 -- strb r12, [r0], #1 -- bne neon_f2b_copy_1_u_loop --neon_f2b_finish_u: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- ldmia sp!, {r7-r9} --#else -- pop {r7-r9} --#endif -- /* ############################################################# -- * Front to Back copy - finish -- */ --neon_f2b_finish: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- ldmia sp!, {r4-r6} --#else -- pop {r4-r6} --#endif -- b neon_memmove_done -- -- /* ############################################################# -- * Back to Front copy -- */ --neon_back_to_front_copy: -- /* -- * Here, we'll want to shift to the end of the buffers. This -- * actually points us one past where we need to go, but since -- * we'll pre-decrement throughout, this will be fine. -- */ -- add r0, r0, r2 -- add r1, r1, r2 -- cmp r2, #4 -- bgt neon_b2f_gt4 -- cmp r2, #0 --neon_b2f_smallcopy_loop: -- beq neon_memmove_done -- ldrb r12, [r1, #-1]! -- subs r2, r2, #1 -- strb r12, [r0, #-1]! -- b neon_b2f_smallcopy_loop --neon_b2f_gt4: -- pld [r0, #0] -- pld [r1, #0] -- /* -- * The minimum of the overlap window size and the copy size -- * is in r3. -- */ -- sub r3, r0, r1 --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- stmdb sp!, {r4-r5} --#else -- push {r4-r5} --#endif -- -- /* -- * Check alignment. Since we'll pre-decrement as we step thru, we'll -- * need to make sure we're on word-alignment. -- */ --neon_b2f_check_align: -- ands r12, r0, #0x3 -- beq neon_b2f_source_align_check -- sub r2, r2, r12 --neon_b2f_shift_align: -- ldrb r4, [r1, #-1]! -- subs r12, r12, #1 -- strb r4, [r0, #-1]! -- bne neon_b2f_shift_align --neon_b2f_source_align_check: -- ands r4, r1, #0x3 -- bne neon_b2f_nonaligned -- --neon_b2f_try_16_align: -- /* If we're >64, attempt to align on 16-bytes. Smaller amounts -- * don't seem to be worth handling. */ -- cmp r2, #64 -- blt neon_b2f_align_route -- ands r12, r0, #0xf -- beq neon_b2f_align_route -- /* In this case, r12 has the number of bytes to roll backward. */ --neon_b2f_16_start: -- sub r2, r2, r12 -- lsrs r5, r12, #2 --neon_b2f_align_16_4: -- ldr r4, [r1, #-4]! -- subs r5, r5, #1 -- str r4, [r0, #-4]! 
-- bne neon_b2f_align_16_4 --neon_b2f_align_route: -- /* -- * ############################################################# -- * Back to Front copy - aligned -- */ -- cmp r2, r3 -- movle r12, r2 -- movgt r12, r3 -- cmp r12, #256 -- bge neon_b2f_copy_128_a -- cmp r12, #64 -- bge neon_b2f_copy_32_a -- cmp r12, #8 -- bge neon_b2f_copy_8_a -- cmp r12, #4 -- bge neon_b2f_copy_4_a -- b neon_b2f_copy_1_a --neon_b2f_copy_128_a: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4-q7} --#else -- vpush {q4-q7} --#endif -- movs r12, r2, lsr #7 -- /* -- * This irks me. There MUST be a better way to read these in and -- * scan the register backward instead of making it go forward. Then -- * we need to do two subtractions... -- */ --neon_b2f_copy_128_a_loop: -- sub r1, r1, #128 -- sub r0, r0, #128 -- vld1.32 {q0, q1}, [r1]! -- vld1.32 {q2, q3}, [r1]! -- vld1.32 {q4, q5}, [r1]! -- vld1.32 {q6, q7}, [r1]! -- pld [r1, #-128] -- pld [r1, #-256] -- vst1.32 {q0, q1}, [r0]! -- vst1.32 {q2, q3}, [r0]! -- vst1.32 {q4, q5}, [r0]! -- vst1.32 {q6, q7}, [r0]! -- subs r12, r12, #1 -- pld [r0, #-128] -- pld [r0, #-256] -- sub r1, r1, #128 -- sub r0, r0, #128 -- bne neon_b2f_copy_128_a_loop --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q4-q7} --#else -- vpop {q4-q7} --#endif -- ands r2, r2, #0x7f -- beq neon_b2f_finish -- cmp r2, #32 -- bge neon_b2f_copy_32_a -- b neon_b2f_copy_finish_a --neon_b2f_copy_32_a: -- mov r12, r2, lsr #5 --neon_b2f_copy_32_a_loop: -- sub r1, r1, #32 -- sub r0, r0, #32 -- vld1.32 {q0,q1}, [r1] -- subs r12, r12, #1 -- vst1.32 {q0,q1}, [r0] -- pld [r1, #0] -- bne neon_b2f_copy_32_a_loop -- ands r2, r2, #0x1f -- beq neon_b2f_finish --neon_b2f_copy_finish_a: --neon_b2f_copy_8_a: -- movs r12, r2, lsr #0x3 -- beq neon_b2f_copy_4_a --neon_b2f_copy_8_a_loop: -- ldmdb r1!, {r4-r5} -- subs r12, r12, #1 -- stmdb r0!, {r4-r5} -- bne neon_b2f_copy_8_a_loop -- and r2, r2, #0x7 --neon_b2f_copy_4_a: -- movs r12, r2, lsr #0x2 -- beq neon_b2f_copy_1_a -- and r2, r2, #0x3 --neon_b2f_copy_4_a_loop: -- ldr r4, [r1, #-4]! -- subs r12, r12, #1 -- str r4, [r0, #-4]! -- bne neon_b2f_copy_4_a_loop --neon_b2f_copy_1_a: -- cmp r2, #0 -- beq neon_b2f_finish --neon_b2f_copy_1_a_loop: -- ldrb r12, [r1, #-1]! -- subs r2, r2, #1 -- strb r12, [r0, #-1]! -- bne neon_b2f_copy_1_a_loop -- -- /* ############################################################# -- * Back to Front copy - unaligned -- */ --neon_b2f_nonaligned: -- /* -- * For sizes < 8, does it really make sense to do the whole shift -- * party? -- */ -- cmp r2, #8 -- ble neon_b2f_copy_1_a --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- stmdb sp!, {r6-r11} --#else -- push {r6-r11} --#endif -- /* -- * r3 = max window size -- * r4 = overflow bytes -- * r5 = bytes we're reading into -- * r6 = # bytes we're off. -- * r10 = copy of r6 -- */ -- and r6, r1, #0x3 -- eor r4, r4, r4 -- mov r10, r6 --neon_b2f_realign: -- ldrb r5, [r1, #-1]! -- subs r6, r6, #1 -- orr r4, r5, r4, lsl #8 -- bne neon_b2f_realign -- /* -- * r10 = # of bits we copied into the r4 register to align source. -- * r11 = 32 - r10 -- * r12 = Index counter for each size, so we determine how many times -- * the given size will go into r2, then count down that # of -- * times in r12. 
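The routine's two mirrored halves make the memmove contract concrete: the copy direction is chosen from pointer order so that the read always stays ahead of the write into any overlap, which is also why plain libc memmove is a valid drop-in replacement for this code. A minimal C sketch of the dispatch:

#include <stdint.h>
#include <stddef.h>

/* Direction-dispatching move: safe for overlap because each half walks
   away from the region it has already consumed. */
void *simple_memmove(void *dest, const void *source, size_t n)
{
    uint8_t *d = dest;
    const uint8_t *s = source;

    if (d < s) {
        while (n--) *d++ = *s++;        /* front to back */
    } else if (d > s) {
        d += n;
        s += n;
        while (n--) *--d = *--s;        /* back to front */
    }
    return dest;                        /* d == s: no-op */
}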
-- */ -- sub r2, r2, r10 -- lsl r10, r10, #0x3 -- rsb r11, r10, #32 -- -- cmp r2, r3 -- movle r12, r2 -- movgt r12, r3 -- cmp r12, #256 -- bge neon_b2f_copy_64_u -- cmp r12, #64 -- bge neon_b2f_copy_32_u -- cmp r12, #8 -- bge neon_b2f_copy_8_u -- cmp r12, #4 -- bge neon_b2f_copy_4_u -- b neon_b2f_last_bits_u --neon_b2f_copy_64_u: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4,q5} -- vstmdb sp!, {q6-q8} --#else -- vpush {q4,q5} -- vpush {q6-q8} --#endif -- add r7, r11, #32 -- movs r12, r2, lsr #6 -- vdup.u32 q8, r7 --neon_b2f_copy_64_u_loop: -- sub r1, r1, #64 -- sub r0, r0, #64 -- vld1.32 {q0, q1}, [r1]! -- vld1.32 {q2, q3}, [r1] -- sub r1, r1, #32 -- vmov q4, q0 -- vmov q5, q1 -- vmov q6, q2 -- vmov q7, q3 -- vmov r5, s0 -- mov r4, r4, lsl r11 -- lsls r6, r10, #28 -- bcc neon_b2f_copy_64_u_b8 -- bpl neon_b2f_copy_64_u_b16 -- vshr.u64 q0, q0, #24 -- vshr.u64 q1, q1, #24 -- vshr.u64 q2, q2, #24 -- vshr.u64 q3, q3, #24 -- b neon_b2f_copy_64_unify --neon_b2f_copy_64_u_b8: -- vshr.u64 q0, q0, #8 -- vshr.u64 q1, q1, #8 -- vshr.u64 q2, q2, #8 -- vshr.u64 q3, q3, #8 -- b neon_b2f_copy_64_unify --neon_b2f_copy_64_u_b16: -- vshr.u64 q0, q0, #16 -- vshr.u64 q1, q1, #16 -- vshr.u64 q2, q2, #16 -- vshr.u64 q3, q3, #16 --neon_b2f_copy_64_unify: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vshl.u64 q4, q8, q4 -- vshl.u64 q5, q8, q5 -- vshl.u64 q6, q8, q6 -- vshl.u64 q7, q8, q7 --#else -- vshl.u64 q4, q4, q8 -- vshl.u64 q5, q5, q8 -- vshl.u64 q6, q6, q8 -- vshl.u64 q7, q7, q8 --#endif -- vmov s17, r4 -- vorr d7, d7, d8 -- vorr d6, d6, d15 -- vorr d5, d5, d14 -- vorr d4, d4, d13 -- vorr d3, d3, d12 -- vorr d2, d2, d11 -- vorr d1, d1, d10 -- vorr d0, d0, d9 -- mov r4, r5, lsl r11 -- subs r12, r12, #1 -- lsr r4, r4, r11 -- vst1.32 {q0, q1}, [r0]! 
-- vst1.32 {q2, q3}, [r0] -- pld [r1, #0] -- sub r0, r0, #32 -- bne neon_b2f_copy_64_u_loop --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q6-q8} -- vldmia sp!, {q4,q5} --#else -- vpop {q6-q8} -- vpop {q4,q5} --#endif -- ands r2, r2, #0x3f -- cmp r2, #32 -- bge neon_b2f_copy_32_u -- b neon_b2f_copy_finish_u --neon_b2f_copy_32_u: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vstmdb sp!, {q4} --#else -- vpush {q4} --#endif -- add r7, r11, #32 -- movs r12, r2, lsr #5 -- vdup.u32 q4, r7 -- and r2, r2, #0x1f --neon_b2f_copy_32_u_loop: -- sub r1, r1, #32 -- sub r0, r0, #32 -- vld1.32 {q0, q1}, [r1] -- vmov q2, q0 -- vmov q3, q1 -- vmov r5, s0 -- mov r4, r4, lsl r11 -- lsls r6, r10, #28 -- bcc neon_b2f_copy_32_u_b8 -- bpl neon_b2f_copy_32_u_b16 -- vshr.u64 q0, q0, #24 -- vshr.u64 q1, q1, #24 -- b neon_b2f_copy_32_unify --neon_b2f_copy_32_u_b8: -- vshr.u64 q0, q0, #8 -- vshr.u64 q1, q1, #8 -- b neon_b2f_copy_32_unify --neon_b2f_copy_32_u_b16: -- vshr.u64 q0, q0, #16 -- vshr.u64 q1, q1, #16 --neon_b2f_copy_32_unify: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vshl.u64 q2, q4, q2 -- vshl.u64 q3, q4, q3 --#else -- vshl.u64 q2, q2, q4 -- vshl.u64 q3, q3, q4 --#endif -- vmov s9, r4 -- vorr d3, d3, d4 -- vorr d2, d2, d7 -- vorr d1, d1, d6 -- vorr d0, d0, d5 -- mov r4, r5, lsl r11 -- subs r12, r12, #1 -- lsr r4, r4, r11 -- vst1.32 {q0, q1}, [r0] -- pld [r1, #0] -- bne neon_b2f_copy_32_u_loop --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- vldmia sp!, {q4} --#else -- vpop {q4} --#endif --neon_b2f_copy_finish_u: --neon_b2f_copy_8_u: -- movs r12, r2, lsr #0x3 -- beq neon_b2f_copy_4_u -- mov r5, r4, lsl r11 --neon_b2f_copy_8_u_loop: -- ldmdb r1!, {r6-r7} -- subs r12, r12, #1 -- orr r5, r5, r7, lsr r10 -- mov r4, r7, lsl r11 -- orr r4, r4, r6, lsr r10 -- stmdb r0!, {r4-r5} -- mov r4, r6, lsl r11 -- lsr r4, r4, r11 -- mov r5, r4, lsl r11 -- bne neon_b2f_copy_8_u_loop -- ands r2, r2, #0x7 --neon_b2f_copy_4_u: -- movs r12, r2, lsr #0x2 -- beq neon_b2f_last_bits_u -- mov r5, r4, lsl r11 --neon_b2f_copy_4_u_loop: -- ldr r6, [r1, #-4]! -- subs r12, r12, #1 -- orr r5, r5, r6, lsr r10 -- str r5, [r0, #-4]! -- mov r4, r6, lsl r11 -- lsr r4, r4, r11 -- mov r5, r4, lsl r11 -- bne neon_b2f_copy_4_u_loop -- and r2, r2, #0x3 --neon_b2f_last_bits_u: --neon_b2f_last_bits_u_loop: -- subs r10, r10, #8 -- mov r5, r4, lsr r10 -- strb r5, [r0, #-1]! -- bne neon_b2f_last_bits_u_loop --neon_b2f_copy_1_u: -- cmp r2, #0 -- beq neon_b2f_finish_u --neon_b2f_copy_1_u_loop: -- ldrb r12, [r1, #-1]! -- subs r2, r2, #1 -- strb r12, [r0, #-1]! -- bne neon_b2f_copy_1_u_loop --neon_b2f_finish_u: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- ldmia sp!, {r6-r11} --#else -- pop {r6-r11} --#endif -- --neon_b2f_finish: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- ldmia sp!, {r4-r5} --#else -- pop {r4-r5} --#endif -- --neon_memmove_done: --#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) -- ldmia sp!, {r0} --#else -- pop {r0} --#endif -- bx lr -- -- .endfunc -- .end -diff --git git/src/neon_memsets.c git/src/neon_memsets.c -deleted file mode 100755 -index 740fc1e..0000000 ---- git/src/neon_memsets.c -+++ /dev/null -@@ -1,169 +0,0 @@ --/* neon_memsets.c -- * -- * Copyright (c) 2009, Code Aurora Forum. 
All rights reserved. -- * -- * Redistribution and use in source and binary forms, with or without -- * modification, are permitted provided that the following conditions are met: -- * * Redistributions of source code must retain the above copyright -- * notice, this list of conditions and the following disclaimer. -- * * Redistributions in binary form must reproduce the above copyright -- * notice, this list of conditions and the following disclaimer in the -- * documentation and/or other materials provided with the distribution. -- * * Neither the name of Code Aurora nor -- * the names of its contributors may be used to endorse or promote -- * products derived from this software without specific prior written -- * permission. -- * -- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -- * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -- * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -- */ -- --#include "msm-swblits.h" -- --void memset16(uint16_t dst[], uint16_t value, int count) --{ -- if (count <= 0) -- return; -- -- asm volatile( -- " pld [%[dst], #0] \n" -- " cmp %[count], #4 \n" -- " blt 6f \n" -- " tst %[dst], #0x3 \n" -- " strneh %[value], [%[dst]], #2 \n" -- " subne %[count], %[count], #1 \n" -- " vdup.u16 q8, %[value] \n" -- " vmov q9, q8 \n" -- " cmp %[count], #64 \n" -- " bge 0f \n" -- " cmp %[count], #32 \n" -- " bge 2f \n" -- " cmp %[count], #16 \n" -- " bge 3f \n" -- " cmp %[count], #8 \n" -- " bge 4f \n" -- " b 5f \n" -- "0: \n" -- " mov r12, %[count], lsr #6 \n" -- "1: \n" -- " vst1.16 {q8, q9}, [%[dst]]! \n" -- " vst1.16 {q8, q9}, [%[dst]]! \n" -- " vst1.16 {q8, q9}, [%[dst]]! \n" -- " vst1.16 {q8, q9}, [%[dst]]! \n" -- " subs r12, r12, #1 \n" -- " bne 1b \n" -- " ands %[count], %[count], #0x3f \n" -- " beq 7f \n" -- "2: \n" -- " cmp %[count], #32 \n" -- " blt 3f \n" -- " vst1.16 {q8, q9}, [%[dst]]! \n" -- " vst1.16 {q8, q9}, [%[dst]]! \n" -- " subs %[count], %[count], #32 \n" -- " beq 7f \n" -- "3: \n" -- " cmp %[count], #16 \n" -- " blt 4f \n" -- " vst1.16 {q8, q9}, [%[dst]]! \n" -- " subs %[count], %[count], #16 \n" -- " beq 7f \n" -- "4: \n" -- " cmp %[count], #8 \n" -- " blt 5f \n" -- " vst1.16 {q8}, [%[dst]]! \n" -- " subs %[count], %[count], #8 \n" -- " beq 7f \n" -- "5: \n" -- " cmp %[count], #4 \n" -- " blt 6f \n" -- " vst1.16 {d16}, [%[dst]]! 
\n" -- " subs %[count], %[count], #4 \n" -- " beq 7f \n" -- "6: \n" -- " cmp %[count], #0 \n" -- " blt 7f \n" -- " lsls %[count], #31 \n" -- " strmih %[value], [%[dst]], #2 \n" -- " strcsh %[value], [%[dst]], #2 \n" -- " strcsh %[value], [%[dst]], #2 \n" -- "7: \n" -- // Clobbered input registers -- : [dst] "+r" (dst), [count] "+r" (count) -- // Unclobbered input -- : [value] "r" (value) -- // Clobbered registers -- : "q8", "q9", "r12", "cc", "memory" -- ); --} -- --void memset32(uint32_t dst[], uint32_t value, int count) --{ -- asm volatile( -- " pld [%[dst], #0] \n" -- " cmp %[count], #4 \n" -- " blt 5f \n" -- " vdup.u32 q8, %[value] \n" -- " vmov q9, q8 \n" -- " cmp %[count], #32 \n" -- " bge 0f \n" -- " cmp %[count], #16 \n" -- " bge 2f \n" -- " cmp %[count], #8 \n" -- " bge 3f \n" -- " b 4f \n" -- "0: \n" -- " mov r12, %[count], lsr #5 \n" -- "1: \n" -- " vst1.32 {q8, q9}, [%[dst]]! \n" -- " vst1.32 {q8, q9}, [%[dst]]! \n" -- " vst1.32 {q8, q9}, [%[dst]]! \n" -- " vst1.32 {q8, q9}, [%[dst]]! \n" -- " pld [%[dst], #0] \n" -- " subs r12, r12, #1 \n" -- " bne 1b \n" -- " ands %[count], %[count], #0x1f \n" -- " beq 6f \n" -- "2: \n" -- " cmp %[count], #16 \n" -- " blt 3f \n" -- " vst1.32 {q8, q9}, [%[dst]]! \n" -- " vst1.32 {q8, q9}, [%[dst]]! \n" -- " subs %[count], %[count], #16 \n" -- " beq 6f \n" -- "3: \n" -- " cmp %[count], #8 \n" -- " blt 4f \n" -- " vst1.32 {q8, q9}, [%[dst]]! \n" -- " subs %[count], %[count], #8 \n" -- " beq 6f \n" -- "4: \n" -- " cmp %[count], #4 \n" -- " blt 5f \n" -- " vst1.32 {q8}, [%[dst]]! \n" -- " subs %[count], %[count], #4 \n" -- " beq 6f \n" -- "5: \n" -- " cmp %[count], #0 \n" -- " beq 6f \n" -- " lsls %[count], #31 \n" -- " strmi %[value], [%[dst]], #4 \n" -- " strcs %[value], [%[dst]], #4 \n" -- " strcs %[value], [%[dst]], #4 \n" -- "6: @end \n" -- // Clobbered input registers -- : [dst] "+r" (dst), [count] "+r" (count) -- // Unclobbered input -- : [value] "r" (value) -- // Clobbered registers -- : "q8", "q9", "r12", "cc", "memory" -- ); --} diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch deleted file mode 100644 index 97ad380e27..0000000000 --- a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch +++ /dev/null @@ -1,36 +0,0 @@ -commit 18515a56822fcd9c0a71240edce97ea5623b0448 -Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch> -Date: Wed Feb 10 16:29:55 2010 +0100 - - Modify Makefile.am - Removed depencies for neon - -diff --git git/src/Makefile.am git/src/Makefile.am -index 8ab1856..08da5a5 100755 ---- a/src/Makefile.am -+++ b/src/Makefile.am -@@ -12,13 +12,7 @@ MSM_DRI_SRCS += msm-drm.c msm-dri2.c - msm_drv_la_LIBADD += $(DRI2_LIBS) - endif - --NEON_CFLAGS=-march=armv7-a -mfpu=neon -mfloat-abi=softfp --NEON_CCASFLAGS=$(NEON_CFLAGS) -mthumb-interwork --NEON_ASFLAGS=-k -mcpu=cortex-a8 $(NEON_CCASFLAGS) -- --AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ $(NEON_CFLAGS) -Wall -Werror --AM_ASFLAGS = $(NEON_ASFLAGS) --AM_CCASFLAGS = $(NEON_CCASFLAGS) -+AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ -Wall -Werror - - msm_drv_la_LTLIBRARIES = msm_drv.la - msm_drv_la_LDFLAGS = -module -avoid-version -@@ -37,9 +31,6 @@ msm_drv_la_SOURCES = \ - msm-swfill.c \ - msm-hwrender.c \ - msm-pixmap.c \ -- neon_memsets.c \ -- neon_memcpy.S \ -- neon_memmove.S \ - $(MSM_DRI_SRCS) - - diff --git a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch deleted file mode 100644 index 90dd31f605..0000000000 --- 
a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch +++ /dev/null @@ -1,116 +0,0 @@ -commit cc83ba5835d5b55347fd0c0775156493b0cf3a15 -Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch> -Date: Thu Feb 11 16:26:52 2010 +0100 - - Renaming variables for getting Xorg (xf86-video-msm) work - under linux-leviathan (htcdream): - cd src - sed 's/fixed_info/fix/' -i *.h - sed 's/fixed_info/fix/' -i *.c - -diff --git git/src/msm-dri.c git/src/msm-dri.c -index a51d3bd..a74368b 100644 ---- git/src/msm-dri.c -+++ git/src/msm-dri.c -@@ -151,10 +151,10 @@ MSMDRIScreenInit(ScreenPtr pScreen) - pDRIInfo->ddxDriverMinorVersion = 0; - pDRIInfo->ddxDriverPatchVersion = 0; - -- pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fixed_info.smem_start; -+ pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fix.smem_start; - -- pDRIInfo->frameBufferSize = pMsm->fixed_info.smem_len; -- pDRIInfo->frameBufferStride = pMsm->fixed_info.line_length; -+ pDRIInfo->frameBufferSize = pMsm->fix.smem_len; -+ pDRIInfo->frameBufferStride = pMsm->fix.line_length; - - /* FIXME: How many drawables can we do (should we do)? */ - -diff --git git/src/msm-driver.c git/src/msm-driver.c -index 803197f..15378f8 100755 ---- git/src/msm-driver.c -+++ git/src/msm-driver.c -@@ -399,7 +399,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags) - - /* Get the fixed info (par) structure */ - -- if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fixed_info)) { -+ if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fix)) { - xf86DrvMsg(pScrn->scrnIndex, X_ERROR, - "Unable to read hardware info from %s: %s\n", - dev, strerror(errno)); -@@ -410,7 +410,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags) - /* Parse the ID and figure out what version of the MDP and what - * panel ID we have */ - -- if (sscanf(pMsm->fixed_info.id, "msmfb%d_%x", &mdpver, &panelid) < 2) { -+ if (sscanf(pMsm->fix.id, "msmfb%d_%x", &mdpver, &panelid) < 2) { - - xf86DrvMsg(pScrn->scrnIndex, X_ERROR, - "Unable to determine the MDP and panel type\n"); -@@ -435,7 +435,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags) - * the fbdev driver to allocate memory. 
In the mean time, we - * just reuse the framebuffer memory */ - -- pScrn->videoRam = pMsm->fixed_info.smem_len; -+ pScrn->videoRam = pMsm->fix.smem_len; - - /* Get the current screen setting */ - if (ioctl(pMsm->fd, FBIOGET_VSCREENINFO, &pMsm->mode_info)) { -@@ -671,8 +671,8 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags) - /* The framebuffer driver should always report the line length, - * but in case it doesn't, we can calculate it ourselves */ - -- if (pMsm->fixed_info.line_length) { -- pScrn->displayWidth = pMsm->fixed_info.line_length; -+ if (pMsm->fix.line_length) { -+ pScrn->displayWidth = pMsm->fix.line_length; - } else { - pScrn->displayWidth = pMsm->mode_info.xres_virtual * - pMsm->mode_info.bits_per_pixel / 8; -@@ -811,7 +811,7 @@ MSMCloseScreen(int scrnIndex, ScreenPtr pScreen) - #endif - - /* Unmap the framebuffer memory */ -- munmap(pMsm->fbmem, pMsm->fixed_info.smem_len); -+ munmap(pMsm->fbmem, pMsm->fix.smem_len); - - pScreen->CloseScreen = pMsm->CloseScreen; - -@@ -857,7 +857,7 @@ MSMScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv) - #endif // defined (MSMFB_GET_PAGE_PROTECTION) && defined (MSMFB_SET_PAGE_PROTECTION) - - /* Map the framebuffer memory */ -- pMsm->fbmem = mmap(NULL, pMsm->fixed_info.smem_len, -+ pMsm->fbmem = mmap(NULL, pMsm->fix.smem_len, - PROT_READ | PROT_WRITE, MAP_SHARED, pMsm->fd, 0); - - /* If we can't map the memory, then this is a short trip */ -diff --git git/src/msm-exa.c git/src/msm-exa.c -index 301923f..ce16a93 100755 ---- git/src/msm-exa.c -+++ git/src/msm-exa.c -@@ -740,8 +740,8 @@ MSMSetupExa(ScreenPtr pScreen) - pExa->flags = EXA_OFFSCREEN_PIXMAPS; - - pExa->offScreenBase = -- (pMsm->fixed_info.line_length * pMsm->mode_info.yres); -- pExa->memorySize = pMsm->fixed_info.smem_len; -+ (pMsm->fix.line_length * pMsm->mode_info.yres); -+ pExa->memorySize = pMsm->fix.smem_len; - - /* Align pixmap offsets along page boundaries */ - pExa->pixmapOffsetAlign = 4096; -diff --git git/src/msm.h git/src/msm.h -index e1e2bc7..520d390 100755 ---- git/src/msm.h -+++ git/src/msm.h -@@ -85,7 +85,7 @@ typedef struct _MSMRec - int fd; - - /* Fixed and var strutures from the framebuffer */ -- struct fb_fix_screeninfo fixed_info; -+ struct fb_fix_screeninfo fix; - struct fb_var_screeninfo mode_info; - - /* Pointer to the mapped framebuffer memory */ |