author    David Lanzendörfer <david.lanzendoerfer@o2s.ch>  2010-03-17 21:25:38 +0100
committer Lukas Gorris <lukas.gorris@gmail.com>            2010-03-17 21:25:38 +0100
commit    d695f337bdaa0e297ad89c6fdd99edf97bc270db (patch)
tree      23080a6660c4a5e76c82ef47b587e4c304bdf43b /recipes/xorg-driver
parent    6c5f9d4325be253fb3cb9b6d3bec11f7a7a13562 (diff)
download  openembedded-d695f337bdaa0e297ad89c6fdd99edf97bc270db.tar.gz
xf86-video-msm: fix build errors
Diffstat (limited to 'recipes/xorg-driver')
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/no_neon.patch             2901
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch         36
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/renaming_variables.patch   116
-rw-r--r--  recipes/xorg-driver/xf86-video-msm_git.bb                       10
4 files changed, 3061 insertions, 2 deletions
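
The patch below strips the ARM NEON intrinsics and the NEON-backed memcpy/memset helpers out of the msm driver's software blit and fill paths, falling back to plain C and the standard memset(), so the driver stops hitting "Illegal instruction" on kernels built without NEON support. As a rough illustration only (not code from this patch; the function name and signature are hypothetical), a scalar 16-bit fill of the kind the disabled memset16() helper provided can be written without any NEON at all:

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative scalar fallback for a 16-bit fill: no NEON
     * instructions, so it is safe on CPUs/kernels without NEON. */
    static void fill16_scalar(uint16_t *dst, uint16_t value, size_t count)
    {
        while (count--)
            *dst++ = value;
    }

Note that the standard memset() writes individual bytes rather than 16-bit values, which is why NEON-enabled drivers usually carry a dedicated memset16()-style helper; a scalar loop like the sketch above is the usual plain-C stand-in when the optimized version cannot be used.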
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
new file mode 100644
index 0000000000..c0aa92e76a
--- /dev/null
+++ b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
@@ -0,0 +1,2901 @@
+commit d8910bf773fbecf7cdea359d4b530a3672e27180
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date: Wed Feb 10 16:18:39 2010 +0100
+
+ Removed neon because it's not available in our kernel
+ and so it's causing trouble (Illegal instruction)
+
+diff --git git/src/msm-swblits.h git/src/msm-swblits.h
+index f89f00e..a40b24b 100755
+--- git/src/msm-swblits.h
++++ git/src/msm-swblits.h
+@@ -38,16 +38,6 @@
+ #include <stdint.h>
+ #include <stdlib.h>
+
+-/* Neon intrinsics are part of the ARM or GCC compiler used. */
+-/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */
+-#include <arm_neon.h>
+-
+-/* These are NEON-optimized functions linked to by various tests. */
+-extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes);
+-extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes);
+-extern void memset16(uint16_t *dst, uint16_t value, int count);
+-extern void memset32(uint32_t *dst, uint32_t value, int count);
+-
+ /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */
+ #define BITS_PER_BYTE (8)
+ #define BYTES_PER_16BPP_PIXEL (2)
+diff --git git/src/msm-swfill.c git/src/msm-swfill.c
+index 108fd94..3dd1ef2 100755
+--- git/src/msm-swfill.c
++++ git/src/msm-swfill.c
+@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou
+ }
+ }
+
+-
++/*
+ static inline void
+ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
+ {
+@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
+ // Quickly fill remaining pixels (up to 7).
+ memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
+ }
+-
++*/
+
+ static inline void
+ memset16_Test(uint16_t *dst, uint16_t src, int count)
+@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count)
+
+ // Copy remaining pixels using Neon and non-Neon instructions.
+ // NOTE: This assumes that dst is aligned optimally for Neon instructions.
+- memset16_AssumesNeonAlignment((void *) dst, src, count);
++ //memset16_AssumesNeonAlignment((void *) dst, src, count);
++ memset((void *) dst, src, count);
+ }
+ }
+
+@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp
+ if (w < 32) {
+ // For narrow rectangles, block signals only once for the entire rectangles.
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else {
+ // For wider rectangles, block and unblock signals for every row.
+- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ }
+
+diff --git git/src/msm-swrender.c git/src/msm-swrender.c
+index a7a9abc..835dc03 100755
+--- git/src/msm-swrender.c
++++ git/src/msm-swrender.c
+@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src
+ }
+ }
+ break;
+- case 7: if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
+- return TRUE;
+- } else {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
+- return TRUE;
+- }
+- break;
+- case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
+- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
+- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
+- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
+- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
+- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
+- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
+- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
+- return TRUE;
+- }
+- else {
+- uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
+- uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
+- uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
+- uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T);
+- uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T);
+- uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T);
+- uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T);
+- uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T);
+- *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1;
+- *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2;
+- *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3;
+- *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4;
+- *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5;
+- *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6;
+- *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7;
+- *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8;
+- return TRUE;
+- }
+- break;
+- case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
+- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
+- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
+- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
+- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
+- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
+- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
+- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
+- uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T);
+- uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T);
+- uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T);
+- uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
+- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
+- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
+- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
+- *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5;
+- *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6;
+- *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7;
+- *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8;
+- return TRUE;
+- }
+- else {
+- // Don't bother unrolling loops here, since that won't help for more than around 8 operations.
+- // Instead, just call multiple fixed functions.
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
+- } else {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
+- }
+- return TRUE;
+- }
+- break;
+- case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
+- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
+- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
+- uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T);
+- uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T);
+- uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T);
+- uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
+- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
+- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
+- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
+- *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5;
+- *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6;
+- *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7;
+- *(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8;
+- return TRUE;
+- }
+- break;
+- case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T));
+- uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T));
+- uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T));
+- uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
+- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
+- vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5);
+- vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6);
+- vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7);
+- vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8);
+- return TRUE;
+- }
+- break;
+ }
+
+ return FALSE;
+@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr
+ }
+ return TRUE;
+ break;
+- case 7: if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- break;
+- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- *(uint16_t *) (dst+0*dpitch+0) = src1a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
+- *(uint16_t *) (dst+1*dpitch+0) = src1b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
+- return TRUE;
+- }
+- else {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
+- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a;
+- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a;
+- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a;
+- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a;
+- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a;
+- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a;
+- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a;
+- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b;
+- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b;
+- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b;
+- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b;
+- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b;
+- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b;
+- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b;
+- return TRUE;
+- }
+- break;
+- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- *(uint32_t *) (dst+0*dpitch+0) = src1a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a;
+- *(uint32_t *) (dst+1*dpitch+0) = src1b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a;
+- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a;
+- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a;
+- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b;
+- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b;
+- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b;
+- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b;
+- return TRUE;
+- }
+- else {
+- // Don't bother unrolling loops, since that won't help for more than around 8 operations.
+- // Instead, just call multiple fixed functions.
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T);
+- uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T);
+- uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T);
+- uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T);
+- uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T);
+- uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T);
+- uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
+- *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a;
+- *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a;
+- *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a;
+- *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
+- *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b;
+- *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b;
+- *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b;
+- *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+- case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T));
+- uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T));
+- uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T));
+- uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T));
+- uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T));
+- uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T));
+- uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b);
+- return TRUE;
+- }//HERE
+- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+ }
+
+ return FALSE;
+@@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr
+ }
+ return TRUE;
+ break;
+- case 7: if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- break;
+- // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons?
+- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases...
+- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c;
+- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c;
+- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d;
+- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c;
+- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c;
+- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c;
+- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c;
+- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d;
+- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d;
+- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d;
+- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
+- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
+- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- *(uint16_t *) (dst+0*dpitch+0) = src1a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
+- *(uint16_t *) (dst+1*dpitch+0) = src1b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
+- *(uint16_t *) (dst+2*dpitch+0) = src1c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c;
+- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c;
+- *(uint16_t *) (dst+3*dpitch+0) = src1d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d;
+- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d;
+- return TRUE;
+- }
+- else {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
+- uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T);
+- uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T);
+- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a;
+- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a;
+- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a;
+- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a;
+- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a;
+- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a;
+- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a;
+- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b;
+- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b;
+- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b;
+- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b;
+- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b;
+- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b;
+- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b;
+- *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c;
+- *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c;
+- *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c;
+- *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T) = src4c;
+- *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T) = src5c;
+- *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T) = src6c;
+- *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T) = src7c;
+- *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T) = src8c;
+- *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d;
+- *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d;
+- *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d;
+- *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T) = src4d;
+- *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T) = src5d;
+- *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T) = src6d;
+- *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T) = src7d;
+- *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T) = src8d;
+- return TRUE;
+- }
+- break;
+- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3d = *(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
+- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c;
+- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c;
+- *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T) = src3c;
+- *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T) = src4c;
+- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d;
+- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d;
+- *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T) = src3d;
+- *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T) = src4d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- uint32_t src1c = *(uint32_t *) (src+2*spitch+0);
+- uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- uint32_t src1d = *(uint32_t *) (src+3*spitch+0);
+- uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- *(uint32_t *) (dst+0*dpitch+0) = src1a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a;
+- *(uint32_t *) (dst+1*dpitch+0) = src1b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b;
+- *(uint32_t *) (dst+2*dpitch+0) = src1c;
+- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2c;
+- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3c;
+- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5c;
+- *(uint32_t *) (dst+3*dpitch+0) = src1d;
+- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2d;
+- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3d;
+- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
+- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T);
+- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a;
+- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a;
+- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a;
+- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b;
+- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b;
+- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b;
+- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b;
+- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c;
+- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c;
+- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c;
+- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c;
+- *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T) = src5c;
+- *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T) = src6c;
+- *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T) = src7c;
+- *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T) = src8c;
+- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d;
+- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d;
+- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d;
+- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d;
+- *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T) = src5d;
+- *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T) = src6d;
+- *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T) = src7d;
+- *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T) = src8d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+- uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+- uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+- uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+- uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+- uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+- uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+- uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
+- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+- uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+- uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+- uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+- uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
+- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+- uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+- uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+- uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+- *(uint16_t *) (dst+0*dpitch+0) = src1a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8a;
+- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9a;
+- *(uint16_t *) (dst+1*dpitch+0) = src1b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8b;
+- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9b;
+- *(uint16_t *) (dst+2*dpitch+0) = src1c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8c;
+- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9c;
+- *(uint16_t *) (dst+3*dpitch+0) = src1d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8d;
+- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9d;
+- return TRUE;
+- }
+- else {
+- // Don't bother unrolling loops, since that won't help for more than around 8 operations.
+- // Instead, just call multiple fixed functions.
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+- // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons?
+- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here...
+- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d);
+- return TRUE;
+- }
+- break;
+- }
++ }
+
+ return FALSE;
+ }
+@@ -1924,10 +872,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
+ if (rowsOverlap)
+ {
+ if (w > 64) {
+- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ else if (w == 64) {
+- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ else {
+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
+@@ -1936,10 +886,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
+ else
+ {
+ if (w > 64) {
+- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ else if (w == 64) {
+- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ else {
+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
+@@ -1973,7 +925,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
+ if (xdir >= 0 || !rowsOverlap) {
+ if (w >= 128) {
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- neon_memcpy(dst, src, w);
++ //neon_memcpy(dst, src, w);
++ memcpy(dst, src, w);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else
+@@ -1982,7 +935,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
+ else {
+ if (w >= 128) {
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- neon_memmove(dst, src, w);
++ //neon_memmove(dst, src, w);
++ memmove(dst, src, w);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else
+@@ -2029,7 +983,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
+ if (xdir >= 0 || !rowsOverlap) {
+ if (w >= 42) {
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
++ //neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
++ memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else
+@@ -2038,7 +993,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
+ else {
+ if (w >= 42) {
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
++ //neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
++ memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else
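With the NEON helpers commented out in the hunks above, every wide row copy in the driver falls back to plain libc memcpy (rows cannot overlap) or memmove (rows may overlap), still wrapped in the same signal-blocking macros. A minimal sketch of that fallback pattern in C, assuming the hypothetical names copy_rect_rows and rows_overlap in place of the driver's DRAW_MULTIPLE_ROWS_* macros:

#include <stddef.h>
#include <string.h>

/* Illustrative sketch only, not part of no_neon.patch: one row copied
 * per iteration, with memmove chosen whenever source and destination
 * rows can overlap and memcpy otherwise.  The real driver additionally
 * walks rows in the opposite order for negative-y copies. */
static void copy_rect_rows(unsigned char *dst, const unsigned char *src,
                           size_t width_bytes, int h,
                           int dpitch, int spitch, int rows_overlap)
{
    int y;
    for (y = 0; y < h; y++) {
        if (rows_overlap)
            memmove(dst, src, width_bytes);
        else
            memcpy(dst, src, width_bytes);
        dst += dpitch;
        src += spitch;
    }
}

memmove tolerates overlapping buffers at the cost of an internal direction check, which is why the driver only selects it when the rows actually overlap.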
+diff --git git/src/neon_memcpy.S git/src/neon_memcpy.S
+deleted file mode 100644
+index 5ecc5ce..0000000
+--- git/src/neon_memcpy.S
++++ /dev/null
+@@ -1,549 +0,0 @@
+-/***************************************************************************
+- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+-
+- Redistribution and use in source and binary forms, with or without
+- modification, are permitted provided that the following conditions are met:
+- * Redistributions of source code must retain the above copyright
+- notice, this list of conditions and the following disclaimer.
+- * Redistributions in binary form must reproduce the above copyright
+- notice, this list of conditions and the following disclaimer in the
+- documentation and/or other materials provided with the distribution.
+- * Neither the name of Code Aurora nor the names of its contributors may
+- be used to endorse or promote products derived from this software
+- without specific prior written permission.
+-
+- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+- POSSIBILITY OF SUCH DAMAGE.
+- ***************************************************************************/
+-
+-/***************************************************************************
+- Neon memcpy: Attempts to do a memcpy with Neon registers if possible,
+- Inputs:
+- dest: The destination buffer
+- src: The source buffer
+- n: The size of the buffer to transfer
+- Outputs:
+-
+-***************************************************************************/
+-
+-/*
+- * General note:
+- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
+- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
+- * the correct object code for VPOP, resulting in horrific stack crashes.
+- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
+- * and VPOP->VLDMIA. We can revert this back once we update our toolchain.
+- *
+- * Also, VSHL swaps the source register and the shift-amount register
+- * around in 2006-q3. I've coded this incorrectly so it turns out correct
+- * in the object code, but we'll need to undo that later...
+- */
+-
+- .code 32
+- .align 4
+- .globl neon_memcpy
+- .func
+-
+-neon_memcpy:
+- /*
+- * First, make sure we're not copying < 4 bytes. If so, we'll
+- * just handle it here.
+- */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r0}
+-#else
+- push {r0}
+-#endif
+- cmp r2, #4
+- bgt neon_gt_4
+- /* Copy 0-4 bytes, if needed, and return.*/
+- cmp r2, #0
+-neon_smallcopy_loop:
+- beq neon_smallcopy_done
+- ldrb r12, [r1], #1
+- subs r2, r2, #1
+- strb r12, [r0], #1
+- b neon_smallcopy_loop
+-neon_smallcopy_done:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r0}
+-#else
+- pop {r0}
+-#endif
+- bx lr
+-
+- /* Copy 4 or more bytes*/
+-neon_gt_4:
+- /* Preload what we can...*/
+- pld [r0,#0]
+- pld [r1,#0]
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r4-r5}
+-#else
+- push {r4-r5}
+-#endif
+-
+-neon_check_align:
+- /* Check normal word alignment for target. */
+- ands r12, r0, #0x3
+- beq source_alignment_check
+-
+- /*
+- * Target is not aligned. Step through until we get that
+- * word-aligned. This works better than a loop, according
+- * to our pipeline modeler.
+- */
+- cmp r12, #2
+- ldrb r3, [r1], #1
+- ldrleb r4, [r1], #1
+- ldrltb r5, [r1], #1
+- rsb r12, r12, #4
+- sub r2, r2, r12
+- strb r3, [r0], #1
+- strleb r4, [r0], #1
+- strltb r5, [r0], #1
+-
+-source_alignment_check:
+- ands r12, r1, #0x3
+- bne neon_memcpy_nonaligned /* Source is not word aligned.*/
+-neon_try_16_align:
+- cmp r2, #64
+- blt neon_align_route
+- /* This is where we try 16-byte alignment. */
+- ands r12, r0, #0xf
+- beq neon_align_route
+- rsb r12, r12, #16
+-neon_16_start:
+- sub r2, r2, r12
+- lsrs r3, r12, #2
+-neon_align_16_4:
+- ldr r4, [r1], #4
+- subs r3, r3, #1
+- str r4, [r0], #4
+- bne neon_align_16_4
+-neon_align_route:
+- /* In this case, both source and target are word-aligned. */
+- cmp r2, #32768
+- bge neon_copy_128p_a
+- cmp r2, #256
+- bge neon_copy_128_a
+- cmp r2, #64
+- bge neon_copy_32_a
+- b neon_copy_finish_a
+- nop
+-neon_copy_128p_a:
+- /* We'll copy blocks 128-bytes at a time, but try to call pld to
+- * load in the next page, if possible.
+- */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4-q7}
+-#else
+- vpush {q4-q7}
+-#endif
+- mov r12, r2, lsr #7
+-neon_copy_128p_loop_a:
+- vld1.32 {q0, q1}, [r1]!
+- vld1.32 {q2, q3}, [r1]!
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- pld [r1, #0]
+- pld [r1, #1024]
+- vst1.32 {q0, q1}, [r0]!
+- vst1.32 {q2, q3}, [r0]!
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- subs r12, r12, #1
+- bne neon_copy_128p_loop_a
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4-q7}
+-#else
+- vpop {q4-q7}
+-#endif
+- ands r2, r2, #0x7f
+- beq neon_end
+- cmp r2, #32
+- blt neon_copy_finish_a
+- b neon_copy_32_a
+- /* Copy blocks of 128-bytes (word-aligned) at a time*/
+-neon_copy_128_a:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4-q7}
+-#else
+- vpush {q4-q7}
+-#endif
+- /*
+- * Move to a 1-s based countdown to determine when to loop. That
+- * allows the subs to set the Z flag without having to explicitly
+- * call cmp to a value.
+- */
+- mov r12, r2, lsr #7
+-neon_copy_128_loop_a:
+- vld1.32 {q0, q1}, [r1]!
+- vld1.32 {q2, q3}, [r1]!
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- pld [r1, #0]
+- pld [r1, #128]
+- vst1.32 {q0, q1}, [r0]!
+- vst1.32 {q2, q3}, [r0]!
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- subs r12, r12, #1
+- pld [r0, #0]
+- pld [r0, #128]
+- bne neon_copy_128_loop_a
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4-q7}
+-#else
+- vpop {q4-q7}
+-#endif
+- ands r2, r2, #0x7f
+- beq neon_end
+- cmp r2, #32
+- blt neon_copy_finish_a
+- /* Copy blocks of 32-bytes (word aligned) at a time*/
+-neon_copy_32_a:
+- mov r12, r2, lsr #5
+-neon_copy_32_loop_a:
+- vld1.32 {q0,q1}, [r1]!
+- subs r12, r12, #1
+- pld [r1,#0]
+- vst1.32 {q0,q1}, [r0]!
+- bne neon_copy_32_loop_a
+- ands r2, r2, #0x1f
+- beq neon_end
+-neon_copy_finish_a:
+-neon_copy_16_a:
+- movs r12, r2, lsr #4
+- beq neon_copy_8_a
+-neon_copy_16_a_loop:
+- vld1.32 {q0}, [r1]!
+- subs r12, r12, #1
+- vst1.32 {q0}, [r0]!
+- bne neon_copy_16_a_loop
+- ands r2, r2, #0xf
+- beq neon_end
+-neon_copy_8_a:
+- cmp r2, #8
+- blt neon_copy_4_a
+- ldm r1!, {r4-r5}
+- subs r2, r2, #8
+- stm r0!, {r4-r5}
+- /* Copy 4-bytes of word-aligned data at a time*/
+-neon_copy_4_a:
+- cmp r2, #4
+- blt neon_copy_finish
+- ldr r4, [r1], #4
+- subs r2, r2, #4
+- str r4, [r0], #4
+- b neon_copy_finish
+-
+- /*
+- * Handle unaligned data. The basic concept here is that we'll
+- * try to pull out enough data from the source to get that word-
+- * aligned, then do our writes word-aligned, storing the difference
+- * in a register, and shifting the data as needed.
+- */
+-neon_memcpy_nonaligned:
+- /*
+- * If this is <8 bytes, it makes more sense to just copy it
+- * quickly instead of incurring all kinds of overhead.
+- */
+- cmp r2, #8 /* Let's try this...*/
+- ble neon_copy_finish
+- /*
+- * This is where we'll pull out either 1, 2, or 3 bytes of data
+- * from the source as needed to align it, then store off those
+- * bytes in r4. When we read in the (now) aligned data from the
+- * source, we'll shift the bytes and AND in the r4 data, then write
+- * to the target aligned.
+- *
+- * The conditional ldr calls work slightly faster than the
+- * previous method, confirmed by our pipeline modeler.
+- */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r6-r9}
+-#else
+- push {r6-r9}
+-#endif
+- cmp r12, #2
+- ldrb r4, [r1], #1
+- ldrleb r5, [r1], #1
+- ldrltb r6, [r1], #1
+- rsb r8, r12, #4
+- sub r2, r2, r8
+- lsl r8, r8, #3
+- orrle r4, r4, r5, lsl #8
+- orrlt r4, r4, r6, lsl #16
+- rsb r9, r8, #32
+-
+- cmp r2, #64
+- blt neon_unaligned_route
+- ands r12, r0, #0xf
+- beq neon_unaligned_route
+- rsb r12, r12, #16
+-neon_16_start_u:
+- sub r2, r2, r12
+- lsrs r6, r12, #2
+-neon_align_16_4_u:
+- ldr r5, [r1], #4
+- subs r6, r6, #1
+- orr r4, r4, r5, lsl r8
+- str r4, [r0], #4
+- mov r4, r5, lsr r9
+- bne neon_align_16_4_u
+-neon_unaligned_route:
+- /* Decide which loop block to branch to.*/
+- cmp r2, #256
+- bge neon_copy_64_u
+- cmp r2, #64
+- bge neon_copy_32_u
+- b neon_copy_finish_u
+- /* Copy data in 64-byte blocks.*/
+-neon_copy_64_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+- vstmdb sp!, {q5-q8}
+-#else
+- vpush {q4}
+- vpush {q5-q8}
+-#endif
+- /* We'll need this for the q register shift later.*/
+- vdup.u32 q8, r8
+- /*
+- * As above, we determine how many times we can go through the
+- * 64-byte copy loop, then countdown.
+- */
+- mov r12, r2, lsr #6
+- and r2, r2, #0x3f
+-neon_copy_64_u_loop:
+- /* Load 64-bytes into q4-q7.*/
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- /*
+- * Shift q0-q3 right so everything but the data we need due to the
+- * alignment falls off the right-hand side. The branching
+- * is needed, since vshr requires the shift to be an immediate
+- * value.
+- */
+- lsls r5, r8, #28
+- bcc neon_copy_64_u_b8
+- bpl neon_copy_64_u_b16
+- vshr.u64 q0, q4, #40
+- vshr.u64 q1, q5, #40
+- vshr.u64 q2, q6, #40
+- vshr.u64 q3, q7, #40
+- b neon_copy_64_unify
+-neon_copy_64_u_b8:
+- vshr.u64 q0, q4, #56
+- vshr.u64 q1, q5, #56
+- vshr.u64 q2, q6, #56
+- vshr.u64 q3, q7, #56
+- b neon_copy_64_unify
+-neon_copy_64_u_b16:
+- vshr.u64 q0, q4, #48
+- vshr.u64 q1, q5, #48
+- vshr.u64 q2, q6, #48
+- vshr.u64 q3, q7, #48
+-neon_copy_64_unify:
+- /*
+- * Shift q4-q7 left by r8 bits to take the alignment into
+- * account.
+- */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q4, q8, q4
+- vshl.u64 q5, q8, q5
+- vshl.u64 q6, q8, q6
+- vshl.u64 q7, q8, q7
+-#else
+- vshl.u64 q4, q4, q8
+- vshl.u64 q5, q5, q8
+- vshl.u64 q6, q6, q8
+- vshl.u64 q7, q7, q8
+-#endif
+- /*
+- * The data in s14 will be needed for the next loop iteration. Move
+- * that to r5.
+- */
+- vmov r5, s14
+- /* We'll vorr the shifted data with the data that needs to move back.*/
+- vorr d9, d9, d0
+- /* Copy the data from the previous loop into s14.*/
+- vmov s14, r4
+- vorr d10, d10, d1
+- vorr d11, d11, d2
+- vorr d12, d12, d3
+- vorr d13, d13, d4
+- vorr d14, d14, d5
+- vorr d15, d15, d6
+- vorr d8, d8, d7
+- subs r12, r12, #1
+- pld [r1, #0]
+- pld [r1, #128]
+- /* Save off the r5 data into r4 for the next iteration.*/
+- mov r4, r5
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- pld [r0, #0]
+- pld [r0, #128]
+- bne neon_copy_64_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q5-q8}
+- vldmia sp!, {q4}
+-#else
+- vpop {q5-q8}
+- vpop {q4}
+-#endif
+- cmp r2, #32
+- bge neon_copy_32_u
+- b neon_copy_finish_u
+-neon_copy_32_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+-#else
+- vpush {q4}
+-#endif
+- vdup.u32 q4, r8
+- mov r12, r2, lsr #5
+- and r2, r2, #0x1f
+-neon_copy_32_u_loop:
+- vld1.32 {q0, q1}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_copy_32_u_b8
+- bpl neon_copy_32_u_b16
+- vshr.u64 q2, q0, #40
+- vshr.u64 q3, q1, #40
+- b neon_copy_32_unify
+-neon_copy_32_u_b8:
+- vshr.u64 q2, q0, #56
+- vshr.u64 q3, q1, #56
+- b neon_copy_32_unify
+-neon_copy_32_u_b16:
+- vshr.u64 q2, q0, #48
+- vshr.u64 q3, q1, #48
+-neon_copy_32_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q0, q4, q0
+- vshl.u64 q1, q4, q1
+-#else
+- vshl.u64 q0, q0, q4
+- vshl.u64 q1, q1, q4
+-#endif
+- vmov r5, s14
+- vorr d1, d1, d4
+- vmov s14, r4
+- vorr d2, d2, d5
+- vorr d3, d3, d6
+- vorr d0, d0, d7
+- subs r12, r12, #1
+- pld [r1, #0]
+- mov r4, r5
+- vst1.32 {q0, q1}, [r0]!
+- bne neon_copy_32_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4}
+-#else
+- vpop {q4}
+-#endif
+-neon_copy_finish_u:
+-neon_copy_16_u:
+- movs r12, r2, lsr #4
+- beq neon_copy_8_u
+- vdup.u32 q2, r8
+- and r2, r2, #0xf
+-neon_copy_16_u_loop:
+- vld1.32 {q0}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_copy_16_u_b8
+- bpl neon_copy_16_u_b16
+- vshr.u64 q1, q0, #40
+- b neon_copy_16_unify
+-neon_copy_16_u_b8:
+- vshr.u64 q1, q0, #56
+- b neon_copy_16_unify
+-neon_copy_16_u_b16:
+- vshr.u64 q1, q0, #48
+-neon_copy_16_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q0, q2, q0
+-#else
+- vshl.u64 q0, q0, q2
+-#endif
+- vmov r5, s6
+- vorr d1, d1, d2
+- vmov s6, r4
+- vorr d0, d0, d3
+- subs r12, r12, #1
+- mov r4, r5
+- vst1.32 {q0}, [r0]!
+- bne neon_copy_16_u_loop
+-neon_copy_8_u:
+- cmp r2, #8
+- blt neon_copy_4_u
+- ldm r1!, {r6-r7}
+- subs r2, r2, #8
+- orr r4, r4, r6, lsl r8
+- mov r5, r6, lsr r9
+- orr r5, r5, r7, lsl r8
+- stm r0!, {r4-r5}
+- mov r4, r7, lsr r9
+-neon_copy_4_u:
+- cmp r2, #4
+- blt neon_copy_last_bits_u
+- ldr r5, [r1], #4
+- subs r2, r2, #4
+- orr r4, r4, r5, lsl r8
+- str r4, [r0], #4
+- mov r4, r5, lsr r9
+-neon_copy_last_bits_u:
+- /*
+- * Remember, r8 contains the size of the data in r4 in bits,
+- * so to get to bytes we'll need to shift 3 places
+- */
+- lsr r8, r8, #0x3
+- /* Write out the bytes stored in r4.*/
+-neon_copy_last_bits_u_loop:
+- strb r4, [r0], #1
+- subs r8, r8, #1
+- lsrne r4, r4, #8
+- bne neon_copy_last_bits_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r6-r9}
+-#else
+- pop {r6-r9}
+-#endif
+-neon_copy_finish:
+- cmp r2, #0
+- beq neon_end
+- /*
+- * This just copies the data from source to target one byte
+- * at a time. For some small values, this makes more sense.
+- * Note that since this code copies data a byte at a time,
+- * both the aligned and unaligned paths can use it.
+- */
+-neon_copy_finish_loop:
+- ldrb r4, [r1], #1
+- subs r2, r2, #1
+- strb r4, [r0], #1
+- bne neon_copy_finish_loop
+-neon_end:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r4-r5}
+- ldmia sp!, {r0}
+-#else
+- pop {r4-r5}
+- pop {r0}
+-#endif
+- bx lr
+-
+- .endfunc
+- .end
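Before its bulk loop, the deleted neon_memcpy above nudges the destination onto a word boundary (and, for copies of 64 bytes or more, onto a 16-byte boundary) with a few single-byte stores. A rough C equivalent of that alignment prologue, purely as an illustration; word_aligned_copy is a made-up name, and the shift-and-merge path for misaligned sources is left out:

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only, not part of the deleted assembly: copy up
 * to 3 bytes so the destination becomes 4-byte aligned, move whole
 * words while the source is co-aligned, then finish byte by byte.
 * neon_memcpy instead merges shifted words when the source stays
 * misaligned, and uses NEON q registers for the large blocks. */
static void *word_aligned_copy(void *dest, const void *source, size_t n)
{
    unsigned char *d = dest;
    const unsigned char *s = source;

    while (n > 0 && ((uintptr_t) d & 0x3) != 0) {   /* lead-in bytes */
        *d++ = *s++;
        n--;
    }
    if (((uintptr_t) s & 0x3) == 0) {               /* bulk, one word at a time */
        while (n >= 4) {
            *(uint32_t *) d = *(const uint32_t *) s;
            d += 4;
            s += 4;
            n -= 4;
        }
    }
    while (n--)                                     /* tail bytes */
        *d++ = *s++;

    return dest;
}

It is called like memcpy, e.g. word_aligned_copy(dst, src, len).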
+diff --git git/src/neon_memmove.S git/src/neon_memmove.S
+deleted file mode 100644
+index 1bfe597..0000000
+--- git/src/neon_memmove.S
++++ /dev/null
+@@ -1,939 +0,0 @@
+-/***************************************************************************
+- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+-
+- Redistribution and use in source and binary forms, with or without
+- modification, are permitted provided that the following conditions are met:
+- * Redistributions of source code must retain the above copyright
+- notice, this list of conditions and the following disclaimer.
+- * Redistributions in binary form must reproduce the above copyright
+- notice, this list of conditions and the following disclaimer in the
+- documentation and/or other materials provided with the distribution.
+- * Neither the name of Code Aurora nor the names of its contributors may
+- be used to endorse or promote products derived from this software
+- without specific prior written permission.
+-
+- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+- POSSIBILITY OF SUCH DAMAGE.
+- ***************************************************************************/
+-
+-/***************************************************************************
+- * Neon memmove: Attempts to do a memmove with Neon registers if possible,
+- * Inputs:
+- * dest: The destination buffer
+- * src: The source buffer
+- * n: The size of the buffer to transfer
+- * Outputs:
+- *
+- ***************************************************************************/
+-
+-/*
+- * General note:
+- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
+- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
+- * the correct object code for VPOP, resulting in horrific stack crashes.
+- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
+- * and VPOP->VLDMIA. We can revert this back once we update our toolchain.
+- *
+- * Also, VSHL swaps the source register and the shift-amount register
+- * around in 2006-q3. I've coded this incorrectly so it turns out correct
+- * in the object code, but we'll need to undo that later...
+- */
+- .code 32
+- .align 4
+- .globl neon_memmove
+- .func
+-
+-neon_memmove:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r0}
+-#else
+- push {r0}
+-#endif
+-
+- /*
+- * The requirements for memmove state that the function should
+- * operate as if data were being copied from the source to a
+- * buffer, then to the destination. This is to allow a user
+- * to copy data from a source and target that overlap.
+- *
+- * We can't just do byte copies front-to-back automatically, since
+- * there's a good chance we may have an overlap (why else would someone
+- * intentionally use memmove then?).
+- *
+- * We'll break this into two parts. Front-to-back, or back-to-front
+- * copies.
+- */
+-neon_memmove_cmf:
+- cmp r0, r1
+- blt neon_front_to_back_copy
+- bgt neon_back_to_front_copy
+- b neon_memmove_done
+-
+- /* #############################################################
+- * Front to Back copy
+- */
+-neon_front_to_back_copy:
+- /*
+- * For small copies, just do a quick memcpy. We can do this for
+- * front-to-back copies, aligned or unaligned, since we're only
+- * doing 1 byte at a time...
+- */
+- cmp r2, #4
+- bgt neon_f2b_gt4
+- cmp r2, #0
+-neon_f2b_smallcopy_loop:
+- beq neon_memmove_done
+- ldrb r12, [r1], #1
+- subs r2, r2, #1
+- strb r12, [r0], #1
+- b neon_f2b_smallcopy_loop
+-neon_f2b_gt4:
+- /* Preload what we can...*/
+- pld [r0,#0]
+- pld [r1,#0]
+- /* The window size is in r3. */
+- sub r3, r1, r0
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r4-r6}
+-#else
+- push {r4-r6}
+-#endif
+-
+-neon_f2b_check_align:
+- /* Check alignment. */
+- ands r12, r0, #0x3
+- beq neon_f2b_source_align_check
+- cmp r12, #2
+- ldrb r4, [r1], #1
+- ldrleb r5, [r1], #1
+- ldrltb r6, [r1], #1
+- rsb r12, r12, #4
+- sub r2, r2, r12
+- strb r4, [r0], #1
+- strleb r5, [r0], #1
+- strltb r6, [r0], #1
+-
+-neon_f2b_source_align_check:
+- ands r12, r1, #0x3
+- bne neon_f2b_nonaligned
+-
+-neon_f2b_try_16_align:
+- /* If we're >64, attempt to align on 16-bytes. Smaller amounts
+- * don't seem to be worth handling. */
+- cmp r2, #64
+- blt neon_f2b_align_route
+- /* This is where we try 16-byte alignment. */
+- ands r12, r0, #0xf
+- beq neon_f2b_align_route
+- rsb r12, r12, #16
+-neon_f2b_16_start:
+- sub r2, r2, r12
+- lsrs r5, r12, #2
+-neon_f2b_align_16_4:
+- ldr r4, [r1], #4
+- subs r5, r5, #1
+- str r4, [r0], #4
+- bne neon_f2b_align_16_4
+-neon_f2b_align_route:
+- /* #############################################################
+- * Front to Back copy - aligned
+- */
+- /*
+- * Note that we can't just route based on the size in r2. If that's
+- * larger than the overlap window in r3, we could potentially
+- * (and likely!) destroy data we're copying.
+- */
+- cmp r2, r3
+- movle r12, r2
+- movgt r12, r3
+- cmp r12, #256
+- bge neon_f2b_copy_128_a
+- cmp r12, #64
+- bge neon_f2b_copy_32_a
+- cmp r12, #16
+- bge neon_f2b_copy_16_a
+- cmp r12, #8
+- bge neon_f2b_copy_8_a
+- cmp r12, #4
+- bge neon_f2b_copy_4_a
+- b neon_f2b_copy_1_a
+-neon_f2b_copy_128_a:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4-q7}
+-#else
+- vpush {q4-q7}
+-#endif
+- mov r12, r2, lsr #7
+-neon_f2b_copy_128_a_loop:
+- vld1.32 {q0,q1}, [r1]!
+- vld1.32 {q2,q3}, [r1]!
+- vld1.32 {q4,q5}, [r1]!
+- vld1.32 {q6,q7}, [r1]!
+- pld [r1, #0]
+- pld [r1, #128]
+- vst1.32 {q0,q1}, [r0]!
+- vst1.32 {q2,q3}, [r0]!
+- vst1.32 {q4,q5}, [r0]!
+- vst1.32 {q6,q7}, [r0]!
+- subs r12, r12, #1
+- pld [r0, #0]
+- pld [r0, #128]
+- bne neon_f2b_copy_128_a_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4-q7}
+-#else
+- vpop {q4-q7}
+-#endif
+- ands r2, r2, #0x7f
+- beq neon_f2b_finish
+- cmp r2, #32
+- bge neon_f2b_copy_32_a
+- b neon_f2b_copy_finish_a
+-neon_f2b_copy_32_a:
+- mov r12, r2, lsr #5
+-neon_f2b_copy_32_a_loop:
+- vld1.32 {q0,q1}, [r1]!
+- subs r12, r12, #1
+- pld [r1, #0]
+- vst1.32 {q0,q1}, [r0]!
+- bne neon_f2b_copy_32_a_loop
+- ands r2, r2, #0x1f
+- beq neon_f2b_finish
+-neon_f2b_copy_finish_a:
+-neon_f2b_copy_16_a:
+- movs r12, r2, lsr #4
+- beq neon_f2b_copy_8_a
+-neon_f2b_copy_16_a_loop:
+- vld1.32 {q0}, [r1]!
+- subs r12, r12, #1
+- vst1.32 {q0}, [r0]!
+- bne neon_f2b_copy_16_a_loop
+- ands r2, r2, #0xf
+- beq neon_f2b_finish
+-neon_f2b_copy_8_a:
+- cmp r2, #8
+- blt neon_f2b_copy_4_a
+- ldm r1!, {r4-r5}
+- subs r2, r2, #8
+- stm r0!, {r4-r5}
+-neon_f2b_copy_4_a:
+- cmp r2, #4
+- blt neon_f2b_copy_1_a
+- ldr r4, [r1], #4
+- subs r2, r2, #4
+- str r4, [r0], #4
+-neon_f2b_copy_1_a:
+- cmp r2, #0
+- beq neon_f2b_finish
+-neon_f2b_copy_1_a_loop:
+- ldrb r12, [r1], #1
+- subs r2, r2, #1
+- strb r12, [r0], #1
+- bne neon_f2b_copy_1_a_loop
+- b neon_f2b_finish
+-
+- /* #############################################################
+- * Front to Back copy - unaligned
+- */
+-neon_f2b_nonaligned:
+- /*
+- * For sizes < 8, does it really make sense to do the whole shift
+- * party? Note that we DON'T want to call neon_f2b_copy_1_u,
+- * since we'll end up trying to pop r8-r11, and we DON'T want
+- * to do that...
+- */
+- cmp r2, #8
+- ble neon_f2b_copy_1_a
+-
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r7-r9}
+-#else
+- push {r7-r9}
+-#endif
+- cmp r12, #2
+- ldrb r4, [r1], #1
+- ldrleb r5, [r1], #1
+- ldrltb r6, [r1], #1
+- rsb r8, r12, #4
+- sub r2, r2, r8
+- lsl r8, r8, #3
+- orrle r4, r4, r5, lsl #8
+- orrlt r4, r4, r6, lsl #16
+- rsb r9, r8, #32
+- /*
+- * r4 = overflow bits
+- * r8 = # of bits we copied into the r4 register to align source.
+- * r9 = 32 - r8
+- * r12 = Index counter for each size, so we determine how many times
+- * the given size will go into r2, then count down that # of
+- * times in r12.
+- */
+- cmp r2, #64
+- blt neon_f2b_unaligned_route
+- ands r12, r0, #0xf
+- beq neon_f2b_unaligned_route
+- cmp r3, #4
+- blt neon_f2b_unaligned_route
+- rsb r12, r12, #16
+-neon_f2b_16_start_u:
+- sub r2, r2, r12
+- lsrs r6, r12, #2
+-neon_f2b_align_16_4_u:
+- ldr r5, [r1], #4
+- subs r6, r6, #1
+- orr r4, r4, r5, lsl r8
+- str r4, [r0], #4
+- mov r4, r5, lsr r9
+- bne neon_f2b_align_16_4_u
+-neon_f2b_unaligned_route:
+- cmp r2, r3
+- movle r12, r2
+- movgt r12, r3
+- cmp r12, #256
+- bge neon_f2b_copy_64_u
+- cmp r12, #64
+- bge neon_f2b_copy_32_u
+- cmp r12, #16
+- bge neon_f2b_copy_16_u
+- cmp r12, #8
+- bge neon_f2b_copy_8_u
+- cmp r12, #4
+- bge neon_f2b_copy_4_u
+- b neon_f2b_last_bits_u
+-neon_f2b_copy_64_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+- vstmdb sp!, {q5-q8}
+-#else
+- vpush {q4}
+- vpush {q5-q8}
+-#endif
+- vdup.u32 q8, r8
+- mov r12, r2, lsr #6
+- and r2, r2, #0x3f
+-neon_f2b_copy_64_u_loop:
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_f2b_copy_64_u_b8
+- bpl neon_f2b_copy_64_u_b16
+- vshr.u64 q0, q4, #40
+- vshr.u64 q1, q5, #40
+- vshr.u64 q2, q6, #40
+- vshr.u64 q3, q7, #40
+- b neon_f2b_copy_64_unify
+-neon_f2b_copy_64_u_b8:
+- vshr.u64 q0, q4, #56
+- vshr.u64 q1, q5, #56
+- vshr.u64 q2, q6, #56
+- vshr.u64 q3, q7, #56
+- b neon_f2b_copy_64_unify
+-neon_f2b_copy_64_u_b16:
+- vshr.u64 q0, q4, #48
+- vshr.u64 q1, q5, #48
+- vshr.u64 q2, q6, #48
+- vshr.u64 q3, q7, #48
+-neon_f2b_copy_64_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q4, q8, q4
+- vshl.u64 q5, q8, q5
+- vshl.u64 q6, q8, q6
+- vshl.u64 q7, q8, q7
+-#else
+- vshl.u64 q4, q4, q8
+- vshl.u64 q5, q5, q8
+- vshl.u64 q6, q6, q8
+- vshl.u64 q7, q7, q8
+-#endif
+- vmov r5, s14
+- vorr d9, d9, d0
+- vmov s14, r4
+- vorr d10, d10, d1
+- vorr d11, d11, d2
+- vorr d12, d12, d3
+- vorr d13, d13, d4
+- vorr d14, d14, d5
+- vorr d15, d15, d6
+- vorr d8, d8, d7
+- subs r12, r12, #1
+- pld [r1, #0]
+- pld [r1, #128]
+- mov r4, r5
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- pld [r0, #0]
+- pld [r0, #128]
+- bne neon_f2b_copy_64_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q5-q8}
+- vldmia sp!, {q4}
+-#else
+- vpop {q5-q8}
+- vpop {q4}
+-#endif
+- cmp r2, #32
+- bge neon_f2b_copy_32_u
+- b neon_f2b_copy_finish_u
+-neon_f2b_copy_32_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+-#else
+- vpush {q4}
+-#endif
+- vdup.u32 q4, r8
+- mov r12, r2, lsr #5
+- and r2, r2, #0x1f
+-neon_f2b_copy_32_u_loop:
+- vld1.32 {q0, q1}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_f2b_copy_32_u_b8
+- bpl neon_f2b_copy_32_u_b16
+- vshr.u64 q2, q0, #40
+- vshr.u64 q3, q1, #40
+- b neon_f2b_copy_32_unify
+-neon_f2b_copy_32_u_b8:
+- vshr.u64 q2, q0, #56
+- vshr.u64 q3, q1, #56
+- b neon_f2b_copy_32_unify
+-neon_f2b_copy_32_u_b16:
+- vshr.u64 q2, q0, #48
+- vshr.u64 q3, q1, #48
+-neon_f2b_copy_32_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q0, q4, q0
+- vshl.u64 q1, q4, q1
+-#else
+- vshl.u64 q0, q0, q4
+- vshl.u64 q1, q1, q4
+-#endif
+- vmov r5, s14
+- vorr d1, d1, d4
+- vmov s14, r4
+- vorr d2, d2, d5
+- vorr d3, d3, d6
+- vorr d0, d0, d7
+- subs r12, r12, #1
+- pld [r1, #0]
+- mov r4, r5
+- vst1.32 {q0, q1}, [r0]!
+- bne neon_f2b_copy_32_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4}
+-#else
+- vpop {q4}
+-#endif
+-neon_f2b_copy_finish_u:
+-neon_f2b_copy_16_u:
+- movs r12, r2, lsr #4
+- beq neon_f2b_copy_8_u
+- vdup.u32 q2, r8
+- and r2, r2, #0xf
+-neon_f2b_copy_16_u_loop:
+- vld1.32 {q0}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_f2b_copy_16_u_b8
+- bpl neon_f2b_copy_16_u_b16
+- vshr.u64 q1, q0, #40
+- b neon_f2b_copy_16_unify
+-neon_f2b_copy_16_u_b8:
+- vshr.u64 q1, q0, #56
+- b neon_f2b_copy_16_unify
+-neon_f2b_copy_16_u_b16:
+- vshr.u64 q1, q0, #48
+-neon_f2b_copy_16_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q0, q2, q0
+-#else
+- vshl.u64 q0, q0, q2
+-#endif
+- vmov r5, s6
+- vorr d1, d1, d2
+- vmov s6, r4
+- vorr d0, d0, d3
+- subs r12, r12, #1
+- mov r4, r5
+- vst1.32 {q0}, [r0]!
+- bne neon_f2b_copy_16_u_loop
+-neon_f2b_copy_8_u:
+- cmp r2, #8
+- blt neon_f2b_copy_4_u
+- ldm r1!, {r6-r7}
+- subs r2, r2, #8
+- orr r4, r4, r6, lsl r8
+- mov r5, r6, lsr r9
+- orr r5, r5, r7, lsl r8
+- stm r0!, {r4-r5}
+- mov r4, r7, lsr r9
+-neon_f2b_copy_4_u:
+- cmp r2, #4
+- blt neon_f2b_last_bits_u
+- ldr r5, [r1], #4
+- subs r2, r2, #4
+- orr r4, r4, r5, lsl r8
+- str r4, [r0], #4
+- mov r4, r5, lsr r9
+-neon_f2b_last_bits_u:
+- lsr r8, r8, #0x3
+-neon_f2b_last_bits_u_loop:
+- strb r4, [r0], #1
+- subs r8, r8, #1
+- lsr r4, r4, #8
+- bne neon_f2b_last_bits_u_loop
+-neon_f2b_copy_1_u:
+- cmp r2, #0
+- beq neon_f2b_finish_u
+-neon_f2b_copy_1_u_loop:
+- ldrb r12, [r1], #1
+- subs r2, r2, #1
+- strb r12, [r0], #1
+- bne neon_f2b_copy_1_u_loop
+-neon_f2b_finish_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r7-r9}
+-#else
+- pop {r7-r9}
+-#endif
+- /* #############################################################
+- * Front to Back copy - finish
+- */
+-neon_f2b_finish:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r4-r6}
+-#else
+- pop {r4-r6}
+-#endif
+- b neon_memmove_done
+-
+- /* #############################################################
+- * Back to Front copy
+- */
+-neon_back_to_front_copy:
+- /*
+- * Here, we'll want to shift to the end of the buffers. This
+- * actually points us one past where we need to go, but since
+- * we'll pre-decrement throughout, this will be fine.
+- */
+- add r0, r0, r2
+- add r1, r1, r2
+- cmp r2, #4
+- bgt neon_b2f_gt4
+- cmp r2, #0
+-neon_b2f_smallcopy_loop:
+- beq neon_memmove_done
+- ldrb r12, [r1, #-1]!
+- subs r2, r2, #1
+- strb r12, [r0, #-1]!
+- b neon_b2f_smallcopy_loop
+-neon_b2f_gt4:
+- pld [r0, #0]
+- pld [r1, #0]
+- /*
+- * The minimum of the overlap window size and the copy size
+- * is in r3.
+- */
+- sub r3, r0, r1
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r4-r5}
+-#else
+- push {r4-r5}
+-#endif
+-
+- /*
+- * Check alignment. Since we'll pre-decrement as we step thru, we'll
+- * need to make sure we're on word-alignment.
+- */
+-neon_b2f_check_align:
+- ands r12, r0, #0x3
+- beq neon_b2f_source_align_check
+- sub r2, r2, r12
+-neon_b2f_shift_align:
+- ldrb r4, [r1, #-1]!
+- subs r12, r12, #1
+- strb r4, [r0, #-1]!
+- bne neon_b2f_shift_align
+-neon_b2f_source_align_check:
+- ands r4, r1, #0x3
+- bne neon_b2f_nonaligned
+-
+-neon_b2f_try_16_align:
+- /* If we're >64, attempt to align on 16-bytes. Smaller amounts
+- * don't seem to be worth handling. */
+- cmp r2, #64
+- blt neon_b2f_align_route
+- ands r12, r0, #0xf
+- beq neon_b2f_align_route
+- /* In this case, r12 has the number of bytes to roll backward. */
+-neon_b2f_16_start:
+- sub r2, r2, r12
+- lsrs r5, r12, #2
+-neon_b2f_align_16_4:
+- ldr r4, [r1, #-4]!
+- subs r5, r5, #1
+- str r4, [r0, #-4]!
+- bne neon_b2f_align_16_4
+-neon_b2f_align_route:
+- /*
+- * #############################################################
+- * Back to Front copy - aligned
+- */
+- cmp r2, r3
+- movle r12, r2
+- movgt r12, r3
+- cmp r12, #256
+- bge neon_b2f_copy_128_a
+- cmp r12, #64
+- bge neon_b2f_copy_32_a
+- cmp r12, #8
+- bge neon_b2f_copy_8_a
+- cmp r12, #4
+- bge neon_b2f_copy_4_a
+- b neon_b2f_copy_1_a
+-neon_b2f_copy_128_a:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4-q7}
+-#else
+- vpush {q4-q7}
+-#endif
+- movs r12, r2, lsr #7
+- /*
+- * This irks me. There MUST be a better way to read these in and
+- * scan the register backward instead of making it go forward. Then
+- * we need to do two subtractions...
+- */
+-neon_b2f_copy_128_a_loop:
+- sub r1, r1, #128
+- sub r0, r0, #128
+- vld1.32 {q0, q1}, [r1]!
+- vld1.32 {q2, q3}, [r1]!
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- pld [r1, #-128]
+- pld [r1, #-256]
+- vst1.32 {q0, q1}, [r0]!
+- vst1.32 {q2, q3}, [r0]!
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- subs r12, r12, #1
+- pld [r0, #-128]
+- pld [r0, #-256]
+- sub r1, r1, #128
+- sub r0, r0, #128
+- bne neon_b2f_copy_128_a_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4-q7}
+-#else
+- vpop {q4-q7}
+-#endif
+- ands r2, r2, #0x7f
+- beq neon_b2f_finish
+- cmp r2, #32
+- bge neon_b2f_copy_32_a
+- b neon_b2f_copy_finish_a
+-neon_b2f_copy_32_a:
+- mov r12, r2, lsr #5
+-neon_b2f_copy_32_a_loop:
+- sub r1, r1, #32
+- sub r0, r0, #32
+- vld1.32 {q0,q1}, [r1]
+- subs r12, r12, #1
+- vst1.32 {q0,q1}, [r0]
+- pld [r1, #0]
+- bne neon_b2f_copy_32_a_loop
+- ands r2, r2, #0x1f
+- beq neon_b2f_finish
+-neon_b2f_copy_finish_a:
+-neon_b2f_copy_8_a:
+- movs r12, r2, lsr #0x3
+- beq neon_b2f_copy_4_a
+-neon_b2f_copy_8_a_loop:
+- ldmdb r1!, {r4-r5}
+- subs r12, r12, #1
+- stmdb r0!, {r4-r5}
+- bne neon_b2f_copy_8_a_loop
+- and r2, r2, #0x7
+-neon_b2f_copy_4_a:
+- movs r12, r2, lsr #0x2
+- beq neon_b2f_copy_1_a
+- and r2, r2, #0x3
+-neon_b2f_copy_4_a_loop:
+- ldr r4, [r1, #-4]!
+- subs r12, r12, #1
+- str r4, [r0, #-4]!
+- bne neon_b2f_copy_4_a_loop
+-neon_b2f_copy_1_a:
+- cmp r2, #0
+- beq neon_b2f_finish
+-neon_b2f_copy_1_a_loop:
+- ldrb r12, [r1, #-1]!
+- subs r2, r2, #1
+- strb r12, [r0, #-1]!
+- bne neon_b2f_copy_1_a_loop
+-
+- /* #############################################################
+- * Back to Front copy - unaligned
+- */
+-neon_b2f_nonaligned:
+- /*
+- * For sizes < 8, does it really make sense to do the whole shift
+- * party?
+- */
+- cmp r2, #8
+- ble neon_b2f_copy_1_a
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r6-r11}
+-#else
+- push {r6-r11}
+-#endif
+- /*
+- * r3 = max window size
+- * r4 = overflow bytes
+- * r5 = bytes we're reading into
+- * r6 = # bytes we're off.
+- * r10 = copy of r6
+- */
+- and r6, r1, #0x3
+- eor r4, r4, r4
+- mov r10, r6
+-neon_b2f_realign:
+- ldrb r5, [r1, #-1]!
+- subs r6, r6, #1
+- orr r4, r5, r4, lsl #8
+- bne neon_b2f_realign
+- /*
+- * r10 = # of bits we copied into the r4 register to align source.
+- * r11 = 32 - r10
+- * r12 = Index counter for each size, so we determine how many times
+- * the given size will go into r2, then count down that # of
+- * times in r12.
+- */
+- sub r2, r2, r10
+- lsl r10, r10, #0x3
+- rsb r11, r10, #32
+-
+- cmp r2, r3
+- movle r12, r2
+- movgt r12, r3
+- cmp r12, #256
+- bge neon_b2f_copy_64_u
+- cmp r12, #64
+- bge neon_b2f_copy_32_u
+- cmp r12, #8
+- bge neon_b2f_copy_8_u
+- cmp r12, #4
+- bge neon_b2f_copy_4_u
+- b neon_b2f_last_bits_u
+-neon_b2f_copy_64_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4,q5}
+- vstmdb sp!, {q6-q8}
+-#else
+- vpush {q4,q5}
+- vpush {q6-q8}
+-#endif
+- add r7, r11, #32
+- movs r12, r2, lsr #6
+- vdup.u32 q8, r7
+-neon_b2f_copy_64_u_loop:
+- sub r1, r1, #64
+- sub r0, r0, #64
+- vld1.32 {q0, q1}, [r1]!
+- vld1.32 {q2, q3}, [r1]
+- sub r1, r1, #32
+- vmov q4, q0
+- vmov q5, q1
+- vmov q6, q2
+- vmov q7, q3
+- vmov r5, s0
+- mov r4, r4, lsl r11
+- lsls r6, r10, #28
+- bcc neon_b2f_copy_64_u_b8
+- bpl neon_b2f_copy_64_u_b16
+- vshr.u64 q0, q0, #24
+- vshr.u64 q1, q1, #24
+- vshr.u64 q2, q2, #24
+- vshr.u64 q3, q3, #24
+- b neon_b2f_copy_64_unify
+-neon_b2f_copy_64_u_b8:
+- vshr.u64 q0, q0, #8
+- vshr.u64 q1, q1, #8
+- vshr.u64 q2, q2, #8
+- vshr.u64 q3, q3, #8
+- b neon_b2f_copy_64_unify
+-neon_b2f_copy_64_u_b16:
+- vshr.u64 q0, q0, #16
+- vshr.u64 q1, q1, #16
+- vshr.u64 q2, q2, #16
+- vshr.u64 q3, q3, #16
+-neon_b2f_copy_64_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q4, q8, q4
+- vshl.u64 q5, q8, q5
+- vshl.u64 q6, q8, q6
+- vshl.u64 q7, q8, q7
+-#else
+- vshl.u64 q4, q4, q8
+- vshl.u64 q5, q5, q8
+- vshl.u64 q6, q6, q8
+- vshl.u64 q7, q7, q8
+-#endif
+- vmov s17, r4
+- vorr d7, d7, d8
+- vorr d6, d6, d15
+- vorr d5, d5, d14
+- vorr d4, d4, d13
+- vorr d3, d3, d12
+- vorr d2, d2, d11
+- vorr d1, d1, d10
+- vorr d0, d0, d9
+- mov r4, r5, lsl r11
+- subs r12, r12, #1
+- lsr r4, r4, r11
+- vst1.32 {q0, q1}, [r0]!
+- vst1.32 {q2, q3}, [r0]
+- pld [r1, #0]
+- sub r0, r0, #32
+- bne neon_b2f_copy_64_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q6-q8}
+- vldmia sp!, {q4,q5}
+-#else
+- vpop {q6-q8}
+- vpop {q4,q5}
+-#endif
+- ands r2, r2, #0x3f
+- cmp r2, #32
+- bge neon_b2f_copy_32_u
+- b neon_b2f_copy_finish_u
+-neon_b2f_copy_32_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+-#else
+- vpush {q4}
+-#endif
+- add r7, r11, #32
+- movs r12, r2, lsr #5
+- vdup.u32 q4, r7
+- and r2, r2, #0x1f
+-neon_b2f_copy_32_u_loop:
+- sub r1, r1, #32
+- sub r0, r0, #32
+- vld1.32 {q0, q1}, [r1]
+- vmov q2, q0
+- vmov q3, q1
+- vmov r5, s0
+- mov r4, r4, lsl r11
+- lsls r6, r10, #28
+- bcc neon_b2f_copy_32_u_b8
+- bpl neon_b2f_copy_32_u_b16
+- vshr.u64 q0, q0, #24
+- vshr.u64 q1, q1, #24
+- b neon_b2f_copy_32_unify
+-neon_b2f_copy_32_u_b8:
+- vshr.u64 q0, q0, #8
+- vshr.u64 q1, q1, #8
+- b neon_b2f_copy_32_unify
+-neon_b2f_copy_32_u_b16:
+- vshr.u64 q0, q0, #16
+- vshr.u64 q1, q1, #16
+-neon_b2f_copy_32_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q2, q4, q2
+- vshl.u64 q3, q4, q3
+-#else
+- vshl.u64 q2, q2, q4
+- vshl.u64 q3, q3, q4
+-#endif
+- vmov s9, r4
+- vorr d3, d3, d4
+- vorr d2, d2, d7
+- vorr d1, d1, d6
+- vorr d0, d0, d5
+- mov r4, r5, lsl r11
+- subs r12, r12, #1
+- lsr r4, r4, r11
+- vst1.32 {q0, q1}, [r0]
+- pld [r1, #0]
+- bne neon_b2f_copy_32_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4}
+-#else
+- vpop {q4}
+-#endif
+-neon_b2f_copy_finish_u:
+-neon_b2f_copy_8_u:
+- movs r12, r2, lsr #0x3
+- beq neon_b2f_copy_4_u
+- mov r5, r4, lsl r11
+-neon_b2f_copy_8_u_loop:
+- ldmdb r1!, {r6-r7}
+- subs r12, r12, #1
+- orr r5, r5, r7, lsr r10
+- mov r4, r7, lsl r11
+- orr r4, r4, r6, lsr r10
+- stmdb r0!, {r4-r5}
+- mov r4, r6, lsl r11
+- lsr r4, r4, r11
+- mov r5, r4, lsl r11
+- bne neon_b2f_copy_8_u_loop
+- ands r2, r2, #0x7
+-neon_b2f_copy_4_u:
+- movs r12, r2, lsr #0x2
+- beq neon_b2f_last_bits_u
+- mov r5, r4, lsl r11
+-neon_b2f_copy_4_u_loop:
+- ldr r6, [r1, #-4]!
+- subs r12, r12, #1
+- orr r5, r5, r6, lsr r10
+- str r5, [r0, #-4]!
+- mov r4, r6, lsl r11
+- lsr r4, r4, r11
+- mov r5, r4, lsl r11
+- bne neon_b2f_copy_4_u_loop
+- and r2, r2, #0x3
+-neon_b2f_last_bits_u:
+-neon_b2f_last_bits_u_loop:
+- subs r10, r10, #8
+- mov r5, r4, lsr r10
+- strb r5, [r0, #-1]!
+- bne neon_b2f_last_bits_u_loop
+-neon_b2f_copy_1_u:
+- cmp r2, #0
+- beq neon_b2f_finish_u
+-neon_b2f_copy_1_u_loop:
+- ldrb r12, [r1, #-1]!
+- subs r2, r2, #1
+- strb r12, [r0, #-1]!
+- bne neon_b2f_copy_1_u_loop
+-neon_b2f_finish_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r6-r11}
+-#else
+- pop {r6-r11}
+-#endif
+-
+-neon_b2f_finish:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r4-r5}
+-#else
+- pop {r4-r5}
+-#endif
+-
+-neon_memmove_done:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r0}
+-#else
+- pop {r0}
+-#endif
+- bx lr
+-
+- .endfunc
+- .end
+diff --git git/src/neon_memsets.c git/src/neon_memsets.c
+deleted file mode 100755
+index 740fc1e..0000000
+--- git/src/neon_memsets.c
++++ /dev/null
+@@ -1,169 +0,0 @@
+-/* neon_memsets.c
+- *
+- * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+- *
+- * Redistribution and use in source and binary forms, with or without
+- * modification, are permitted provided that the following conditions are met:
+- * * Redistributions of source code must retain the above copyright
+- * notice, this list of conditions and the following disclaimer.
+- * * Redistributions in binary form must reproduce the above copyright
+- * notice, this list of conditions and the following disclaimer in the
+- * documentation and/or other materials provided with the distribution.
+- * * Neither the name of Code Aurora nor
+- * the names of its contributors may be used to endorse or promote
+- * products derived from this software without specific prior written
+- * permission.
+- *
+- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+- * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+- * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+- */
+-
+-#include "msm-swblits.h"
+-
+-void memset16(uint16_t dst[], uint16_t value, int count)
+-{
+- if (count <= 0)
+- return;
+-
+- asm volatile(
+- " pld [%[dst], #0] \n"
+- " cmp %[count], #4 \n"
+- " blt 6f \n"
+- " tst %[dst], #0x3 \n"
+- " strneh %[value], [%[dst]], #2 \n"
+- " subne %[count], %[count], #1 \n"
+- " vdup.u16 q8, %[value] \n"
+- " vmov q9, q8 \n"
+- " cmp %[count], #64 \n"
+- " bge 0f \n"
+- " cmp %[count], #32 \n"
+- " bge 2f \n"
+- " cmp %[count], #16 \n"
+- " bge 3f \n"
+- " cmp %[count], #8 \n"
+- " bge 4f \n"
+- " b 5f \n"
+- "0: \n"
+- " mov r12, %[count], lsr #6 \n"
+- "1: \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " subs r12, r12, #1 \n"
+- " bne 1b \n"
+- " ands %[count], %[count], #0x3f \n"
+- " beq 7f \n"
+- "2: \n"
+- " cmp %[count], #32 \n"
+- " blt 3f \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " subs %[count], %[count], #32 \n"
+- " beq 7f \n"
+- "3: \n"
+- " cmp %[count], #16 \n"
+- " blt 4f \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " subs %[count], %[count], #16 \n"
+- " beq 7f \n"
+- "4: \n"
+- " cmp %[count], #8 \n"
+- " blt 5f \n"
+- " vst1.16 {q8}, [%[dst]]! \n"
+- " subs %[count], %[count], #8 \n"
+- " beq 7f \n"
+- "5: \n"
+- " cmp %[count], #4 \n"
+- " blt 6f \n"
+- " vst1.16 {d16}, [%[dst]]! \n"
+- " subs %[count], %[count], #4 \n"
+- " beq 7f \n"
+- "6: \n"
+- " cmp %[count], #0 \n"
+- " blt 7f \n"
+- " lsls %[count], #31 \n"
+- " strmih %[value], [%[dst]], #2 \n"
+- " strcsh %[value], [%[dst]], #2 \n"
+- " strcsh %[value], [%[dst]], #2 \n"
+- "7: \n"
+- // Clobbered input registers
+- : [dst] "+r" (dst), [count] "+r" (count)
+- // Unclobbered input
+- : [value] "r" (value)
+- // Clobbered registers
+- : "q8", "q9", "r12", "cc", "memory"
+- );
+-}
+-
+-void memset32(uint32_t dst[], uint32_t value, int count)
+-{
+- asm volatile(
+- " pld [%[dst], #0] \n"
+- " cmp %[count], #4 \n"
+- " blt 5f \n"
+- " vdup.u32 q8, %[value] \n"
+- " vmov q9, q8 \n"
+- " cmp %[count], #32 \n"
+- " bge 0f \n"
+- " cmp %[count], #16 \n"
+- " bge 2f \n"
+- " cmp %[count], #8 \n"
+- " bge 3f \n"
+- " b 4f \n"
+- "0: \n"
+- " mov r12, %[count], lsr #5 \n"
+- "1: \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " pld [%[dst], #0] \n"
+- " subs r12, r12, #1 \n"
+- " bne 1b \n"
+- " ands %[count], %[count], #0x1f \n"
+- " beq 6f \n"
+- "2: \n"
+- " cmp %[count], #16 \n"
+- " blt 3f \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " subs %[count], %[count], #16 \n"
+- " beq 6f \n"
+- "3: \n"
+- " cmp %[count], #8 \n"
+- " blt 4f \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " subs %[count], %[count], #8 \n"
+- " beq 6f \n"
+- "4: \n"
+- " cmp %[count], #4 \n"
+- " blt 5f \n"
+- " vst1.32 {q8}, [%[dst]]! \n"
+- " subs %[count], %[count], #4 \n"
+- " beq 6f \n"
+- "5: \n"
+- " cmp %[count], #0 \n"
+- " beq 6f \n"
+- " lsls %[count], #31 \n"
+- " strmi %[value], [%[dst]], #4 \n"
+- " strcs %[value], [%[dst]], #4 \n"
+- " strcs %[value], [%[dst]], #4 \n"
+- "6: @end \n"
+- // Clobbered input registers
+- : [dst] "+r" (dst), [count] "+r" (count)
+- // Unclobbered input
+- : [value] "r" (value)
+- // Clobbered registers
+- : "q8", "q9", "r12", "cc", "memory"
+- );
+-}
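For reference, the memset16() and memset32() routines deleted above do nothing more than fill a run of count 16-bit or 32-bit pixels with a constant value. A minimal plain-C sketch of that behaviour (illustrative only, not part of the patch; the _c-suffixed names are made up here) would be:

    #include <stdint.h>

    /* Fill count 16-bit pixels with value. */
    static void memset16_c(uint16_t *dst, uint16_t value, int count)
    {
        while (count-- > 0)
            *dst++ = value;
    }

    /* Fill count 32-bit pixels with value. */
    static void memset32_c(uint32_t *dst, uint32_t value, int count)
    {
        while (count-- > 0)
            *dst++ = value;
    }
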
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch
new file mode 100644
index 0000000000..97ad380e27
--- /dev/null
+++ b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch
@@ -0,0 +1,36 @@
+commit 18515a56822fcd9c0a71240edce97ea5623b0448
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date: Wed Feb 10 16:29:55 2010 +0100
+
+ Modify Makefile.am
+ Removed dependencies on neon
+
+diff --git git/src/Makefile.am git/src/Makefile.am
+index 8ab1856..08da5a5 100755
+--- a/src/Makefile.am
++++ b/src/Makefile.am
+@@ -12,13 +12,7 @@ MSM_DRI_SRCS += msm-drm.c msm-dri2.c
+ msm_drv_la_LIBADD += $(DRI2_LIBS)
+ endif
+
+-NEON_CFLAGS=-march=armv7-a -mfpu=neon -mfloat-abi=softfp
+-NEON_CCASFLAGS=$(NEON_CFLAGS) -mthumb-interwork
+-NEON_ASFLAGS=-k -mcpu=cortex-a8 $(NEON_CCASFLAGS)
+-
+-AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ $(NEON_CFLAGS) -Wall -Werror
+-AM_ASFLAGS = $(NEON_ASFLAGS)
+-AM_CCASFLAGS = $(NEON_CCASFLAGS)
++AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ -Wall -Werror
+
+ msm_drv_la_LTLIBRARIES = msm_drv.la
+ msm_drv_la_LDFLAGS = -module -avoid-version
+@@ -37,9 +31,6 @@ msm_drv_la_SOURCES = \
+ msm-swfill.c \
+ msm-hwrender.c \
+ msm-pixmap.c \
+- neon_memsets.c \
+- neon_memcpy.S \
+- neon_memmove.S \
+ $(MSM_DRI_SRCS)
+
+
diff --git a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch
new file mode 100644
index 0000000000..90dd31f605
--- /dev/null
+++ b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch
@@ -0,0 +1,116 @@
+commit cc83ba5835d5b55347fd0c0775156493b0cf3a15
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date: Thu Feb 11 16:26:52 2010 +0100
+
+ Renaming variables to get Xorg (xf86-video-msm) working
+ under linux-leviathan (htcdream):
+ cd src
+ sed 's/fixed_info/fix/' -i *.h
+ sed 's/fixed_info/fix/' -i *.c
+
+diff --git git/src/msm-dri.c git/src/msm-dri.c
+index a51d3bd..a74368b 100644
+--- git/src/msm-dri.c
++++ git/src/msm-dri.c
+@@ -151,10 +151,10 @@ MSMDRIScreenInit(ScreenPtr pScreen)
+ pDRIInfo->ddxDriverMinorVersion = 0;
+ pDRIInfo->ddxDriverPatchVersion = 0;
+
+- pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fixed_info.smem_start;
++ pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fix.smem_start;
+
+- pDRIInfo->frameBufferSize = pMsm->fixed_info.smem_len;
+- pDRIInfo->frameBufferStride = pMsm->fixed_info.line_length;
++ pDRIInfo->frameBufferSize = pMsm->fix.smem_len;
++ pDRIInfo->frameBufferStride = pMsm->fix.line_length;
+
+ /* FIXME: How many drawables can we do (should we do)? */
+
+diff --git git/src/msm-driver.c git/src/msm-driver.c
+index 803197f..15378f8 100755
+--- git/src/msm-driver.c
++++ git/src/msm-driver.c
+@@ -399,7 +399,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+
+ /* Get the fixed info (par) structure */
+
+- if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fixed_info)) {
++ if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fix)) {
+ xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+ "Unable to read hardware info from %s: %s\n",
+ dev, strerror(errno));
+@@ -410,7 +410,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+ /* Parse the ID and figure out what version of the MDP and what
+ * panel ID we have */
+
+- if (sscanf(pMsm->fixed_info.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
++ if (sscanf(pMsm->fix.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
+
+ xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+ "Unable to determine the MDP and panel type\n");
+@@ -435,7 +435,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+ * the fbdev driver to allocate memory. In the mean time, we
+ * just reuse the framebuffer memory */
+
+- pScrn->videoRam = pMsm->fixed_info.smem_len;
++ pScrn->videoRam = pMsm->fix.smem_len;
+
+ /* Get the current screen setting */
+ if (ioctl(pMsm->fd, FBIOGET_VSCREENINFO, &pMsm->mode_info)) {
+@@ -671,8 +671,8 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+ /* The framebuffer driver should always report the line length,
+ * but in case it doesn't, we can calculate it ourselves */
+
+- if (pMsm->fixed_info.line_length) {
+- pScrn->displayWidth = pMsm->fixed_info.line_length;
++ if (pMsm->fix.line_length) {
++ pScrn->displayWidth = pMsm->fix.line_length;
+ } else {
+ pScrn->displayWidth = pMsm->mode_info.xres_virtual *
+ pMsm->mode_info.bits_per_pixel / 8;
+@@ -811,7 +811,7 @@ MSMCloseScreen(int scrnIndex, ScreenPtr pScreen)
+ #endif
+
+ /* Unmap the framebuffer memory */
+- munmap(pMsm->fbmem, pMsm->fixed_info.smem_len);
++ munmap(pMsm->fbmem, pMsm->fix.smem_len);
+
+ pScreen->CloseScreen = pMsm->CloseScreen;
+
+@@ -857,7 +857,7 @@ MSMScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
+ #endif // defined (MSMFB_GET_PAGE_PROTECTION) && defined (MSMFB_SET_PAGE_PROTECTION)
+
+ /* Map the framebuffer memory */
+- pMsm->fbmem = mmap(NULL, pMsm->fixed_info.smem_len,
++ pMsm->fbmem = mmap(NULL, pMsm->fix.smem_len,
+ PROT_READ | PROT_WRITE, MAP_SHARED, pMsm->fd, 0);
+
+ /* If we can't map the memory, then this is a short trip */
+diff --git git/src/msm-exa.c git/src/msm-exa.c
+index 301923f..ce16a93 100755
+--- git/src/msm-exa.c
++++ git/src/msm-exa.c
+@@ -740,8 +740,8 @@ MSMSetupExa(ScreenPtr pScreen)
+ pExa->flags = EXA_OFFSCREEN_PIXMAPS;
+
+ pExa->offScreenBase =
+- (pMsm->fixed_info.line_length * pMsm->mode_info.yres);
+- pExa->memorySize = pMsm->fixed_info.smem_len;
++ (pMsm->fix.line_length * pMsm->mode_info.yres);
++ pExa->memorySize = pMsm->fix.smem_len;
+
+ /* Align pixmap offsets along page boundaries */
+ pExa->pixmapOffsetAlign = 4096;
+diff --git git/src/msm.h git/src/msm.h
+index e1e2bc7..520d390 100755
+--- git/src/msm.h
++++ git/src/msm.h
+@@ -85,7 +85,7 @@ typedef struct _MSMRec
+ int fd;
+
+ /* Fixed and var strutures from the framebuffer */
+- struct fb_fix_screeninfo fixed_info;
++ struct fb_fix_screeninfo fix;
+ struct fb_var_screeninfo mode_info;
+
+ /* Pointer to the mapped framebuffer memory */
diff --git a/recipes/xorg-driver/xf86-video-msm_git.bb b/recipes/xorg-driver/xf86-video-msm_git.bb
index faccb87e35..4723b867f0 100644
--- a/recipes/xorg-driver/xf86-video-msm_git.bb
+++ b/recipes/xorg-driver/xf86-video-msm_git.bb
@@ -8,9 +8,15 @@ SRCREV = "5f7df59155ae301a3ebc40aec22ed16d203cb5fc"
PV = "1.1.0+${PR}+gitr${SRCREV}"
PE = "1"
-SRC_URI = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git\
- "
+SRC_URI = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git"
+SRC_URI_htcdream = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git \
+ file://no_neon.patch;patch=1 \
+ file://no_neon_flags.patch;patch=1 \
+ file://renaming_variables.patch;patch=1"
S = "${WORKDIR}/git"
CFLAGS += " -I${STAGING_INCDIR}/xorg "
+CFLAGS += " -Wno-error "
+
+ARM_INSTRUCTION_SET="arm"