author    David Lanzendörfer <david.lanzendoerfer@o2s.ch>  2010-03-17 21:25:38 +0100
committer Lukas Gorris <lukas.gorris@gmail.com>            2010-03-17 21:25:38 +0100
commit    d695f337bdaa0e297ad89c6fdd99edf97bc270db (patch)
tree      23080a6660c4a5e76c82ef47b587e4c304bdf43b /recipes/xorg-driver
parent    6c5f9d4325be253fb3cb9b6d3bec11f7a7a13562 (diff)
download  openembedded-d695f337bdaa0e297ad89c6fdd99edf97bc270db.tar.gz
xf86-video-msm: fix build errors
Diffstat (limited to 'recipes/xorg-driver')
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/no_neon.patch             2901
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch         36
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/renaming_variables.patch   116
-rw-r--r--  recipes/xorg-driver/xf86-video-msm_git.bb                       10
4 files changed, 3061 insertions, 2 deletions
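
The patch below strips the ARM NEON intrinsics and the NEON-backed memcpy/memset helpers out of the msm driver's software blit and fill paths, falling back to plain C and the standard memset(), so the driver stops hitting "Illegal instruction" on kernels built without NEON support. As a rough illustration only (not code from this patch; the function name and signature are hypothetical), a scalar 16-bit fill of the kind the disabled memset16() helper provided can be written without any NEON at all:

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative scalar fallback for a 16-bit fill: no NEON
     * instructions, so it is safe on CPUs/kernels without NEON. */
    static void fill16_scalar(uint16_t *dst, uint16_t value, size_t count)
    {
        while (count--)
            *dst++ = value;
    }

Note that the standard memset() writes individual bytes rather than 16-bit values, which is why NEON-enabled drivers usually carry a dedicated memset16()-style helper; a scalar loop like the sketch above is the usual plain-C stand-in when the optimized version cannot be used.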
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
new file mode 100644
index 0000000000..c0aa92e76a
--- /dev/null
+++ b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
@@ -0,0 +1,2901 @@
+commit d8910bf773fbecf7cdea359d4b530a3672e27180
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date: Wed Feb 10 16:18:39 2010 +0100
+
+ Removed neon because it's not available in our kernel
+ and so it's causing trouble (Illegal instruction)
+
+diff --git git/src/msm-swblits.h git/src/msm-swblits.h
+index f89f00e..a40b24b 100755
+--- git/src/msm-swblits.h
++++ git/src/msm-swblits.h
+@@ -38,16 +38,6 @@
+ #include <stdint.h>
+ #include <stdlib.h>
+
+-/* Neon intrinsics are part of the ARM or GCC compiler used. */
+-/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */
+-#include <arm_neon.h>
+-
+-/* These are NEON-optimized functions linked to by various tests. */
+-extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes);
+-extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes);
+-extern void memset16(uint16_t *dst, uint16_t value, int count);
+-extern void memset32(uint32_t *dst, uint32_t value, int count);
+-
+ /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */
+ #define BITS_PER_BYTE (8)
+ #define BYTES_PER_16BPP_PIXEL (2)
+diff --git git/src/msm-swfill.c git/src/msm-swfill.c
+index 108fd94..3dd1ef2 100755
+--- git/src/msm-swfill.c
++++ git/src/msm-swfill.c
+@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou
+ }
+ }
+
+-
++/*
+ static inline void
+ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
+ {
+@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
+ // Quickly fill remaining pixels (up to 7).
+ memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
+ }
+-
++*/
+
+ static inline void
+ memset16_Test(uint16_t *dst, uint16_t src, int count)
+@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count)
+
+ // Copy remaining pixels using Neon and non-Neon instructions.
+ // NOTE: This assumes that dst is aligned optimally for Neon instructions.
+- memset16_AssumesNeonAlignment((void *) dst, src, count);
++ //memset16_AssumesNeonAlignment((void *) dst, src, count);
++ memset((void *) dst, src, count);
+ }
+ }
+
+@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp
+ if (w < 32) {
+ // For narrow rectangles, block signals only once for the entire rectangles.
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else {
+ // For wider rectangles, block and unblock signals for every row.
+- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ }
+
+diff --git git/src/msm-swrender.c git/src/msm-swrender.c
+index a7a9abc..835dc03 100755
+--- git/src/msm-swrender.c
++++ git/src/msm-swrender.c
+@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src
+ }
+ }
+ break;
+- case 7: if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
+- return TRUE;
+- } else {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
+- return TRUE;
+- }
+- break;
+- case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
+- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
+- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
+- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
+- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
+- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
+- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
+- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
+- return TRUE;
+- }
+- else {
+- uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
+- uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
+- uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
+- uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T);
+- uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T);
+- uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T);
+- uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T);
+- uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T);
+- *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1;
+- *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2;
+- *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3;
+- *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4;
+- *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5;
+- *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6;
+- *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7;
+- *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8;
+- return TRUE;
+- }
+- break;
+- case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
+- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
+- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
+- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
+- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
+- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
+- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
+- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
+- uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T);
+- uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T);
+- uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T);
+- uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
+- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
+- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
+- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
+- *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5;
+- *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6;
+- *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7;
+- *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8;
+- return TRUE;
+- }
+- else {
+- // Don't bother unrolling loops here, since that won't help for more than around 8 operations.
+- // Instead, just call multiple fixed functions.
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
+- } else {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
+- }
+- return TRUE;
+- }
+- break;
+- case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
+- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
+- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
+- uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T);
+- uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T);
+- uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T);
+- uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
+- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
+- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
+- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
+- *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5;
+- *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6;
+- *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7;
+- *(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8;
+- return TRUE;
+- }
+- break;
+- case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T));
+- uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T));
+- uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T));
+- uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
+- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
+- vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5);
+- vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6);
+- vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7);
+- vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8);
+- return TRUE;
+- }
+- break;
+ }
+
+ return FALSE;
+@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr
+ }
+ return TRUE;
+ break;
+- case 7: if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- break;
+- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- *(uint16_t *) (dst+0*dpitch+0) = src1a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
+- *(uint16_t *) (dst+1*dpitch+0) = src1b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
+- return TRUE;
+- }
+- else {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
+- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a;
+- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a;
+- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a;
+- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a;
+- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a;
+- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a;
+- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a;
+- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b;
+- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b;
+- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b;
+- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b;
+- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b;
+- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b;
+- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b;
+- return TRUE;
+- }
+- break;
+- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- *(uint32_t *) (dst+0*dpitch+0) = src1a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a;
+- *(uint32_t *) (dst+1*dpitch+0) = src1b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a;
+- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a;
+- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a;
+- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b;
+- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b;
+- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b;
+- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b;
+- return TRUE;
+- }
+- else {
+- // Don't bother unrolling loops, since that won't help for more than around 8 operations.
+- // Instead, just call multiple fixed functions.
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T);
+- uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T);
+- uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T);
+- uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T);
+- uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T);
+- uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T);
+- uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
+- *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a;
+- *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a;
+- *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a;
+- *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
+- *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b;
+- *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b;
+- *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b;
+- *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+- case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T));
+- uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T));
+- uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T));
+- uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T));
+- uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T));
+- uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T));
+- uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b);
+- return TRUE;
+- }//HERE
+- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+ }
+
+ return FALSE;
+@@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr
+ }
+ return TRUE;
+ break;
+- case 7: if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- break;
+- // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons?
+- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases...
+- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c;
+- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c;
+- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d;
+- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c;
+- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c;
+- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c;
+- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c;
+- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d;
+- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d;
+- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d;
+- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
+- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
+- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- *(uint16_t *) (dst+0*dpitch+0) = src1a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
+- *(uint16_t *) (dst+1*dpitch+0) = src1b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
+- *(uint16_t *) (dst+2*dpitch+0) = src1c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c;
+- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c;
+- *(uint16_t *) (dst+3*dpitch+0) = src1d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d;
+- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d;
+- return TRUE;
+- }
+- else {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
+- uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T);
+- uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T);
+- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a;
+- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a;
+- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a;
+- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a;
+- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a;
+- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a;
+- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a;
+- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b;
+- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b;
+- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b;
+- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b;
+- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b;
+- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b;
+- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b;
+- *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c;
+- *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c;
+- *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c;
+- *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T) = src4c;
+- *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T) = src5c;
+- *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T) = src6c;
+- *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T) = src7c;
+- *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T) = src8c;
+- *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d;
+- *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d;
+- *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d;
+- *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T) = src4d;
+- *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T) = src5d;
+- *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T) = src6d;
+- *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T) = src7d;
+- *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T) = src8d;
+- return TRUE;
+- }
+- break;
+- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3d = *(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
+- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c;
+- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c;
+- *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T) = src3c;
+- *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T) = src4c;
+- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d;
+- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d;
+- *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T) = src3d;
+- *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T) = src4d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- uint32_t src1c = *(uint32_t *) (src+2*spitch+0);
+- uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- uint32_t src1d = *(uint32_t *) (src+3*spitch+0);
+- uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- *(uint32_t *) (dst+0*dpitch+0) = src1a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a;
+- *(uint32_t *) (dst+1*dpitch+0) = src1b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b;
+- *(uint32_t *) (dst+2*dpitch+0) = src1c;
+- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2c;
+- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3c;
+- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5c;
+- *(uint32_t *) (dst+3*dpitch+0) = src1d;
+- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2d;
+- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3d;
+- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
+- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T);
+- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a;
+- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a;
+- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a;
+- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b;
+- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b;
+- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b;
+- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b;
+- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c;
+- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c;
+- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c;
+- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c;
+- *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T) = src5c;
+- *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T) = src6c;
+- *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T) = src7c;
+- *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T) = src8c;
+- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d;
+- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d;
+- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d;
+- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d;
+- *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T) = src5d;
+- *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T) = src6d;
+- *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T) = src7d;
+- *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T) = src8d;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+- uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+- uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+- uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+- uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+- uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+- uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+- uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
+- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+- uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+- uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+- uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+- uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
+- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+- uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+- uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+- uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+- *(uint16_t *) (dst+0*dpitch+0) = src1a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8a;
+- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9a;
+- *(uint16_t *) (dst+1*dpitch+0) = src1b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8b;
+- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9b;
+- *(uint16_t *) (dst+2*dpitch+0) = src1c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7c;
+- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8c;
+- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9c;
+- *(uint16_t *) (dst+3*dpitch+0) = src1d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7d;
+- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8d;
+- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9d;
+- return TRUE;
+- }
+- else {
+- // Don't bother unrolling loops, since that won't help for more than around 8 operations.
+- // Instead, just call multiple fixed functions.
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+- // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons?
+- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here...
+- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c);
+- vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d);
+- vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d);
+- return TRUE;
+- }
+- break;
+- }
++ }
+
+ return FALSE;
+ }
+@@ -1924,10 +872,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
+ if (rowsOverlap)
+ {
+ if (w > 64) {
+- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ else if (w == 64) {
+- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ else {
+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
+@@ -1936,10 +886,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
+ else
+ {
+ if (w > 64) {
+- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ else if (w == 64) {
+- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ else {
+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
+@@ -1973,7 +925,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
+ if (xdir >= 0 || !rowsOverlap) {
+ if (w >= 128) {
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- neon_memcpy(dst, src, w);
++ //neon_memcpy(dst, src, w);
++ memcpy(dst, src, w);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else
+@@ -1982,7 +935,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
+ else {
+ if (w >= 128) {
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- neon_memmove(dst, src, w);
++ //neon_memmove(dst, src, w);
++ memmove(dst, src, w);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else
+@@ -2029,7 +983,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
+ if (xdir >= 0 || !rowsOverlap) {
+ if (w >= 42) {
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
++ //neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
++ memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else
+@@ -2038,7 +993,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
+ else {
+ if (w >= 42) {
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
++ //neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
++ memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else
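With the NEON helpers commented out in the hunks above, every wide row copy in the driver falls back to plain libc memcpy (rows cannot overlap) or memmove (rows may overlap), still wrapped in the same signal-blocking macros. A minimal sketch of that fallback pattern in C, assuming the hypothetical names copy_rect_rows and rows_overlap in place of the driver's DRAW_MULTIPLE_ROWS_* macros:

#include <stddef.h>
#include <string.h>

/* Illustrative sketch only, not part of no_neon.patch: one row copied
 * per iteration, with memmove chosen whenever source and destination
 * rows can overlap and memcpy otherwise.  The real driver additionally
 * walks rows in the opposite order for negative-y copies. */
static void copy_rect_rows(unsigned char *dst, const unsigned char *src,
                           size_t width_bytes, int h,
                           int dpitch, int spitch, int rows_overlap)
{
    int y;
    for (y = 0; y < h; y++) {
        if (rows_overlap)
            memmove(dst, src, width_bytes);
        else
            memcpy(dst, src, width_bytes);
        dst += dpitch;
        src += spitch;
    }
}

memmove tolerates overlapping buffers at the cost of an internal direction check, which is why the driver only selects it when the rows actually overlap.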
+diff --git git/src/neon_memcpy.S git/src/neon_memcpy.S
+deleted file mode 100644
+index 5ecc5ce..0000000
+--- git/src/neon_memcpy.S
++++ /dev/null
+@@ -1,549 +0,0 @@
+-/***************************************************************************
+- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+-
+- Redistribution and use in source and binary forms, with or without
+- modification, are permitted provided that the following conditions are met:
+- * Redistributions of source code must retain the above copyright
+- notice, this list of conditions and the following disclaimer.
+- * Redistributions in binary form must reproduce the above copyright
+- notice, this list of conditions and the following disclaimer in the
+- documentation and/or other materials provided with the distribution.
+- * Neither the name of Code Aurora nor the names of its contributors may
+- be used to endorse or promote products derived from this software
+- without specific prior written permission.
+-
+- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+- POSSIBILITY OF SUCH DAMAGE.
+- ***************************************************************************/
+-
+-/***************************************************************************
+- Neon memcpy: Attempts to do a memcpy with Neon registers if possible,
+- Inputs:
+- dest: The destination buffer
+- src: The source buffer
+- n: The size of the buffer to transfer
+- Outputs:
+-
+-***************************************************************************/
+-
+-/*
+- * General note:
+- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
+- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
+- * the correct object code for VPOP, resulting in horrific stack crashes.
+- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
+- * and VPOP->VLDMIA. We can revert this back once we update our toolchain.
+- *
+- * Also, VSHL swaps the source register and the shift-amount register
+- * around in 2006-q3. I've coded this incorrectly so it turns out correct
+- * in the object code, but we'll need to undo that later...
+- */
+-
+- .code 32
+- .align 4
+- .globl neon_memcpy
+- .func
+-
+-neon_memcpy:
+- /*
+- * First, make sure we're not copying < 4 bytes. If so, we'll
+- * just handle it here.
+- */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r0}
+-#else
+- push {r0}
+-#endif
+- cmp r2, #4
+- bgt neon_gt_4
+- /* Copy 0-4 bytes, if needed, and return.*/
+- cmp r2, #0
+-neon_smallcopy_loop:
+- beq neon_smallcopy_done
+- ldrb r12, [r1], #1
+- subs r2, r2, #1
+- strb r12, [r0], #1
+- b neon_smallcopy_loop
+-neon_smallcopy_done:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r0}
+-#else
+- pop {r0}
+-#endif
+- bx lr
+-
+- /* Copy 4 or more bytes*/
+-neon_gt_4:
+- /* Preload what we can...*/
+- pld [r0,#0]
+- pld [r1,#0]
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r4-r5}
+-#else
+- push {r4-r5}
+-#endif
+-
+-neon_check_align:
+- /* Check normal word alignment for target. */
+- ands r12, r0, #0x3
+- beq source_alignment_check
+-
+- /*
+- * Target is not aligned. Step through until we get that
+- * word-aligned. This works better than a loop, according
+- * to our pipeline modeler.
+- */
+- cmp r12, #2
+- ldrb r3, [r1], #1
+- ldrleb r4, [r1], #1
+- ldrltb r5, [r1], #1
+- rsb r12, r12, #4
+- sub r2, r2, r12
+- strb r3, [r0], #1
+- strleb r4, [r0], #1
+- strltb r5, [r0], #1
+-
+-source_alignment_check:
+- ands r12, r1, #0x3
+- bne neon_memcpy_nonaligned /* Source is not word aligned.*/
+-neon_try_16_align:
+- cmp r2, #64
+- blt neon_align_route
+- /* This is where we try 16-byte alignment. */
+- ands r12, r0, #0xf
+- beq neon_align_route
+- rsb r12, r12, #16
+-neon_16_start:
+- sub r2, r2, r12
+- lsrs r3, r12, #2
+-neon_align_16_4:
+- ldr r4, [r1], #4
+- subs r3, r3, #1
+- str r4, [r0], #4
+- bne neon_align_16_4
+-neon_align_route:
+- /* In this case, both source and target are word-aligned. */
+- cmp r2, #32768
+- bge neon_copy_128p_a
+- cmp r2, #256
+- bge neon_copy_128_a
+- cmp r2, #64
+- bge neon_copy_32_a
+- b neon_copy_finish_a
+- nop
+-neon_copy_128p_a:
+- /* We'll copy blocks 128-bytes at a time, but try to call pld to
+- * load in the next page, if possible.
+- */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4-q7}
+-#else
+- vpush {q4-q7}
+-#endif
+- mov r12, r2, lsr #7
+-neon_copy_128p_loop_a:
+- vld1.32 {q0, q1}, [r1]!
+- vld1.32 {q2, q3}, [r1]!
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- pld [r1, #0]
+- pld [r1, #1024]
+- vst1.32 {q0, q1}, [r0]!
+- vst1.32 {q2, q3}, [r0]!
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- subs r12, r12, #1
+- bne neon_copy_128p_loop_a
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4-q7}
+-#else
+- vpop {q4-q7}
+-#endif
+- ands r2, r2, #0x7f
+- beq neon_end
+- cmp r2, #32
+- blt neon_copy_finish_a
+- b neon_copy_32_a
+- /* Copy blocks of 128-bytes (word-aligned) at a time*/
+-neon_copy_128_a:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4-q7}
+-#else
+- vpush {q4-q7}
+-#endif
+- /*
+- * Move to a 1-s based countdown to determine when to loop. That
+- * allows the subs to set the Z flag without having to explicitly
+- * call cmp to a value.
+- */
+- mov r12, r2, lsr #7
+-neon_copy_128_loop_a:
+- vld1.32 {q0, q1}, [r1]!
+- vld1.32 {q2, q3}, [r1]!
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- pld [r1, #0]
+- pld [r1, #128]
+- vst1.32 {q0, q1}, [r0]!
+- vst1.32 {q2, q3}, [r0]!
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- subs r12, r12, #1
+- pld [r0, #0]
+- pld [r0, #128]
+- bne neon_copy_128_loop_a
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4-q7}
+-#else
+- vpop {q4-q7}
+-#endif
+- ands r2, r2, #0x7f
+- beq neon_end
+- cmp r2, #32
+- blt neon_copy_finish_a
+- /* Copy blocks of 32-bytes (word aligned) at a time*/
+-neon_copy_32_a:
+- mov r12, r2, lsr #5
+-neon_copy_32_loop_a:
+- vld1.32 {q0,q1}, [r1]!
+- subs r12, r12, #1
+- pld [r1,#0]
+- vst1.32 {q0,q1}, [r0]!
+- bne neon_copy_32_loop_a
+- ands r2, r2, #0x1f
+- beq neon_end
+-neon_copy_finish_a:
+-neon_copy_16_a:
+- movs r12, r2, lsr #4
+- beq neon_copy_8_a
+-neon_copy_16_a_loop:
+- vld1.32 {q0}, [r1]!
+- subs r12, r12, #1
+- vst1.32 {q0}, [r0]!
+- bne neon_copy_16_a_loop
+- ands r2, r2, #0xf
+- beq neon_end
+-neon_copy_8_a:
+- cmp r2, #8
+- blt neon_copy_4_a
+- ldm r1!, {r4-r5}
+- subs r2, r2, #8
+- stm r0!, {r4-r5}
+- /* Copy 4-bytes of word-aligned data at a time*/
+-neon_copy_4_a:
+- cmp r2, #4
+- blt neon_copy_finish
+- ldr r4, [r1], #4
+- subs r2, r2, #4
+- str r4, [r0], #4
+- b neon_copy_finish
+-
+- /*
+- * Handle unaligned data. The basic concept here is that we'll
+- * try to pull out enough data from the source to get that word-
+- * aligned, then do our writes word-aligned, storing the difference
+- * in a register, and shifting the data as needed.
+- */
+-neon_memcpy_nonaligned:
+- /*
+- * If this is <8 bytes, it makes more sense to just copy it
+- * quickly instead of incurring all kinds of overhead.
+- */
+- cmp r2, #8 /* Let's try this...*/
+- ble neon_copy_finish
+- /*
+- * This is where we'll pull out either 1, 2, or 3 bytes of data
+- * from the source as needed to align it, then store off those
+- * bytes in r4. When we read in the (now) aligned data from the
+- * source, we'll shift the bytes and AND in the r4 data, then write
+- * to the target aligned.
+- *
+- * The conditional ldr calls work slightly faster than the
+- * previous method, confirmed by our pipeline modeler.
+- */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r6-r9}
+-#else
+- push {r6-r9}
+-#endif
+- cmp r12, #2
+- ldrb r4, [r1], #1
+- ldrleb r5, [r1], #1
+- ldrltb r6, [r1], #1
+- rsb r8, r12, #4
+- sub r2, r2, r8
+- lsl r8, r8, #3
+- orrle r4, r4, r5, lsl #8
+- orrlt r4, r4, r6, lsl #16
+- rsb r9, r8, #32
+-
+- cmp r2, #64
+- blt neon_unaligned_route
+- ands r12, r0, #0xf
+- beq neon_unaligned_route
+- rsb r12, r12, #16
+-neon_16_start_u:
+- sub r2, r2, r12
+- lsrs r6, r12, #2
+-neon_align_16_4_u:
+- ldr r5, [r1], #4
+- subs r6, r6, #1
+- orr r4, r4, r5, lsl r8
+- str r4, [r0], #4
+- mov r4, r5, lsr r9
+- bne neon_align_16_4_u
+-neon_unaligned_route:
+- /* Decide which loop block to branch to.*/
+- cmp r2, #256
+- bge neon_copy_64_u
+- cmp r2, #64
+- bge neon_copy_32_u
+- b neon_copy_finish_u
+- /* Copy data in 64-byte blocks.*/
+-neon_copy_64_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+- vstmdb sp!, {q5-q8}
+-#else
+- vpush {q4}
+- vpush {q5-q8}
+-#endif
+- /* We'll need this for the q register shift later.*/
+- vdup.u32 q8, r8
+- /*
+- * As above, we determine how many times we can go through the
+- * 64-byte copy loop, then countdown.
+- */
+- mov r12, r2, lsr #6
+- and r2, r2, #0x3f
+-neon_copy_64_u_loop:
+- /* Load 64-bytes into q4-q7.*/
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- /*
+- * Shift q0-q3 right so everything but the data we need due to the
+- * alignment falls off the right-hand side. The branching
+- * is needed, since vshr requires the shift to be an immediate
+- * value.
+- */
+- lsls r5, r8, #28
+- bcc neon_copy_64_u_b8
+- bpl neon_copy_64_u_b16
+- vshr.u64 q0, q4, #40
+- vshr.u64 q1, q5, #40
+- vshr.u64 q2, q6, #40
+- vshr.u64 q3, q7, #40
+- b neon_copy_64_unify
+-neon_copy_64_u_b8:
+- vshr.u64 q0, q4, #56
+- vshr.u64 q1, q5, #56
+- vshr.u64 q2, q6, #56
+- vshr.u64 q3, q7, #56
+- b neon_copy_64_unify
+-neon_copy_64_u_b16:
+- vshr.u64 q0, q4, #48
+- vshr.u64 q1, q5, #48
+- vshr.u64 q2, q6, #48
+- vshr.u64 q3, q7, #48
+-neon_copy_64_unify:
+- /*
+- * Shift q4-q7 left by r8 bits to take the alignment into
+- * account.
+- */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q4, q8, q4
+- vshl.u64 q5, q8, q5
+- vshl.u64 q6, q8, q6
+- vshl.u64 q7, q8, q7
+-#else
+- vshl.u64 q4, q4, q8
+- vshl.u64 q5, q5, q8
+- vshl.u64 q6, q6, q8
+- vshl.u64 q7, q7, q8
+-#endif
+- /*
+- * The data in s14 will be needed for the next loop iteration. Move
+- * that to r5.
+- */
+- vmov r5, s14
+- /* We'll vorr the shifted data with the data that needs to move back.*/
+- vorr d9, d9, d0
+- /* Copy the data from the previous loop into s14.*/
+- vmov s14, r4
+- vorr d10, d10, d1
+- vorr d11, d11, d2
+- vorr d12, d12, d3
+- vorr d13, d13, d4
+- vorr d14, d14, d5
+- vorr d15, d15, d6
+- vorr d8, d8, d7
+- subs r12, r12, #1
+- pld [r1, #0]
+- pld [r1, #128]
+- /* Save off the r5 data into r4 for the next iteration.*/
+- mov r4, r5
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- pld [r0, #0]
+- pld [r0, #128]
+- bne neon_copy_64_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q5-q8}
+- vldmia sp!, {q4}
+-#else
+- vpop {q5-q8}
+- vpop {q4}
+-#endif
+- cmp r2, #32
+- bge neon_copy_32_u
+- b neon_copy_finish_u
+-neon_copy_32_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+-#else
+- vpush {q4}
+-#endif
+- vdup.u32 q4, r8
+- mov r12, r2, lsr #5
+- and r2, r2, #0x1f
+-neon_copy_32_u_loop:
+- vld1.32 {q0, q1}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_copy_32_u_b8
+- bpl neon_copy_32_u_b16
+- vshr.u64 q2, q0, #40
+- vshr.u64 q3, q1, #40
+- b neon_copy_32_unify
+-neon_copy_32_u_b8:
+- vshr.u64 q2, q0, #56
+- vshr.u64 q3, q1, #56
+- b neon_copy_32_unify
+-neon_copy_32_u_b16:
+- vshr.u64 q2, q0, #48
+- vshr.u64 q3, q1, #48
+-neon_copy_32_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q0, q4, q0
+- vshl.u64 q1, q4, q1
+-#else
+- vshl.u64 q0, q0, q4
+- vshl.u64 q1, q1, q4
+-#endif
+- vmov r5, s14
+- vorr d1, d1, d4
+- vmov s14, r4
+- vorr d2, d2, d5
+- vorr d3, d3, d6
+- vorr d0, d0, d7
+- subs r12, r12, #1
+- pld [r1, #0]
+- mov r4, r5
+- vst1.32 {q0, q1}, [r0]!
+- bne neon_copy_32_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4}
+-#else
+- vpop {q4}
+-#endif
+-neon_copy_finish_u:
+-neon_copy_16_u:
+- movs r12, r2, lsr #4
+- beq neon_copy_8_u
+- vdup.u32 q2, r8
+- and r2, r2, #0xf
+-neon_copy_16_u_loop:
+- vld1.32 {q0}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_copy_16_u_b8
+- bpl neon_copy_16_u_b16
+- vshr.u64 q1, q0, #40
+- b neon_copy_16_unify
+-neon_copy_16_u_b8:
+- vshr.u64 q1, q0, #56
+- b neon_copy_16_unify
+-neon_copy_16_u_b16:
+- vshr.u64 q1, q0, #48
+-neon_copy_16_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q0, q2, q0
+-#else
+- vshl.u64 q0, q0, q2
+-#endif
+- vmov r5, s6
+- vorr d1, d1, d2
+- vmov s6, r4
+- vorr d0, d0, d3
+- subs r12, r12, #1
+- mov r4, r5
+- vst1.32 {q0}, [r0]!
+- bne neon_copy_16_u_loop
+-neon_copy_8_u:
+- cmp r2, #8
+- blt neon_copy_4_u
+- ldm r1!, {r6-r7}
+- subs r2, r2, #8
+- orr r4, r4, r6, lsl r8
+- mov r5, r6, lsr r9
+- orr r5, r5, r7, lsl r8
+- stm r0!, {r4-r5}
+- mov r4, r7, lsr r9
+-neon_copy_4_u:
+- cmp r2, #4
+- blt neon_copy_last_bits_u
+- ldr r5, [r1], #4
+- subs r2, r2, #4
+- orr r4, r4, r5, lsl r8
+- str r4, [r0], #4
+- mov r4, r5, lsr r9
+-neon_copy_last_bits_u:
+- /*
+- * Remember, r8 contains the size of the data in r4 in bits,
+- * so to get to bytes we'll need to shift 3 places
+- */
+- lsr r8, r8, #0x3
+- /* Write out the bytes stored in r4.*/
+-neon_copy_last_bits_u_loop:
+- strb r4, [r0], #1
+- subs r8, r8, #1
+- lsrne r4, r4, #8
+- bne neon_copy_last_bits_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r6-r9}
+-#else
+- pop {r6-r9}
+-#endif
+-neon_copy_finish:
+- cmp r2, #0
+- beq neon_end
+- /*
+- * This just copies the data from source to target one byte
+- * at a time. For some small values, this makes more sense.
+- * Note that since this code copies data a byte at a time,
+- * both the aligned and unaligned paths can use it.
+- */
+-neon_copy_finish_loop:
+- ldrb r4, [r1], #1
+- subs r2, r2, #1
+- strb r4, [r0], #1
+- bne neon_copy_finish_loop
+-neon_end:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r4-r5}
+- ldmia sp!, {r0}
+-#else
+- pop {r4-r5}
+- pop {r0}
+-#endif
+- bx lr
+-
+- .endfunc
+- .end
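Before its bulk loop, the deleted neon_memcpy above nudges the destination onto a word boundary (and, for copies of 64 bytes or more, onto a 16-byte boundary) with a few single-byte stores. A rough C equivalent of that alignment prologue, purely as an illustration; word_aligned_copy is a made-up name, and the shift-and-merge path for misaligned sources is left out:

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only, not part of the deleted assembly: copy up
 * to 3 bytes so the destination becomes 4-byte aligned, move whole
 * words while the source is co-aligned, then finish byte by byte.
 * neon_memcpy instead merges shifted words when the source stays
 * misaligned, and uses NEON q registers for the large blocks. */
static void *word_aligned_copy(void *dest, const void *source, size_t n)
{
    unsigned char *d = dest;
    const unsigned char *s = source;

    while (n > 0 && ((uintptr_t) d & 0x3) != 0) {   /* lead-in bytes */
        *d++ = *s++;
        n--;
    }
    if (((uintptr_t) s & 0x3) == 0) {               /* bulk, one word at a time */
        while (n >= 4) {
            *(uint32_t *) d = *(const uint32_t *) s;
            d += 4;
            s += 4;
            n -= 4;
        }
    }
    while (n--)                                     /* tail bytes */
        *d++ = *s++;

    return dest;
}

It is called like memcpy, e.g. word_aligned_copy(dst, src, len).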
+diff --git git/src/neon_memmove.S git/src/neon_memmove.S
+deleted file mode 100644
+index 1bfe597..0000000
+--- git/src/neon_memmove.S
++++ /dev/null
+@@ -1,939 +0,0 @@
+-/***************************************************************************
+- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+-
+- Redistribution and use in source and binary forms, with or without
+- modification, are permitted provided that the following conditions are met:
+- * Redistributions of source code must retain the above copyright
+- notice, this list of conditions and the following disclaimer.
+- * Redistributions in binary form must reproduce the above copyright
+- notice, this list of conditions and the following disclaimer in the
+- documentation and/or other materials provided with the distribution.
+- * Neither the name of Code Aurora nor the names of its contributors may
+- be used to endorse or promote products derived from this software
+- without specific prior written permission.
+-
+- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+- POSSIBILITY OF SUCH DAMAGE.
+- ***************************************************************************/
+-
+-/***************************************************************************
+- * Neon memmove: Attempts to do a memmove with Neon registers if possible,
+- * Inputs:
+- * dest: The destination buffer
+- * src: The source buffer
+- * n: The size of the buffer to transfer
+- * Outputs:
+- *
+- ***************************************************************************/
+-
+-/*
+- * General note:
+- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
+- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
+- * the correct object code for VPOP, resulting in horrific stack crashes.
+- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
+- * and VPOP->VLDMIA. We can revert this back once we update our toolchain.
+- *
+- * Also, VSHL swaps the source register and the shift-amount register
+- * around in 2006-q3. I've coded this incorrectly so it turns out correct
+- * in the object code, but we'll need to undo that later...
+- */
+- .code 32
+- .align 4
+- .globl neon_memmove
+- .func
+-
+-neon_memmove:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r0}
+-#else
+- push {r0}
+-#endif
+-
+- /*
+- * The requirements for memmove state that the function should
+- * operate as if data were being copied from the source to a
+- * buffer, then to the destination. This is to allow a user
+- * to copy data from a source and target that overlap.
+- *
+- * We can't just do byte copies front-to-back automatically, since
+- * there's a good chance we may have an overlap (why else would someone
+- * intentionally use memmove then?).
+- *
+- * We'll break this into two parts. Front-to-back, or back-to-front
+- * copies.
+- */
+-neon_memmove_cmf:
+- cmp r0, r1
+- blt neon_front_to_back_copy
+- bgt neon_back_to_front_copy
+- b neon_memmove_done
+-
+- /* #############################################################
+- * Front to Back copy
+- */
+-neon_front_to_back_copy:
+- /*
+- * For small copies, just do a quick memcpy. We can do this for
+- * front-to-back copies, aligned or unaligned, since we're only
+- * doing 1 byte at a time...
+- */
+- cmp r2, #4
+- bgt neon_f2b_gt4
+- cmp r2, #0
+-neon_f2b_smallcopy_loop:
+- beq neon_memmove_done
+- ldrb r12, [r1], #1
+- subs r2, r2, #1
+- strb r12, [r0], #1
+- b neon_f2b_smallcopy_loop
+-neon_f2b_gt4:
+- /* Preload what we can...*/
+- pld [r0,#0]
+- pld [r1,#0]
+- /* The window size is in r3. */
+- sub r3, r1, r0
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r4-r6}
+-#else
+- push {r4-r6}
+-#endif
+-
+-neon_f2b_check_align:
+- /* Check alignment. */
+- ands r12, r0, #0x3
+- beq neon_f2b_source_align_check
+- cmp r12, #2
+- ldrb r4, [r1], #1
+- ldrleb r5, [r1], #1
+- ldrltb r6, [r1], #1
+- rsb r12, r12, #4
+- sub r2, r2, r12
+- strb r4, [r0], #1
+- strleb r5, [r0], #1
+- strltb r6, [r0], #1
+-
+-neon_f2b_source_align_check:
+- ands r12, r1, #0x3
+- bne neon_f2b_nonaligned
+-
+-neon_f2b_try_16_align:
+- /* If we're >64, attempt to align on 16-bytes. Smaller amounts
+- * don't seem to be worth handling. */
+- cmp r2, #64
+- blt neon_f2b_align_route
+- /* This is where we try 16-byte alignment. */
+- ands r12, r0, #0xf
+- beq neon_f2b_align_route
+- rsb r12, r12, #16
+-neon_f2b_16_start:
+- sub r2, r2, r12
+- lsrs r5, r12, #2
+-neon_f2b_align_16_4:
+- ldr r4, [r1], #4
+- subs r5, r5, #1
+- str r4, [r0], #4
+- bne neon_f2b_align_16_4
+-neon_f2b_align_route:
+- /* #############################################################
+- * Front to Back copy - aligned
+- */
+- /*
+- * Note that we can't just route based on the size in r2. If that's
+- * larger than the overlap window in r3, we could potentially
+- * (and likely!) destroy data we're copying.
+- */
+- cmp r2, r3
+- movle r12, r2
+- movgt r12, r3
+- cmp r12, #256
+- bge neon_f2b_copy_128_a
+- cmp r12, #64
+- bge neon_f2b_copy_32_a
+- cmp r12, #16
+- bge neon_f2b_copy_16_a
+- cmp r12, #8
+- bge neon_f2b_copy_8_a
+- cmp r12, #4
+- bge neon_f2b_copy_4_a
+- b neon_f2b_copy_1_a
+-neon_f2b_copy_128_a:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4-q7}
+-#else
+- vpush {q4-q7}
+-#endif
+- mov r12, r2, lsr #7
+-neon_f2b_copy_128_a_loop:
+- vld1.32 {q0,q1}, [r1]!
+- vld1.32 {q2,q3}, [r1]!
+- vld1.32 {q4,q5}, [r1]!
+- vld1.32 {q6,q7}, [r1]!
+- pld [r1, #0]
+- pld [r1, #128]
+- vst1.32 {q0,q1}, [r0]!
+- vst1.32 {q2,q3}, [r0]!
+- vst1.32 {q4,q5}, [r0]!
+- vst1.32 {q6,q7}, [r0]!
+- subs r12, r12, #1
+- pld [r0, #0]
+- pld [r0, #128]
+- bne neon_f2b_copy_128_a_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4-q7}
+-#else
+- vpop {q4-q7}
+-#endif
+- ands r2, r2, #0x7f
+- beq neon_f2b_finish
+- cmp r2, #32
+- bge neon_f2b_copy_32_a
+- b neon_f2b_copy_finish_a
+-neon_f2b_copy_32_a:
+- mov r12, r2, lsr #5
+-neon_f2b_copy_32_a_loop:
+- vld1.32 {q0,q1}, [r1]!
+- subs r12, r12, #1
+- pld [r1, #0]
+- vst1.32 {q0,q1}, [r0]!
+- bne neon_f2b_copy_32_a_loop
+- ands r2, r2, #0x1f
+- beq neon_f2b_finish
+-neon_f2b_copy_finish_a:
+-neon_f2b_copy_16_a:
+- movs r12, r2, lsr #4
+- beq neon_f2b_copy_8_a
+-neon_f2b_copy_16_a_loop:
+- vld1.32 {q0}, [r1]!
+- subs r12, r12, #1
+- vst1.32 {q0}, [r0]!
+- bne neon_f2b_copy_16_a_loop
+- ands r2, r2, #0xf
+- beq neon_f2b_finish
+-neon_f2b_copy_8_a:
+- cmp r2, #8
+- blt neon_f2b_copy_4_a
+- ldm r1!, {r4-r5}
+- subs r2, r2, #8
+- stm r0!, {r4-r5}
+-neon_f2b_copy_4_a:
+- cmp r2, #4
+- blt neon_f2b_copy_1_a
+- ldr r4, [r1], #4
+- subs r2, r2, #4
+- str r4, [r0], #4
+-neon_f2b_copy_1_a:
+- cmp r2, #0
+- beq neon_f2b_finish
+-neon_f2b_copy_1_a_loop:
+- ldrb r12, [r1], #1
+- subs r2, r2, #1
+- strb r12, [r0], #1
+- bne neon_f2b_copy_1_a_loop
+- b neon_f2b_finish
+-
+- /* #############################################################
+- * Front to Back copy - unaligned
+- */
+-neon_f2b_nonaligned:
+- /*
+- * For sizes < 8, does it really make sense to do the whole shift
+- * party? Note that we DON'T want to call neon_f2b_copy_1_u,
+- * since we'll end up trying to pop r8-r11, and we DON'T want
+- * to do that...
+- */
+- cmp r2, #8
+- ble neon_f2b_copy_1_a
+-
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r7-r9}
+-#else
+- push {r7-r9}
+-#endif
+- cmp r12, #2
+- ldrb r4, [r1], #1
+- ldrleb r5, [r1], #1
+- ldrltb r6, [r1], #1
+- rsb r8, r12, #4
+- sub r2, r2, r8
+- lsl r8, r8, #3
+- orrle r4, r4, r5, lsl #8
+- orrlt r4, r4, r6, lsl #16
+- rsb r9, r8, #32
+- /*
+- * r4 = overflow bits
+- * r8 = # of bits we copied into the r4 register to align source.
+- * r9 = 32 - r8
+- * r12 = Index counter for each size, so we determine how many times
+- * the given size will go into r2, then count down that # of
+- * times in r12.
+- */
+- cmp r2, #64
+- blt neon_f2b_unaligned_route
+- ands r12, r0, #0xf
+- beq neon_f2b_unaligned_route
+- cmp r3, #4
+- blt neon_f2b_unaligned_route
+- rsb r12, r12, #16
+-neon_f2b_16_start_u:
+- sub r2, r2, r12
+- lsrs r6, r12, #2
+-neon_f2b_align_16_4_u:
+- ldr r5, [r1], #4
+- subs r6, r6, #1
+- orr r4, r4, r5, lsl r8
+- str r4, [r0], #4
+- mov r4, r5, lsr r9
+- bne neon_f2b_align_16_4_u
+-neon_f2b_unaligned_route:
+- cmp r2, r3
+- movle r12, r2
+- movgt r12, r3
+- cmp r12, #256
+- bge neon_f2b_copy_64_u
+- cmp r12, #64
+- bge neon_f2b_copy_32_u
+- cmp r12, #16
+- bge neon_f2b_copy_16_u
+- cmp r12, #8
+- bge neon_f2b_copy_8_u
+- cmp r12, #4
+- bge neon_f2b_copy_4_u
+- b neon_f2b_last_bits_u
+-neon_f2b_copy_64_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+- vstmdb sp!, {q5-q8}
+-#else
+- vpush {q4}
+- vpush {q5-q8}
+-#endif
+- vdup.u32 q8, r8
+- mov r12, r2, lsr #6
+- and r2, r2, #0x3f
+-neon_f2b_copy_64_u_loop:
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_f2b_copy_64_u_b8
+- bpl neon_f2b_copy_64_u_b16
+- vshr.u64 q0, q4, #40
+- vshr.u64 q1, q5, #40
+- vshr.u64 q2, q6, #40
+- vshr.u64 q3, q7, #40
+- b neon_f2b_copy_64_unify
+-neon_f2b_copy_64_u_b8:
+- vshr.u64 q0, q4, #56
+- vshr.u64 q1, q5, #56
+- vshr.u64 q2, q6, #56
+- vshr.u64 q3, q7, #56
+- b neon_f2b_copy_64_unify
+-neon_f2b_copy_64_u_b16:
+- vshr.u64 q0, q4, #48
+- vshr.u64 q1, q5, #48
+- vshr.u64 q2, q6, #48
+- vshr.u64 q3, q7, #48
+-neon_f2b_copy_64_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q4, q8, q4
+- vshl.u64 q5, q8, q5
+- vshl.u64 q6, q8, q6
+- vshl.u64 q7, q8, q7
+-#else
+- vshl.u64 q4, q4, q8
+- vshl.u64 q5, q5, q8
+- vshl.u64 q6, q6, q8
+- vshl.u64 q7, q7, q8
+-#endif
+- vmov r5, s14
+- vorr d9, d9, d0
+- vmov s14, r4
+- vorr d10, d10, d1
+- vorr d11, d11, d2
+- vorr d12, d12, d3
+- vorr d13, d13, d4
+- vorr d14, d14, d5
+- vorr d15, d15, d6
+- vorr d8, d8, d7
+- subs r12, r12, #1
+- pld [r1, #0]
+- pld [r1, #128]
+- mov r4, r5
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- pld [r0, #0]
+- pld [r0, #128]
+- bne neon_f2b_copy_64_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q5-q8}
+- vldmia sp!, {q4}
+-#else
+- vpop {q5-q8}
+- vpop {q4}
+-#endif
+- cmp r2, #32
+- bge neon_f2b_copy_32_u
+- b neon_f2b_copy_finish_u
+-neon_f2b_copy_32_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+-#else
+- vpush {q4}
+-#endif
+- vdup.u32 q4, r8
+- mov r12, r2, lsr #5
+- and r2, r2, #0x1f
+-neon_f2b_copy_32_u_loop:
+- vld1.32 {q0, q1}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_f2b_copy_32_u_b8
+- bpl neon_f2b_copy_32_u_b16
+- vshr.u64 q2, q0, #40
+- vshr.u64 q3, q1, #40
+- b neon_f2b_copy_32_unify
+-neon_f2b_copy_32_u_b8:
+- vshr.u64 q2, q0, #56
+- vshr.u64 q3, q1, #56
+- b neon_f2b_copy_32_unify
+-neon_f2b_copy_32_u_b16:
+- vshr.u64 q2, q0, #48
+- vshr.u64 q3, q1, #48
+-neon_f2b_copy_32_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q0, q4, q0
+- vshl.u64 q1, q4, q1
+-#else
+- vshl.u64 q0, q0, q4
+- vshl.u64 q1, q1, q4
+-#endif
+- vmov r5, s14
+- vorr d1, d1, d4
+- vmov s14, r4
+- vorr d2, d2, d5
+- vorr d3, d3, d6
+- vorr d0, d0, d7
+- subs r12, r12, #1
+- pld [r1, #0]
+- mov r4, r5
+- vst1.32 {q0, q1}, [r0]!
+- bne neon_f2b_copy_32_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4}
+-#else
+- vpop {q4}
+-#endif
+-neon_f2b_copy_finish_u:
+-neon_f2b_copy_16_u:
+- movs r12, r2, lsr #4
+- beq neon_f2b_copy_8_u
+- vdup.u32 q2, r8
+- and r2, r2, #0xf
+-neon_f2b_copy_16_u_loop:
+- vld1.32 {q0}, [r1]!
+- lsls r5, r8, #28
+- bcc neon_f2b_copy_16_u_b8
+- bpl neon_f2b_copy_16_u_b16
+- vshr.u64 q1, q0, #40
+- b neon_f2b_copy_16_unify
+-neon_f2b_copy_16_u_b8:
+- vshr.u64 q1, q0, #56
+- b neon_f2b_copy_16_unify
+-neon_f2b_copy_16_u_b16:
+- vshr.u64 q1, q0, #48
+-neon_f2b_copy_16_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q0, q2, q0
+-#else
+- vshl.u64 q0, q0, q2
+-#endif
+- vmov r5, s6
+- vorr d1, d1, d2
+- vmov s6, r4
+- vorr d0, d0, d3
+- subs r12, r12, #1
+- mov r4, r5
+- vst1.32 {q0}, [r0]!
+- bne neon_f2b_copy_16_u_loop
+-neon_f2b_copy_8_u:
+- cmp r2, #8
+- blt neon_f2b_copy_4_u
+- ldm r1!, {r6-r7}
+- subs r2, r2, #8
+- orr r4, r4, r6, lsl r8
+- mov r5, r6, lsr r9
+- orr r5, r5, r7, lsl r8
+- stm r0!, {r4-r5}
+- mov r4, r7, lsr r9
+-neon_f2b_copy_4_u:
+- cmp r2, #4
+- blt neon_f2b_last_bits_u
+- ldr r5, [r1], #4
+- subs r2, r2, #4
+- orr r4, r4, r5, lsl r8
+- str r4, [r0], #4
+- mov r4, r5, lsr r9
+-neon_f2b_last_bits_u:
+- lsr r8, r8, #0x3
+-neon_f2b_last_bits_u_loop:
+- strb r4, [r0], #1
+- subs r8, r8, #1
+- lsr r4, r4, #8
+- bne neon_f2b_last_bits_u_loop
+-neon_f2b_copy_1_u:
+- cmp r2, #0
+- beq neon_f2b_finish_u
+-neon_f2b_copy_1_u_loop:
+- ldrb r12, [r1], #1
+- subs r2, r2, #1
+- strb r12, [r0], #1
+- bne neon_f2b_copy_1_u_loop
+-neon_f2b_finish_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r7-r9}
+-#else
+- pop {r7-r9}
+-#endif
+- /* #############################################################
+- * Front to Back copy - finish
+- */
+-neon_f2b_finish:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r4-r6}
+-#else
+- pop {r4-r6}
+-#endif
+- b neon_memmove_done
+-
+- /* #############################################################
+- * Back to Front copy
+- */
+-neon_back_to_front_copy:
+- /*
+- * Here, we'll want to shift to the end of the buffers. This
+- * actually points us one past where we need to go, but since
+- * we'll pre-decrement throughout, this will be fine.
+- */
+- add r0, r0, r2
+- add r1, r1, r2
+- cmp r2, #4
+- bgt neon_b2f_gt4
+- cmp r2, #0
+-neon_b2f_smallcopy_loop:
+- beq neon_memmove_done
+- ldrb r12, [r1, #-1]!
+- subs r2, r2, #1
+- strb r12, [r0, #-1]!
+- b neon_b2f_smallcopy_loop
+-neon_b2f_gt4:
+- pld [r0, #0]
+- pld [r1, #0]
+- /*
+- * The minimum of the overlap window size and the copy size
+- * is in r3.
+- */
+- sub r3, r0, r1
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r4-r5}
+-#else
+- push {r4-r5}
+-#endif
+-
+- /*
+- * Check alignment. Since we'll pre-decrement as we step thru, we'll
+- * need to make sure we're on word-alignment.
+- */
+-neon_b2f_check_align:
+- ands r12, r0, #0x3
+- beq neon_b2f_source_align_check
+- sub r2, r2, r12
+-neon_b2f_shift_align:
+- ldrb r4, [r1, #-1]!
+- subs r12, r12, #1
+- strb r4, [r0, #-1]!
+- bne neon_b2f_shift_align
+-neon_b2f_source_align_check:
+- ands r4, r1, #0x3
+- bne neon_b2f_nonaligned
+-
+-neon_b2f_try_16_align:
+- /* If we're >64, attempt to align on 16-bytes. Smaller amounts
+- * don't seem to be worth handling. */
+- cmp r2, #64
+- blt neon_b2f_align_route
+- ands r12, r0, #0xf
+- beq neon_b2f_align_route
+- /* In this case, r12 has the number of bytes to roll backward. */
+-neon_b2f_16_start:
+- sub r2, r2, r12
+- lsrs r5, r12, #2
+-neon_b2f_align_16_4:
+- ldr r4, [r1, #-4]!
+- subs r5, r5, #1
+- str r4, [r0, #-4]!
+- bne neon_b2f_align_16_4
+-neon_b2f_align_route:
+- /*
+- * #############################################################
+- * Back to Front copy - aligned
+- */
+- cmp r2, r3
+- movle r12, r2
+- movgt r12, r3
+- cmp r12, #256
+- bge neon_b2f_copy_128_a
+- cmp r12, #64
+- bge neon_b2f_copy_32_a
+- cmp r12, #8
+- bge neon_b2f_copy_8_a
+- cmp r12, #4
+- bge neon_b2f_copy_4_a
+- b neon_b2f_copy_1_a
+-neon_b2f_copy_128_a:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4-q7}
+-#else
+- vpush {q4-q7}
+-#endif
+- movs r12, r2, lsr #7
+- /*
+- * This irks me. There MUST be a better way to read these in and
+- * scan the register backward instead of making it go forward. Then
+- * we need to do two subtractions...
+- */
+-neon_b2f_copy_128_a_loop:
+- sub r1, r1, #128
+- sub r0, r0, #128
+- vld1.32 {q0, q1}, [r1]!
+- vld1.32 {q2, q3}, [r1]!
+- vld1.32 {q4, q5}, [r1]!
+- vld1.32 {q6, q7}, [r1]!
+- pld [r1, #-128]
+- pld [r1, #-256]
+- vst1.32 {q0, q1}, [r0]!
+- vst1.32 {q2, q3}, [r0]!
+- vst1.32 {q4, q5}, [r0]!
+- vst1.32 {q6, q7}, [r0]!
+- subs r12, r12, #1
+- pld [r0, #-128]
+- pld [r0, #-256]
+- sub r1, r1, #128
+- sub r0, r0, #128
+- bne neon_b2f_copy_128_a_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4-q7}
+-#else
+- vpop {q4-q7}
+-#endif
+- ands r2, r2, #0x7f
+- beq neon_b2f_finish
+- cmp r2, #32
+- bge neon_b2f_copy_32_a
+- b neon_b2f_copy_finish_a
+-neon_b2f_copy_32_a:
+- mov r12, r2, lsr #5
+-neon_b2f_copy_32_a_loop:
+- sub r1, r1, #32
+- sub r0, r0, #32
+- vld1.32 {q0,q1}, [r1]
+- subs r12, r12, #1
+- vst1.32 {q0,q1}, [r0]
+- pld [r1, #0]
+- bne neon_b2f_copy_32_a_loop
+- ands r2, r2, #0x1f
+- beq neon_b2f_finish
+-neon_b2f_copy_finish_a:
+-neon_b2f_copy_8_a:
+- movs r12, r2, lsr #0x3
+- beq neon_b2f_copy_4_a
+-neon_b2f_copy_8_a_loop:
+- ldmdb r1!, {r4-r5}
+- subs r12, r12, #1
+- stmdb r0!, {r4-r5}
+- bne neon_b2f_copy_8_a_loop
+- and r2, r2, #0x7
+-neon_b2f_copy_4_a:
+- movs r12, r2, lsr #0x2
+- beq neon_b2f_copy_1_a
+- and r2, r2, #0x3
+-neon_b2f_copy_4_a_loop:
+- ldr r4, [r1, #-4]!
+- subs r12, r12, #1
+- str r4, [r0, #-4]!
+- bne neon_b2f_copy_4_a_loop
+-neon_b2f_copy_1_a:
+- cmp r2, #0
+- beq neon_b2f_finish
+-neon_b2f_copy_1_a_loop:
+- ldrb r12, [r1, #-1]!
+- subs r2, r2, #1
+- strb r12, [r0, #-1]!
+- bne neon_b2f_copy_1_a_loop
+-
+- /* #############################################################
+- * Back to Front copy - unaligned
+- */
+-neon_b2f_nonaligned:
+- /*
+- * For sizes < 8, does it really make sense to do the whole shift
+- * party?
+- */
+- cmp r2, #8
+- ble neon_b2f_copy_1_a
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- stmdb sp!, {r6-r11}
+-#else
+- push {r6-r11}
+-#endif
+- /*
+- * r3 = max window size
+- * r4 = overflow bytes
+- * r5 = bytes we're reading into
+- * r6 = # bytes we're off.
+- * r10 = copy of r6
+- */
+- and r6, r1, #0x3
+- eor r4, r4, r4
+- mov r10, r6
+-neon_b2f_realign:
+- ldrb r5, [r1, #-1]!
+- subs r6, r6, #1
+- orr r4, r5, r4, lsl #8
+- bne neon_b2f_realign
+- /*
+- * r10 = # of bits we copied into the r4 register to align source.
+- * r11 = 32 - r10
+- * r12 = Index counter for each size, so we determine how many times
+- * the given size will go into r2, then count down that # of
+- * times in r12.
+- */
+- sub r2, r2, r10
+- lsl r10, r10, #0x3
+- rsb r11, r10, #32
+-
+- cmp r2, r3
+- movle r12, r2
+- movgt r12, r3
+- cmp r12, #256
+- bge neon_b2f_copy_64_u
+- cmp r12, #64
+- bge neon_b2f_copy_32_u
+- cmp r12, #8
+- bge neon_b2f_copy_8_u
+- cmp r12, #4
+- bge neon_b2f_copy_4_u
+- b neon_b2f_last_bits_u
+-neon_b2f_copy_64_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4,q5}
+- vstmdb sp!, {q6-q8}
+-#else
+- vpush {q4,q5}
+- vpush {q6-q8}
+-#endif
+- add r7, r11, #32
+- movs r12, r2, lsr #6
+- vdup.u32 q8, r7
+-neon_b2f_copy_64_u_loop:
+- sub r1, r1, #64
+- sub r0, r0, #64
+- vld1.32 {q0, q1}, [r1]!
+- vld1.32 {q2, q3}, [r1]
+- sub r1, r1, #32
+- vmov q4, q0
+- vmov q5, q1
+- vmov q6, q2
+- vmov q7, q3
+- vmov r5, s0
+- mov r4, r4, lsl r11
+- lsls r6, r10, #28
+- bcc neon_b2f_copy_64_u_b8
+- bpl neon_b2f_copy_64_u_b16
+- vshr.u64 q0, q0, #24
+- vshr.u64 q1, q1, #24
+- vshr.u64 q2, q2, #24
+- vshr.u64 q3, q3, #24
+- b neon_b2f_copy_64_unify
+-neon_b2f_copy_64_u_b8:
+- vshr.u64 q0, q0, #8
+- vshr.u64 q1, q1, #8
+- vshr.u64 q2, q2, #8
+- vshr.u64 q3, q3, #8
+- b neon_b2f_copy_64_unify
+-neon_b2f_copy_64_u_b16:
+- vshr.u64 q0, q0, #16
+- vshr.u64 q1, q1, #16
+- vshr.u64 q2, q2, #16
+- vshr.u64 q3, q3, #16
+-neon_b2f_copy_64_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q4, q8, q4
+- vshl.u64 q5, q8, q5
+- vshl.u64 q6, q8, q6
+- vshl.u64 q7, q8, q7
+-#else
+- vshl.u64 q4, q4, q8
+- vshl.u64 q5, q5, q8
+- vshl.u64 q6, q6, q8
+- vshl.u64 q7, q7, q8
+-#endif
+- vmov s17, r4
+- vorr d7, d7, d8
+- vorr d6, d6, d15
+- vorr d5, d5, d14
+- vorr d4, d4, d13
+- vorr d3, d3, d12
+- vorr d2, d2, d11
+- vorr d1, d1, d10
+- vorr d0, d0, d9
+- mov r4, r5, lsl r11
+- subs r12, r12, #1
+- lsr r4, r4, r11
+- vst1.32 {q0, q1}, [r0]!
+- vst1.32 {q2, q3}, [r0]
+- pld [r1, #0]
+- sub r0, r0, #32
+- bne neon_b2f_copy_64_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q6-q8}
+- vldmia sp!, {q4,q5}
+-#else
+- vpop {q6-q8}
+- vpop {q4,q5}
+-#endif
+- ands r2, r2, #0x3f
+- cmp r2, #32
+- bge neon_b2f_copy_32_u
+- b neon_b2f_copy_finish_u
+-neon_b2f_copy_32_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vstmdb sp!, {q4}
+-#else
+- vpush {q4}
+-#endif
+- add r7, r11, #32
+- movs r12, r2, lsr #5
+- vdup.u32 q4, r7
+- and r2, r2, #0x1f
+-neon_b2f_copy_32_u_loop:
+- sub r1, r1, #32
+- sub r0, r0, #32
+- vld1.32 {q0, q1}, [r1]
+- vmov q2, q0
+- vmov q3, q1
+- vmov r5, s0
+- mov r4, r4, lsl r11
+- lsls r6, r10, #28
+- bcc neon_b2f_copy_32_u_b8
+- bpl neon_b2f_copy_32_u_b16
+- vshr.u64 q0, q0, #24
+- vshr.u64 q1, q1, #24
+- b neon_b2f_copy_32_unify
+-neon_b2f_copy_32_u_b8:
+- vshr.u64 q0, q0, #8
+- vshr.u64 q1, q1, #8
+- b neon_b2f_copy_32_unify
+-neon_b2f_copy_32_u_b16:
+- vshr.u64 q0, q0, #16
+- vshr.u64 q1, q1, #16
+-neon_b2f_copy_32_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vshl.u64 q2, q4, q2
+- vshl.u64 q3, q4, q3
+-#else
+- vshl.u64 q2, q2, q4
+- vshl.u64 q3, q3, q4
+-#endif
+- vmov s9, r4
+- vorr d3, d3, d4
+- vorr d2, d2, d7
+- vorr d1, d1, d6
+- vorr d0, d0, d5
+- mov r4, r5, lsl r11
+- subs r12, r12, #1
+- lsr r4, r4, r11
+- vst1.32 {q0, q1}, [r0]
+- pld [r1, #0]
+- bne neon_b2f_copy_32_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- vldmia sp!, {q4}
+-#else
+- vpop {q4}
+-#endif
+-neon_b2f_copy_finish_u:
+-neon_b2f_copy_8_u:
+- movs r12, r2, lsr #0x3
+- beq neon_b2f_copy_4_u
+- mov r5, r4, lsl r11
+-neon_b2f_copy_8_u_loop:
+- ldmdb r1!, {r6-r7}
+- subs r12, r12, #1
+- orr r5, r5, r7, lsr r10
+- mov r4, r7, lsl r11
+- orr r4, r4, r6, lsr r10
+- stmdb r0!, {r4-r5}
+- mov r4, r6, lsl r11
+- lsr r4, r4, r11
+- mov r5, r4, lsl r11
+- bne neon_b2f_copy_8_u_loop
+- ands r2, r2, #0x7
+-neon_b2f_copy_4_u:
+- movs r12, r2, lsr #0x2
+- beq neon_b2f_last_bits_u
+- mov r5, r4, lsl r11
+-neon_b2f_copy_4_u_loop:
+- ldr r6, [r1, #-4]!
+- subs r12, r12, #1
+- orr r5, r5, r6, lsr r10
+- str r5, [r0, #-4]!
+- mov r4, r6, lsl r11
+- lsr r4, r4, r11
+- mov r5, r4, lsl r11
+- bne neon_b2f_copy_4_u_loop
+- and r2, r2, #0x3
+-neon_b2f_last_bits_u:
+-neon_b2f_last_bits_u_loop:
+- subs r10, r10, #8
+- mov r5, r4, lsr r10
+- strb r5, [r0, #-1]!
+- bne neon_b2f_last_bits_u_loop
+-neon_b2f_copy_1_u:
+- cmp r2, #0
+- beq neon_b2f_finish_u
+-neon_b2f_copy_1_u_loop:
+- ldrb r12, [r1, #-1]!
+- subs r2, r2, #1
+- strb r12, [r0, #-1]!
+- bne neon_b2f_copy_1_u_loop
+-neon_b2f_finish_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r6-r11}
+-#else
+- pop {r6-r11}
+-#endif
+-
+-neon_b2f_finish:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r4-r5}
+-#else
+- pop {r4-r5}
+-#endif
+-
+-neon_memmove_done:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+- ldmia sp!, {r0}
+-#else
+- pop {r0}
+-#endif
+- bx lr
+-
+- .endfunc
+- .end
+diff --git git/src/neon_memsets.c git/src/neon_memsets.c
+deleted file mode 100755
+index 740fc1e..0000000
+--- git/src/neon_memsets.c
++++ /dev/null
+@@ -1,169 +0,0 @@
+-/* neon_memsets.c
+- *
+- * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+- *
+- * Redistribution and use in source and binary forms, with or without
+- * modification, are permitted provided that the following conditions are met:
+- * * Redistributions of source code must retain the above copyright
+- * notice, this list of conditions and the following disclaimer.
+- * * Redistributions in binary form must reproduce the above copyright
+- * notice, this list of conditions and the following disclaimer in the
+- * documentation and/or other materials provided with the distribution.
+- * * Neither the name of Code Aurora nor
+- * the names of its contributors may be used to endorse or promote
+- * products derived from this software without specific prior written
+- * permission.
+- *
+- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+- * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+- * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+- */
+-
+-#include "msm-swblits.h"
+-
+-void memset16(uint16_t dst[], uint16_t value, int count)
+-{
+- if (count <= 0)
+- return;
+-
+- asm volatile(
+- " pld [%[dst], #0] \n"
+- " cmp %[count], #4 \n"
+- " blt 6f \n"
+- " tst %[dst], #0x3 \n"
+- " strneh %[value], [%[dst]], #2 \n"
+- " subne %[count], %[count], #1 \n"
+- " vdup.u16 q8, %[value] \n"
+- " vmov q9, q8 \n"
+- " cmp %[count], #64 \n"
+- " bge 0f \n"
+- " cmp %[count], #32 \n"
+- " bge 2f \n"
+- " cmp %[count], #16 \n"
+- " bge 3f \n"
+- " cmp %[count], #8 \n"
+- " bge 4f \n"
+- " b 5f \n"
+- "0: \n"
+- " mov r12, %[count], lsr #6 \n"
+- "1: \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " subs r12, r12, #1 \n"
+- " bne 1b \n"
+- " ands %[count], %[count], #0x3f \n"
+- " beq 7f \n"
+- "2: \n"
+- " cmp %[count], #32 \n"
+- " blt 3f \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " subs %[count], %[count], #32 \n"
+- " beq 7f \n"
+- "3: \n"
+- " cmp %[count], #16 \n"
+- " blt 4f \n"
+- " vst1.16 {q8, q9}, [%[dst]]! \n"
+- " subs %[count], %[count], #16 \n"
+- " beq 7f \n"
+- "4: \n"
+- " cmp %[count], #8 \n"
+- " blt 5f \n"
+- " vst1.16 {q8}, [%[dst]]! \n"
+- " subs %[count], %[count], #8 \n"
+- " beq 7f \n"
+- "5: \n"
+- " cmp %[count], #4 \n"
+- " blt 6f \n"
+- " vst1.16 {d16}, [%[dst]]! \n"
+- " subs %[count], %[count], #4 \n"
+- " beq 7f \n"
+- "6: \n"
+- " cmp %[count], #0 \n"
+- " blt 7f \n"
+- " lsls %[count], #31 \n"
+- " strmih %[value], [%[dst]], #2 \n"
+- " strcsh %[value], [%[dst]], #2 \n"
+- " strcsh %[value], [%[dst]], #2 \n"
+- "7: \n"
+- // Clobbered input registers
+- : [dst] "+r" (dst), [count] "+r" (count)
+- // Unclobbered input
+- : [value] "r" (value)
+- // Clobbered registers
+- : "q8", "q9", "r12", "cc", "memory"
+- );
+-}
+-
+-void memset32(uint32_t dst[], uint32_t value, int count)
+-{
+- asm volatile(
+- " pld [%[dst], #0] \n"
+- " cmp %[count], #4 \n"
+- " blt 5f \n"
+- " vdup.u32 q8, %[value] \n"
+- " vmov q9, q8 \n"
+- " cmp %[count], #32 \n"
+- " bge 0f \n"
+- " cmp %[count], #16 \n"
+- " bge 2f \n"
+- " cmp %[count], #8 \n"
+- " bge 3f \n"
+- " b 4f \n"
+- "0: \n"
+- " mov r12, %[count], lsr #5 \n"
+- "1: \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " pld [%[dst], #0] \n"
+- " subs r12, r12, #1 \n"
+- " bne 1b \n"
+- " ands %[count], %[count], #0x1f \n"
+- " beq 6f \n"
+- "2: \n"
+- " cmp %[count], #16 \n"
+- " blt 3f \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " subs %[count], %[count], #16 \n"
+- " beq 6f \n"
+- "3: \n"
+- " cmp %[count], #8 \n"
+- " blt 4f \n"
+- " vst1.32 {q8, q9}, [%[dst]]! \n"
+- " subs %[count], %[count], #8 \n"
+- " beq 6f \n"
+- "4: \n"
+- " cmp %[count], #4 \n"
+- " blt 5f \n"
+- " vst1.32 {q8}, [%[dst]]! \n"
+- " subs %[count], %[count], #4 \n"
+- " beq 6f \n"
+- "5: \n"
+- " cmp %[count], #0 \n"
+- " beq 6f \n"
+- " lsls %[count], #31 \n"
+- " strmi %[value], [%[dst]], #4 \n"
+- " strcs %[value], [%[dst]], #4 \n"
+- " strcs %[value], [%[dst]], #4 \n"
+- "6: @end \n"
+- // Clobbered input registers
+- : [dst] "+r" (dst), [count] "+r" (count)
+- // Unclobbered input
+- : [value] "r" (value)
+- // Clobbered registers
+- : "q8", "q9", "r12", "cc", "memory"
+- );
+-}
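For reference, the memset16() and memset32() routines deleted above do nothing more than fill a run of count 16-bit or 32-bit pixels with a constant value. A minimal plain-C sketch of that behaviour (illustrative only, not part of the patch; the _c-suffixed names are made up here) would be:

    #include <stdint.h>

    /* Fill count 16-bit pixels with value. */
    static void memset16_c(uint16_t *dst, uint16_t value, int count)
    {
        while (count-- > 0)
            *dst++ = value;
    }

    /* Fill count 32-bit pixels with value. */
    static void memset32_c(uint32_t *dst, uint32_t value, int count)
    {
        while (count-- > 0)
            *dst++ = value;
    }
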
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch
new file mode 100644
index 0000000000..97ad380e27
--- /dev/null
+++ b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch
@@ -0,0 +1,36 @@
+commit 18515a56822fcd9c0a71240edce97ea5623b0448
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date: Wed Feb 10 16:29:55 2010 +0100
+
+ Modify Makefile.am
+ Removed dependencies on neon
+
+diff --git git/src/Makefile.am git/src/Makefile.am
+index 8ab1856..08da5a5 100755
+--- a/src/Makefile.am
++++ b/src/Makefile.am
+@@ -12,13 +12,7 @@ MSM_DRI_SRCS += msm-drm.c msm-dri2.c
+ msm_drv_la_LIBADD += $(DRI2_LIBS)
+ endif
+
+-NEON_CFLAGS=-march=armv7-a -mfpu=neon -mfloat-abi=softfp
+-NEON_CCASFLAGS=$(NEON_CFLAGS) -mthumb-interwork
+-NEON_ASFLAGS=-k -mcpu=cortex-a8 $(NEON_CCASFLAGS)
+-
+-AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ $(NEON_CFLAGS) -Wall -Werror
+-AM_ASFLAGS = $(NEON_ASFLAGS)
+-AM_CCASFLAGS = $(NEON_CCASFLAGS)
++AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ -Wall -Werror
+
+ msm_drv_la_LTLIBRARIES = msm_drv.la
+ msm_drv_la_LDFLAGS = -module -avoid-version
+@@ -37,9 +31,6 @@ msm_drv_la_SOURCES = \
+ msm-swfill.c \
+ msm-hwrender.c \
+ msm-pixmap.c \
+- neon_memsets.c \
+- neon_memcpy.S \
+- neon_memmove.S \
+ $(MSM_DRI_SRCS)
+
+
diff --git a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch
new file mode 100644
index 0000000000..90dd31f605
--- /dev/null
+++ b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch
@@ -0,0 +1,116 @@
+commit cc83ba5835d5b55347fd0c0775156493b0cf3a15
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date: Thu Feb 11 16:26:52 2010 +0100
+
+ Renaming variables to get Xorg (xf86-video-msm) working
+ under linux-leviathan (htcdream):
+ cd src
+ sed 's/fixed_info/fix/' -i *.h
+ sed 's/fixed_info/fix/' -i *.c
+
+diff --git git/src/msm-dri.c git/src/msm-dri.c
+index a51d3bd..a74368b 100644
+--- git/src/msm-dri.c
++++ git/src/msm-dri.c
+@@ -151,10 +151,10 @@ MSMDRIScreenInit(ScreenPtr pScreen)
+ pDRIInfo->ddxDriverMinorVersion = 0;
+ pDRIInfo->ddxDriverPatchVersion = 0;
+
+- pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fixed_info.smem_start;
++ pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fix.smem_start;
+
+- pDRIInfo->frameBufferSize = pMsm->fixed_info.smem_len;
+- pDRIInfo->frameBufferStride = pMsm->fixed_info.line_length;
++ pDRIInfo->frameBufferSize = pMsm->fix.smem_len;
++ pDRIInfo->frameBufferStride = pMsm->fix.line_length;
+
+ /* FIXME: How many drawables can we do (should we do)? */
+
+diff --git git/src/msm-driver.c git/src/msm-driver.c
+index 803197f..15378f8 100755
+--- git/src/msm-driver.c
++++ git/src/msm-driver.c
+@@ -399,7 +399,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+
+ /* Get the fixed info (par) structure */
+
+- if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fixed_info)) {
++ if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fix)) {
+ xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+ "Unable to read hardware info from %s: %s\n",
+ dev, strerror(errno));
+@@ -410,7 +410,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+ /* Parse the ID and figure out what version of the MDP and what
+ * panel ID we have */
+
+- if (sscanf(pMsm->fixed_info.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
++ if (sscanf(pMsm->fix.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
+
+ xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+ "Unable to determine the MDP and panel type\n");
+@@ -435,7 +435,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+ * the fbdev driver to allocate memory. In the mean time, we
+ * just reuse the framebuffer memory */
+
+- pScrn->videoRam = pMsm->fixed_info.smem_len;
++ pScrn->videoRam = pMsm->fix.smem_len;
+
+ /* Get the current screen setting */
+ if (ioctl(pMsm->fd, FBIOGET_VSCREENINFO, &pMsm->mode_info)) {
+@@ -671,8 +671,8 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+ /* The framebuffer driver should always report the line length,
+ * but in case it doesn't, we can calculate it ourselves */
+
+- if (pMsm->fixed_info.line_length) {
+- pScrn->displayWidth = pMsm->fixed_info.line_length;
++ if (pMsm->fix.line_length) {
++ pScrn->displayWidth = pMsm->fix.line_length;
+ } else {
+ pScrn->displayWidth = pMsm->mode_info.xres_virtual *
+ pMsm->mode_info.bits_per_pixel / 8;
+@@ -811,7 +811,7 @@ MSMCloseScreen(int scrnIndex, ScreenPtr pScreen)
+ #endif
+
+ /* Unmap the framebuffer memory */
+- munmap(pMsm->fbmem, pMsm->fixed_info.smem_len);
++ munmap(pMsm->fbmem, pMsm->fix.smem_len);
+
+ pScreen->CloseScreen = pMsm->CloseScreen;
+
+@@ -857,7 +857,7 @@ MSMScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
+ #endif // defined (MSMFB_GET_PAGE_PROTECTION) && defined (MSMFB_SET_PAGE_PROTECTION)
+
+ /* Map the framebuffer memory */
+- pMsm->fbmem = mmap(NULL, pMsm->fixed_info.smem_len,
++ pMsm->fbmem = mmap(NULL, pMsm->fix.smem_len,
+ PROT_READ | PROT_WRITE, MAP_SHARED, pMsm->fd, 0);
+
+ /* If we can't map the memory, then this is a short trip */
+diff --git git/src/msm-exa.c git/src/msm-exa.c
+index 301923f..ce16a93 100755
+--- git/src/msm-exa.c
++++ git/src/msm-exa.c
+@@ -740,8 +740,8 @@ MSMSetupExa(ScreenPtr pScreen)
+ pExa->flags = EXA_OFFSCREEN_PIXMAPS;
+
+ pExa->offScreenBase =
+- (pMsm->fixed_info.line_length * pMsm->mode_info.yres);
+- pExa->memorySize = pMsm->fixed_info.smem_len;
++ (pMsm->fix.line_length * pMsm->mode_info.yres);
++ pExa->memorySize = pMsm->fix.smem_len;
+
+ /* Align pixmap offsets along page boundaries */
+ pExa->pixmapOffsetAlign = 4096;
+diff --git git/src/msm.h git/src/msm.h
+index e1e2bc7..520d390 100755
+--- git/src/msm.h
++++ git/src/msm.h
+@@ -85,7 +85,7 @@ typedef struct _MSMRec
+ int fd;
+
+ /* Fixed and var strutures from the framebuffer */
+- struct fb_fix_screeninfo fixed_info;
++ struct fb_fix_screeninfo fix;
+ struct fb_var_screeninfo mode_info;
+
+ /* Pointer to the mapped framebuffer memory */
diff --git a/recipes/xorg-driver/xf86-video-msm_git.bb b/recipes/xorg-driver/xf86-video-msm_git.bb
index faccb87e35..4723b867f0 100644
--- a/recipes/xorg-driver/xf86-video-msm_git.bb
+++ b/recipes/xorg-driver/xf86-video-msm_git.bb
@@ -8,9 +8,15 @@ SRCREV = "5f7df59155ae301a3ebc40aec22ed16d203cb5fc"
PV = "1.1.0+${PR}+gitr${SRCREV}"
PE = "1"
-SRC_URI = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git\
- "
+SRC_URI = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git"
+SRC_URI_htcdream = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git \
+ file://no_neon.patch;patch=1 \
+ file://no_neon_flags.patch;patch=1 \
+ file://renaming_variables.patch;patch=1"
S = "${WORKDIR}/git"
CFLAGS += " -I${STAGING_INCDIR}/xorg "
+CFLAGS += " -Wno-error "
+
+ARM_INSTRUCTION_SET="arm"