author      Frans Meulenbroeks <fransmeulenbroeks@gmail.com>    2010-10-10 13:08:14 +0200
committer   Frans Meulenbroeks <fransmeulenbroeks@gmail.com>    2010-10-10 13:08:14 +0200
commit      067e6a9f69e91cf1d1fe46c972189791da1a8a7a (patch)
tree        5055c1acdf03f0eb1a26a2344ab0cb00f9cc475f /recipes/xorg-driver
parent      65abd0ee8631cad3fcd984a56bbcafb084cbbb54 (diff)
download    openembedded-067e6a9f69e91cf1d1fe46c972189791da1a8a7a.tar.gz
xorg-driver: moved unused files to obsolete dir
Signed-off-by: Frans Meulenbroeks <fransmeulenbroeks@gmail.com>
Diffstat (limited to 'recipes/xorg-driver')
-rw-r--r--  recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch      22
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/no_neon.patch                  2901
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch              36
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/renaming_variables.patch        116
4 files changed, 0 insertions, 3075 deletions
diff --git a/recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch b/recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch
deleted file mode 100644
index e989717d3b..0000000000
--- a/recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-Index: xf86-input-tslib-0.0.6/src/tslib.c
-===================================================================
---- xf86-input-tslib-0.0.6.orig/src/tslib.c 2009-11-29 20:03:29.734794324 +0000
-+++ xf86-input-tslib-0.0.6/src/tslib.c 2009-11-29 20:29:24.066794215 +0000
-@@ -205,7 +205,7 @@
- */
- switch (priv->state) {
- case BUTTON_EMULATION_OFF :
-- if(priv->lastp != samp.pressure) {
-+ if(!!priv->lastp != !!samp.pressure) {
- priv->lastp = samp.pressure;
- xf86PostButtonEvent(local->dev, TRUE,
- 1, !!samp.pressure, 0, 2,
-@@ -512,7 +512,7 @@
- s = xf86CheckStrOption(dev->commonOptions, "path", NULL);
- if (!s)
- s = xf86CheckStrOption(dev->commonOptions, "Device", NULL);
--
-+
- priv->ts = ts_open(s, 1);
- xfree(s);
-
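The deleted dontfloodevents006.patch above changed the tslib button-emulation test from comparing raw pressure values to comparing their boolean forms with !!, so a button event is posted only when the pressure crosses zero (touch down or lift off), not on every pressure fluctuation. A minimal standalone sketch of that guard follows; handle_sample and the printf harness are hypothetical, while the real driver keeps the previous value in priv->lastp and calls xf86PostButtonEvent().

    #include <stdio.h>

    /* Previous pressure sample; the driver stores this in priv->lastp. */
    static int last_pressure = 0;

    static void handle_sample(int pressure)
    {
        /* !! collapses pressure to 0 or 1, so the comparison differs only on a
         * 0 -> non-zero or non-zero -> 0 transition, which keeps small pressure
         * changes while the pen stays down from flooding the event queue. */
        if (!!last_pressure != !!pressure)
            printf("button %s\n", pressure ? "press" : "release");
        last_pressure = pressure;
    }

    int main(void)
    {
        int samples[] = { 0, 120, 130, 140, 0, 0, 90 };
        for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; i++)
            handle_sample(samples[i]);
        return 0;   /* prints press, release, press: three events for seven samples */
    }
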
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
deleted file mode 100644
index c0aa92e76a..0000000000
--- a/recipes/xorg-driver/xf86-video-msm/no_neon.patch
+++ /dev/null
@@ -1,2901 +0,0 @@
-commit d8910bf773fbecf7cdea359d4b530a3672e27180
-Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
-Date: Wed Feb 10 16:18:39 2010 +0100
-
- Removed neon because it's not available in our kernel
- and so it's causing trouble (Illegal instruction)
-
-diff --git git/src/msm-swblits.h git/src/msm-swblits.h
-index f89f00e..a40b24b 100755
---- git/src/msm-swblits.h
-+++ git/src/msm-swblits.h
-@@ -38,16 +38,6 @@
- #include <stdint.h>
- #include <stdlib.h>
-
--/* Neon intrinsics are part of the ARM or GCC compiler used. */
--/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */
--#include <arm_neon.h>
--
--/* These are NEON-optimized functions linked to by various tests. */
--extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes);
--extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes);
--extern void memset16(uint16_t *dst, uint16_t value, int count);
--extern void memset32(uint32_t *dst, uint32_t value, int count);
--
- /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */
- #define BITS_PER_BYTE (8)
- #define BYTES_PER_16BPP_PIXEL (2)
-diff --git git/src/msm-swfill.c git/src/msm-swfill.c
-index 108fd94..3dd1ef2 100755
---- git/src/msm-swfill.c
-+++ git/src/msm-swfill.c
-@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou
- }
- }
-
--
-+/*
- static inline void
- memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
- {
-@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
- // Quickly fill remaining pixels (up to 7).
- memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
- }
--
-+*/
-
- static inline void
- memset16_Test(uint16_t *dst, uint16_t src, int count)
-@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count)
-
- // Copy remaining pixels using Neon and non-Neon instructions.
- // NOTE: This assumes that dst is aligned optimally for Neon instructions.
-- memset16_AssumesNeonAlignment((void *) dst, src, count);
-+ //memset16_AssumesNeonAlignment((void *) dst, src, count);
-+ memset((void *) dst, src, count);
- }
- }
-
-@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp
- if (w < 32) {
- // For narrow rectangles, block signals only once for the entire rectangles.
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
-+ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
-+ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else {
- // For wider rectangles, block and unblock signals for every row.
-- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- }
-
-diff --git git/src/msm-swrender.c git/src/msm-swrender.c
-index a7a9abc..835dc03 100755
---- git/src/msm-swrender.c
-+++ git/src/msm-swrender.c
-@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src
- }
- }
- break;
-- case 7: if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
-- return TRUE;
-- } else {
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
-- return TRUE;
-- }
-- break;
-- case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
-- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
-- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
-- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
-- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
-- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
-- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
-- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
-- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
-- return TRUE;
-- }
-- else {
-- uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
-- uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
-- uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
-- uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T);
-- uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T);
-- uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T);
-- uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T);
-- uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T);
-- *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1;
-- *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2;
-- *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3;
-- *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4;
-- *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5;
-- *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6;
-- *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7;
-- *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8;
-- return TRUE;
-- }
-- break;
-- case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
-- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
-- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
-- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
-- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
-- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
-- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
-- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
-- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
-- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
-- uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T);
-- uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T);
-- uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T);
-- uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
-- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
-- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
-- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
-- *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5;
-- *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6;
-- *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7;
-- *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8;
-- return TRUE;
-- }
-- else {
-- // Don't bother unrolling loops here, since that won't help for more than around 8 operations.
-- // Instead, just call multiple fixed functions.
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
-- } else {
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
-- }
-- return TRUE;
-- }
-- break;
-- case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
-- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
-- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
-- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
-- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
-- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
-- uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T);
-- uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T);
-- uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T);
-- uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
-- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
-- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
-- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
-- *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5;
-- *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6;
-- *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7;
-- *(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8;
-- return TRUE;
-- }
-- break;
-- case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T));
-- uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T));
-- uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T));
-- uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
-- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
-- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
-- vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5);
-- vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6);
-- vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7);
-- vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8);
-- return TRUE;
-- }
-- break;
- }
-
- return FALSE;
-@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr
- }
- return TRUE;
- break;
-- case 7: if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- break;
-- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
-- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
-- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- *(uint16_t *) (dst+0*dpitch+0) = src1a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
-- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
-- *(uint16_t *) (dst+1*dpitch+0) = src1b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
-- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
-- return TRUE;
-- }
-- else {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
-- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a;
-- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a;
-- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a;
-- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a;
-- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a;
-- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a;
-- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a;
-- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a;
-- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b;
-- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b;
-- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b;
-- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b;
-- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b;
-- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b;
-- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b;
-- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b;
-- return TRUE;
-- }
-- break;
-- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- *(uint32_t *) (dst+0*dpitch+0) = src1a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a;
-- *(uint32_t *) (dst+1*dpitch+0) = src1b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
-- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a;
-- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a;
-- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a;
-- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a;
-- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
-- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b;
-- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b;
-- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b;
-- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b;
-- return TRUE;
-- }
-- else {
-- // Don't bother unrolling loops, since that won't help for more than around 8 operations.
-- // Instead, just call multiple fixed functions.
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- break;
-- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T);
-- uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T);
-- uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T);
-- uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T);
-- uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T);
-- uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T);
-- uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
-- *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a;
-- *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a;
-- *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a;
-- *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
-- *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b;
-- *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b;
-- *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b;
-- *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- break;
-- case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T));
-- uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T));
-- uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T));
-- uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T));
-- uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T));
-- uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T));
-- uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b);
-- return TRUE;
-- }//HERE
-- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- break;
- }
-
- return FALSE;
-@@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr
- }
- return TRUE;
- break;
-- case 7: if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- break;
-- // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons?
-- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases...
-- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c;
-- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c;
-- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d;
-- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
-- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
-- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
-- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c;
-- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c;
-- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c;
-- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c;
-- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d;
-- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d;
-- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d;
-- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
-- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
-- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- *(uint16_t *) (dst+0*dpitch+0) = src1a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
-- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
-- *(uint16_t *) (dst+1*dpitch+0) = src1b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
-- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
-- *(uint16_t *) (dst+2*dpitch+0) = src1c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c;
-- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c;
-- *(uint16_t *) (dst+3*dpitch+0) = src1d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d;
-- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d;
-- return TRUE;
-- }
-- else {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
-- uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T);
-- uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T);
-- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a;
-- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a;
-- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a;
-- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a;
-- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a;
-- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a;
-- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a;
-- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a;
-- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b;
-- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b;
-- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b;
-- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b;
-- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b;
-- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b;
-- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b;
-- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b;
-- *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c;
-- *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c;
-- *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c;
-- *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T) = src4c;
-- *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T) = src5c;
-- *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T) = src6c;
-- *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T) = src7c;
-- *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T) = src8c;
-- *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d;
-- *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d;
-- *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d;
-- *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T) = src4d;
-- *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T) = src5d;
-- *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T) = src6d;
-- *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T) = src7d;
-- *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T) = src8d;
-- return TRUE;
-- }
-- break;
-- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3d = *(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
-- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c;
-- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c;
-- *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T) = src3c;
-- *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T) = src4c;
-- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d;
-- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d;
-- *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T) = src3d;
-- *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T) = src4d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- uint32_t src1c = *(uint32_t *) (src+2*spitch+0);
-- uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- uint32_t src1d = *(uint32_t *) (src+3*spitch+0);
-- uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- *(uint32_t *) (dst+0*dpitch+0) = src1a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a;
-- *(uint32_t *) (dst+1*dpitch+0) = src1b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b;
-- *(uint32_t *) (dst+2*dpitch+0) = src1c;
-- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2c;
-- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3c;
-- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5c;
-- *(uint32_t *) (dst+3*dpitch+0) = src1d;
-- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2d;
-- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3d;
-- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
-- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T);
-- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
-- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a;
-- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a;
-- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a;
-- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a;
-- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
-- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b;
-- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b;
-- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b;
-- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b;
-- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c;
-- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c;
-- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c;
-- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c;
-- *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T) = src5c;
-- *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T) = src6c;
-- *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T) = src7c;
-- *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T) = src8c;
-- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d;
-- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d;
-- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d;
-- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d;
-- *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T) = src5d;
-- *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T) = src6d;
-- *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T) = src7d;
-- *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T) = src8d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-- uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-- uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-- uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-- uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-- uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-- uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-- uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
-- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-- uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-- uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-- uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-- uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
-- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-- uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-- uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-- uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-- *(uint16_t *) (dst+0*dpitch+0) = src1a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8a;
-- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9a;
-- *(uint16_t *) (dst+1*dpitch+0) = src1b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8b;
-- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9b;
-- *(uint16_t *) (dst+2*dpitch+0) = src1c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8c;
-- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9c;
-- *(uint16_t *) (dst+3*dpitch+0) = src1d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8d;
-- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9d;
-- return TRUE;
-- }
-- else {
-- // Don't bother unrolling loops, since that won't help for more than around 8 operations.
-- // Instead, just call multiple fixed functions.
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- break;
-- // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons?
-- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here...
-- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d);
-- return TRUE;
-- }
-- break;
-- }
-+ }
-
- return FALSE;
- }
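The fast paths removed above all branch on SW_CHECK_PITCHED_ALIGNMENT before picking a load/store width; that macro's definition is not part of this patch, so what follows is only a plausible C sketch of such a check, under a hypothetical helper name. The idea is that a wide access at a fixed byte offset is safe on every row only when both base pointers and both pitches keep that offset aligned.

#include <stdint.h>

/* Hypothetical stand-in for SW_CHECK_PITCHED_ALIGNMENT (not the driver's
 * actual macro): nonzero when dst+offset and src+offset are aligned to
 * 'align' bytes and both pitches are multiples of 'align', so the same
 * wide load/store stays aligned on every row of the rectangle. */
static int pitched_alignment_ok(const unsigned char *dst,
                                const unsigned char *src,
                                int dpitch, int spitch,
                                uintptr_t align, uintptr_t offset)
{
    return ((uintptr_t)(dst + offset) % align) == 0 &&
           ((uintptr_t)(src + offset) % align) == 0 &&
           ((uintptr_t)dpitch % align) == 0 &&
           ((uintptr_t)spitch % align) == 0;
}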
-@@ -1924,10 +872,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
- if (rowsOverlap)
- {
- if (w > 64) {
-- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- else if (w == 64) {
-- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- else {
- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
-@@ -1936,10 +886,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
- else
- {
- if (w > 64) {
-- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- else if (w == 64) {
-- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- else {
- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
-@@ -1973,7 +925,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
- if (xdir >= 0 || !rowsOverlap) {
- if (w >= 128) {
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- neon_memcpy(dst, src, w);
-+ //neon_memcpy(dst, src, w);
-+ memcpy(dst, src, w);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else
-@@ -1982,7 +935,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
- else {
- if (w >= 128) {
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- neon_memmove(dst, src, w);
-+ //neon_memmove(dst, src, w);
-+ memmove(dst, src, w);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else
-@@ -2029,7 +983,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
- if (xdir >= 0 || !rowsOverlap) {
- if (w >= 42) {
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
-+ //neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
-+ memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else
-@@ -2038,7 +993,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
- else {
- if (w >= 42) {
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
-+ //neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
-+ memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else
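The C hunks above swap the NEON copy routines for the C library's memcpy and memmove while keeping the existing rule for choosing between them: memmove when a source row and its destination row may share bytes, memcpy otherwise. A minimal sketch of that pattern follows; copy_rows and rows_overlap are hypothetical names standing in for the driver's DRAW_MULTIPLE_ROWS_* machinery, not its actual code.

#include <stddef.h>
#include <string.h>

/* Hypothetical helper: nonzero when the first source and destination rows
 * share any bytes, in which case only memmove copies correctly. */
static int rows_overlap(const unsigned char *dst, const unsigned char *src,
                        size_t row_bytes)
{
    return dst < src + row_bytes && src < dst + row_bytes;
}

/* Copy 'rows' rows of 'row_bytes' bytes each, stepping by the pitches. */
static void copy_rows(unsigned char *dst, const unsigned char *src,
                      size_t row_bytes, int rows, int dpitch, int spitch)
{
    int overlap = rows_overlap(dst, src, row_bytes);
    int y;

    for (y = 0; y < rows; y++) {
        if (overlap)
            memmove(dst, src, row_bytes);   /* rows may alias: memmove is safe */
        else
            memcpy(dst, src, row_bytes);    /* disjoint rows: memcpy is enough */
        dst += dpitch;
        src += spitch;
    }
}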
-diff --git git/src/neon_memcpy.S git/src/neon_memcpy.S
-deleted file mode 100644
-index 5ecc5ce..0000000
---- git/src/neon_memcpy.S
-+++ /dev/null
-@@ -1,549 +0,0 @@
--/***************************************************************************
-- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions are met:
-- * Redistributions of source code must retain the above copyright
-- notice, this list of conditions and the following disclaimer.
-- * Redistributions in binary form must reproduce the above copyright
-- notice, this list of conditions and the following disclaimer in the
-- documentation and/or other materials provided with the distribution.
-- * Neither the name of Code Aurora nor the names of its contributors may
-- be used to endorse or promote products derived from this software
-- without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-- POSSIBILITY OF SUCH DAMAGE.
-- ***************************************************************************/
--
--/***************************************************************************
-- Neon memcpy: Attempts to do a memcpy with Neon registers if possible,
-- Inputs:
-- dest: The destination buffer
-- src: The source buffer
-- n: The size of the buffer to transfer
-- Outputs:
--
--***************************************************************************/
--
--/*
-- * General note:
-- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
-- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
-- * the correct object code for VPOP, resulting in horrific stack crashes.
-- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
-- * and VPOP->VLDMIA. We can revert this back once we update our toolchain.
-- *
-- * Also, VSHL swaps the source register and the shift-amount register
-- * around in 2006-q3. I've coded this incorrectly so it turns out correct
-- * in the object code, but we'll need to undo that later...
-- */
--
-- .code 32
-- .align 4
-- .globl neon_memcpy
-- .func
--
--neon_memcpy:
-- /*
-- * First, make sure we're not copying < 4 bytes. If so, we'll
-- * just handle it here.
-- */
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r0}
--#else
-- push {r0}
--#endif
-- cmp r2, #4
-- bgt neon_gt_4
-- /* Copy 0-4 bytes, if needed, and return.*/
-- cmp r2, #0
--neon_smallcopy_loop:
-- beq neon_smallcopy_done
-- ldrb r12, [r1], #1
-- subs r2, r2, #1
-- strb r12, [r0], #1
-- b neon_smallcopy_loop
--neon_smallcopy_done:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r0}
--#else
-- pop {r0}
--#endif
-- bx lr
--
-- /* Copy 4 or more bytes*/
--neon_gt_4:
-- /* Preload what we can...*/
-- pld [r0,#0]
-- pld [r1,#0]
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r4-r5}
--#else
-- push {r4-r5}
--#endif
--
--neon_check_align:
-- /* Check normal word alignment for target. */
-- ands r12, r0, #0x3
-- beq source_alignment_check
--
-- /*
-- * Target is not aligned. Step through until we get that
-- * word-aligned. This works better than a loop, according
-- * to our pipeline modeler.
-- */
-- cmp r12, #2
-- ldrb r3, [r1], #1
-- ldrleb r4, [r1], #1
-- ldrltb r5, [r1], #1
-- rsb r12, r12, #4
-- sub r2, r2, r12
-- strb r3, [r0], #1
-- strleb r4, [r0], #1
-- strltb r5, [r0], #1
--
--source_alignment_check:
-- ands r12, r1, #0x3
-- bne neon_memcpy_nonaligned /* Source is not word aligned.*/
--neon_try_16_align:
-- cmp r2, #64
-- blt neon_align_route
-- /* This is where we try 16-byte alignment. */
-- ands r12, r0, #0xf
-- beq neon_align_route
-- rsb r12, r12, #16
--neon_16_start:
-- sub r2, r2, r12
-- lsrs r3, r12, #2
--neon_align_16_4:
-- ldr r4, [r1], #4
-- subs r3, r3, #1
-- str r4, [r0], #4
-- bne neon_align_16_4
--neon_align_route:
-- /* In this case, both source and target are word-aligned. */
-- cmp r2, #32768
-- bge neon_copy_128p_a
-- cmp r2, #256
-- bge neon_copy_128_a
-- cmp r2, #64
-- bge neon_copy_32_a
-- b neon_copy_finish_a
-- nop
--neon_copy_128p_a:
-- /* We'll copy blocks 128-bytes at a time, but try to call pld to
-- * load in the next page, if possible.
-- */
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4-q7}
--#else
-- vpush {q4-q7}
--#endif
-- mov r12, r2, lsr #7
--neon_copy_128p_loop_a:
-- vld1.32 {q0, q1}, [r1]!
-- vld1.32 {q2, q3}, [r1]!
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- pld [r1, #0]
-- pld [r1, #1024]
-- vst1.32 {q0, q1}, [r0]!
-- vst1.32 {q2, q3}, [r0]!
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- subs r12, r12, #1
-- bne neon_copy_128p_loop_a
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4-q7}
--#else
-- vpop {q4-q7}
--#endif
-- ands r2, r2, #0x7f
-- beq neon_end
-- cmp r2, #32
-- blt neon_copy_finish_a
-- b neon_copy_32_a
-- /* Copy blocks of 128-bytes (word-aligned) at a time*/
--neon_copy_128_a:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4-q7}
--#else
-- vpush {q4-q7}
--#endif
-- /*
-- * Move to a 1-s based countdown to determine when to loop. That
-- * allows the subs to set the Z flag without having to explicitly
-- * call cmp to a value.
-- */
-- mov r12, r2, lsr #7
--neon_copy_128_loop_a:
-- vld1.32 {q0, q1}, [r1]!
-- vld1.32 {q2, q3}, [r1]!
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- pld [r1, #0]
-- pld [r1, #128]
-- vst1.32 {q0, q1}, [r0]!
-- vst1.32 {q2, q3}, [r0]!
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- subs r12, r12, #1
-- pld [r0, #0]
-- pld [r0, #128]
-- bne neon_copy_128_loop_a
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4-q7}
--#else
-- vpop {q4-q7}
--#endif
-- ands r2, r2, #0x7f
-- beq neon_end
-- cmp r2, #32
-- blt neon_copy_finish_a
-- /* Copy blocks of 32-bytes (word aligned) at a time*/
--neon_copy_32_a:
-- mov r12, r2, lsr #5
--neon_copy_32_loop_a:
-- vld1.32 {q0,q1}, [r1]!
-- subs r12, r12, #1
-- pld [r1,#0]
-- vst1.32 {q0,q1}, [r0]!
-- bne neon_copy_32_loop_a
-- ands r2, r2, #0x1f
-- beq neon_end
--neon_copy_finish_a:
--neon_copy_16_a:
-- movs r12, r2, lsr #4
-- beq neon_copy_8_a
--neon_copy_16_a_loop:
-- vld1.32 {q0}, [r1]!
-- subs r12, r12, #1
-- vst1.32 {q0}, [r0]!
-- bne neon_copy_16_a_loop
-- ands r2, r2, #0xf
-- beq neon_end
--neon_copy_8_a:
-- cmp r2, #8
-- blt neon_copy_4_a
-- ldm r1!, {r4-r5}
-- subs r2, r2, #8
-- stm r0!, {r4-r5}
-- /* Copy 4-bytes of word-aligned data at a time*/
--neon_copy_4_a:
-- cmp r2, #4
-- blt neon_copy_finish
-- ldr r4, [r1], #4
-- subs r2, r2, #4
-- str r4, [r0], #4
-- b neon_copy_finish
--
-- /*
-- * Handle unaligned data. The basic concept here is that we'll
-- * try to pull out enough data from the source to get that word-
-- * aligned, then do our writes word-aligned, storing the difference
-- * in a register, and shifting the data as needed.
-- */
--neon_memcpy_nonaligned:
-- /*
-- * If this is <8 bytes, it makes more sense to just copy it
-- * quickly instead of incurring all kinds of overhead.
-- */
-- cmp r2, #8 /* Let's try this...*/
-- ble neon_copy_finish
-- /*
-- * This is where we'll pull out either 1, 2, or 3 bytes of data
-- * from the source as needed to align it, then store off those
-- * bytes in r4. When we read in the (now) aligned data from the
-- * source, we'll shift the bytes and AND in the r4 data, then write
-- * to the target aligned.
-- *
-- * The conditional ldr calls work slightly faster than the
-- * previous method, confirmed by our pipeline modeler.
-- */
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r6-r9}
--#else
-- push {r6-r9}
--#endif
-- cmp r12, #2
-- ldrb r4, [r1], #1
-- ldrleb r5, [r1], #1
-- ldrltb r6, [r1], #1
-- rsb r8, r12, #4
-- sub r2, r2, r8
-- lsl r8, r8, #3
-- orrle r4, r4, r5, lsl #8
-- orrlt r4, r4, r6, lsl #16
-- rsb r9, r8, #32
--
-- cmp r2, #64
-- blt neon_unaligned_route
-- ands r12, r0, #0xf
-- beq neon_unaligned_route
-- rsb r12, r12, #16
--neon_16_start_u:
-- sub r2, r2, r12
-- lsrs r6, r12, #2
--neon_align_16_4_u:
-- ldr r5, [r1], #4
-- subs r6, r6, #1
-- orr r4, r4, r5, lsl r8
-- str r4, [r0], #4
-- mov r4, r5, lsr r9
-- bne neon_align_16_4_u
--neon_unaligned_route:
-- /* Decide which loop block to branch to.*/
-- cmp r2, #256
-- bge neon_copy_64_u
-- cmp r2, #64
-- bge neon_copy_32_u
-- b neon_copy_finish_u
-- /* Copy data in 64-byte blocks.*/
--neon_copy_64_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
-- vstmdb sp!, {q5-q8}
--#else
-- vpush {q4}
-- vpush {q5-q8}
--#endif
-- /* We'll need this for the q register shift later.*/
-- vdup.u32 q8, r8
-- /*
-- * As above, we determine how many times we can go through the
-- * 64-byte copy loop, then countdown.
-- */
-- mov r12, r2, lsr #6
-- and r2, r2, #0x3f
--neon_copy_64_u_loop:
-- /* Load 64-bytes into q4-q7.*/
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- /*
-- * Shift q0-q3 right so everything but the data we need due to the
-- * alignment falls off the right-hand side. The branching
-- * is needed, since vshr requires the shift to be an immediate
-- * value.
-- */
-- lsls r5, r8, #28
-- bcc neon_copy_64_u_b8
-- bpl neon_copy_64_u_b16
-- vshr.u64 q0, q4, #40
-- vshr.u64 q1, q5, #40
-- vshr.u64 q2, q6, #40
-- vshr.u64 q3, q7, #40
-- b neon_copy_64_unify
--neon_copy_64_u_b8:
-- vshr.u64 q0, q4, #56
-- vshr.u64 q1, q5, #56
-- vshr.u64 q2, q6, #56
-- vshr.u64 q3, q7, #56
-- b neon_copy_64_unify
--neon_copy_64_u_b16:
-- vshr.u64 q0, q4, #48
-- vshr.u64 q1, q5, #48
-- vshr.u64 q2, q6, #48
-- vshr.u64 q3, q7, #48
--neon_copy_64_unify:
-- /*
-- * Shift q4-q7 left by r8 bits to take the alignment into
-- * account.
-- */
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q4, q8, q4
-- vshl.u64 q5, q8, q5
-- vshl.u64 q6, q8, q6
-- vshl.u64 q7, q8, q7
--#else
-- vshl.u64 q4, q4, q8
-- vshl.u64 q5, q5, q8
-- vshl.u64 q6, q6, q8
-- vshl.u64 q7, q7, q8
--#endif
-- /*
-- * The data in s14 will be needed for the next loop iteration. Move
-- * that to r5.
-- */
-- vmov r5, s14
-- /* We'll vorr the shifted data with the data that needs to move back.*/
-- vorr d9, d9, d0
-- /* Copy the data from the previous loop into s14.*/
-- vmov s14, r4
-- vorr d10, d10, d1
-- vorr d11, d11, d2
-- vorr d12, d12, d3
-- vorr d13, d13, d4
-- vorr d14, d14, d5
-- vorr d15, d15, d6
-- vorr d8, d8, d7
-- subs r12, r12, #1
-- pld [r1, #0]
-- pld [r1, #128]
-- /* Save off the r5 data into r4 for the next iteration.*/
-- mov r4, r5
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- pld [r0, #0]
-- pld [r0, #128]
-- bne neon_copy_64_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q5-q8}
-- vldmia sp!, {q4}
--#else
-- vpop {q5-q8}
-- vpop {q4}
--#endif
-- cmp r2, #32
-- bge neon_copy_32_u
-- b neon_copy_finish_u
--neon_copy_32_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
--#else
-- vpush {q4}
--#endif
-- vdup.u32 q4, r8
-- mov r12, r2, lsr #5
-- and r2, r2, #0x1f
--neon_copy_32_u_loop:
-- vld1.32 {q0, q1}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_copy_32_u_b8
-- bpl neon_copy_32_u_b16
-- vshr.u64 q2, q0, #40
-- vshr.u64 q3, q1, #40
-- b neon_copy_32_unify
--neon_copy_32_u_b8:
-- vshr.u64 q2, q0, #56
-- vshr.u64 q3, q1, #56
-- b neon_copy_32_unify
--neon_copy_32_u_b16:
-- vshr.u64 q2, q0, #48
-- vshr.u64 q3, q1, #48
--neon_copy_32_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q0, q4, q0
-- vshl.u64 q1, q4, q1
--#else
-- vshl.u64 q0, q0, q4
-- vshl.u64 q1, q1, q4
--#endif
-- vmov r5, s14
-- vorr d1, d1, d4
-- vmov s14, r4
-- vorr d2, d2, d5
-- vorr d3, d3, d6
-- vorr d0, d0, d7
-- subs r12, r12, #1
-- pld [r1, #0]
-- mov r4, r5
-- vst1.32 {q0, q1}, [r0]!
-- bne neon_copy_32_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4}
--#else
-- vpop {q4}
--#endif
--neon_copy_finish_u:
--neon_copy_16_u:
-- movs r12, r2, lsr #4
-- beq neon_copy_8_u
-- vdup.u32 q2, r8
-- and r2, r2, #0xf
--neon_copy_16_u_loop:
-- vld1.32 {q0}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_copy_16_u_b8
-- bpl neon_copy_16_u_b16
-- vshr.u64 q1, q0, #40
-- b neon_copy_16_unify
--neon_copy_16_u_b8:
-- vshr.u64 q1, q0, #56
-- b neon_copy_16_unify
--neon_copy_16_u_b16:
-- vshr.u64 q1, q0, #48
--neon_copy_16_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q0, q2, q0
--#else
-- vshl.u64 q0, q0, q2
--#endif
-- vmov r5, s6
-- vorr d1, d1, d2
-- vmov s6, r4
-- vorr d0, d0, d3
-- subs r12, r12, #1
-- mov r4, r5
-- vst1.32 {q0}, [r0]!
-- bne neon_copy_16_u_loop
--neon_copy_8_u:
-- cmp r2, #8
-- blt neon_copy_4_u
-- ldm r1!, {r6-r7}
-- subs r2, r2, #8
-- orr r4, r4, r6, lsl r8
-- mov r5, r6, lsr r9
-- orr r5, r5, r7, lsl r8
-- stm r0!, {r4-r5}
-- mov r4, r7, lsr r9
--neon_copy_4_u:
-- cmp r2, #4
-- blt neon_copy_last_bits_u
-- ldr r5, [r1], #4
-- subs r2, r2, #4
-- orr r4, r4, r5, lsl r8
-- str r4, [r0], #4
-- mov r4, r5, lsr r9
--neon_copy_last_bits_u:
-- /*
-- * Remember, r8 contains the size of the data in r4 in bits,
-- * so to get to bytes we'll need to shift 3 places
-- */
-- lsr r8, r8, #0x3
-- /* Write out the bytes stored in r4.*/
--neon_copy_last_bits_u_loop:
-- strb r4, [r0], #1
-- subs r8, r8, #1
-- lsrne r4, r4, #8
-- bne neon_copy_last_bits_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r6-r9}
--#else
-- pop {r6-r9}
--#endif
--neon_copy_finish:
-- cmp r2, #0
-- beq neon_end
-- /*
-- * This just copies the data from source to target one byte
-- * at a time. For some small values, this makes more sense.
-- * Note that since this code copies data a byte at a time,
-- * both the aligned and unaligned paths can use it.
-- */
--neon_copy_finish_loop:
-- ldrb r4, [r1], #1
-- subs r2, r2, #1
-- strb r4, [r0], #1
-- bne neon_copy_finish_loop
--neon_end:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r4-r5}
-- ldmia sp!, {r0}
--#else
-- pop {r4-r5}
-- pop {r0}
--#endif
-- bx lr
--
-- .endfunc
-- .end
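The unaligned path of the deleted neon_memcpy.S above is explained in its own comments: pull one to three bytes from the source so later loads are word aligned, then build each output word from the saved bytes and the next aligned word using shifts and ORs. A rough C sketch of that scheme is given below; it assumes a little-endian target (as on these ARM parts), uses hypothetical names, and leaves the final one-to-three carried bytes unwritten, so it only shows the inner loop.

#include <stdint.h>
#include <string.h>

/* Sketch of the shift-and-merge loop for a source misaligned by 1..3 bytes:
 * 'lead' bytes are gathered up front, then every aligned 32-bit word is
 * split between the current output word and a carry for the next one. */
static void merge_copy_words(unsigned char *dst, const unsigned char *src,
                             size_t nwords)
{
    unsigned misalign = (uintptr_t)src & 3;   /* assumed to be 1, 2 or 3 here */
    unsigned lead = 4 - misalign;             /* bytes read to align the source */
    unsigned lo_bits = 8 * lead;              /* 8, 16 or 24; never 0 or 32 */
    unsigned hi_bits = 32 - lo_bits;
    uint32_t carry = 0;
    uint32_t word, out;
    size_t i;

    memcpy(&carry, src, lead);                /* leading bytes: low part (LE) */
    src += lead;

    for (i = 0; i < nwords; i++) {
        memcpy(&word, src, 4);                /* source is now word aligned */
        out = carry | (word << lo_bits);      /* merge saved bytes with new word */
        memcpy(dst, &out, 4);
        carry = word >> hi_bits;              /* spill-over for the next word */
        src += 4;
        dst += 4;
    }
    /* 'lead' bytes remain in 'carry'; the real routine writes them out last. */
}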
-diff --git git/src/neon_memmove.S git/src/neon_memmove.S
-deleted file mode 100644
-index 1bfe597..0000000
---- git/src/neon_memmove.S
-+++ /dev/null
-@@ -1,939 +0,0 @@
--/***************************************************************************
-- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions are met:
-- * Redistributions of source code must retain the above copyright
-- notice, this list of conditions and the following disclaimer.
-- * Redistributions in binary form must reproduce the above copyright
-- notice, this list of conditions and the following disclaimer in the
-- documentation and/or other materials provided with the distribution.
-- * Neither the name of Code Aurora nor the names of its contributors may
-- be used to endorse or promote products derived from this software
-- without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-- POSSIBILITY OF SUCH DAMAGE.
-- ***************************************************************************/
--
--/***************************************************************************
-- * Neon memmove: Attempts to do a memmove with Neon registers if possible,
-- * Inputs:
-- * dest: The destination buffer
-- * src: The source buffer
-- * n: The size of the buffer to transfer
-- * Outputs:
-- *
-- ***************************************************************************/
--
--/*
-- * General note:
-- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
-- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
-- * the correct object code for VPOP, resulting in horrific stack crashes.
-- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
-- * and VPOP->VLDMIA. We can revert this back once we update our toolchain.
-- *
-- * Also, VSHL swaps the source register and the shift-amount register
-- * around in 2006-q3. I've coded this incorrectly so it turns out correct
-- * in the object code, but we'll need to undo that later...
-- */
-- .code 32
-- .align 4
-- .globl neon_memmove
-- .func
--
--neon_memmove:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r0}
--#else
-- push {r0}
--#endif
--
-- /*
-- * The requirements for memmove state that the function should
-- * operate as if data were being copied from the source to a
-- * buffer, then to the destination. This is to allow a user
-- * to copy data from a source and target that overlap.
-- *
-- * We can't just do byte copies front-to-back automatically, since
-- * there's a good chance we may have an overlap (why else would someone
-- * intentionally use memmove then?).
-- *
-- * We'll break this into two parts. Front-to-back, or back-to-front
-- * copies.
-- */
--neon_memmove_cmf:
-- cmp r0, r1
-- blt neon_front_to_back_copy
-- bgt neon_back_to_front_copy
-- b neon_memmove_done
--
-- /* #############################################################
-- * Front to Back copy
-- */
--neon_front_to_back_copy:
-- /*
-- * For small copies, just do a quick memcpy. We can do this for
-- * front-to-back copies, aligned or unaligned, since we're only
-- * doing 1 byte at a time...
-- */
-- cmp r2, #4
-- bgt neon_f2b_gt4
-- cmp r2, #0
--neon_f2b_smallcopy_loop:
-- beq neon_memmove_done
-- ldrb r12, [r1], #1
-- subs r2, r2, #1
-- strb r12, [r0], #1
-- b neon_f2b_smallcopy_loop
--neon_f2b_gt4:
-- /* Preload what we can...*/
-- pld [r0,#0]
-- pld [r1,#0]
-- /* The window size is in r3. */
-- sub r3, r1, r0
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r4-r6}
--#else
-- push {r4-r6}
--#endif
--
--neon_f2b_check_align:
-- /* Check alignment. */
-- ands r12, r0, #0x3
-- beq neon_f2b_source_align_check
-- cmp r12, #2
-- ldrb r4, [r1], #1
-- ldrleb r5, [r1], #1
-- ldrltb r6, [r1], #1
-- rsb r12, r12, #4
-- sub r2, r2, r12
-- strb r4, [r0], #1
-- strleb r5, [r0], #1
-- strltb r6, [r0], #1
--
--neon_f2b_source_align_check:
-- ands r12, r1, #0x3
-- bne neon_f2b_nonaligned
--
--neon_f2b_try_16_align:
-- /* If we're >64, attempt to align on 16-bytes. Smaller amounts
-- * don't seem to be worth handling. */
-- cmp r2, #64
-- blt neon_f2b_align_route
-- /* This is where we try 16-byte alignment. */
-- ands r12, r0, #0xf
-- beq neon_f2b_align_route
-- rsb r12, r12, #16
--neon_f2b_16_start:
-- sub r2, r2, r12
-- lsrs r5, r12, #2
--neon_f2b_align_16_4:
-- ldr r4, [r1], #4
-- subs r5, r5, #1
-- str r4, [r0], #4
-- bne neon_f2b_align_16_4
--neon_f2b_align_route:
-- /* #############################################################
-- * Front to Back copy - aligned
-- */
-- /*
-- * Note that we can't just route based on the size in r2. If that's
-- * larger than the overlap window in r3, we could potentially
-- * (and likely!) destroy data we're copying.
-- */
-- cmp r2, r3
-- movle r12, r2
-- movgt r12, r3
-- cmp r12, #256
-- bge neon_f2b_copy_128_a
-- cmp r12, #64
-- bge neon_f2b_copy_32_a
-- cmp r12, #16
-- bge neon_f2b_copy_16_a
-- cmp r12, #8
-- bge neon_f2b_copy_8_a
-- cmp r12, #4
-- bge neon_f2b_copy_4_a
-- b neon_f2b_copy_1_a
--neon_f2b_copy_128_a:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4-q7}
--#else
-- vpush {q4-q7}
--#endif
-- mov r12, r2, lsr #7
--neon_f2b_copy_128_a_loop:
-- vld1.32 {q0,q1}, [r1]!
-- vld1.32 {q2,q3}, [r1]!
-- vld1.32 {q4,q5}, [r1]!
-- vld1.32 {q6,q7}, [r1]!
-- pld [r1, #0]
-- pld [r1, #128]
-- vst1.32 {q0,q1}, [r0]!
-- vst1.32 {q2,q3}, [r0]!
-- vst1.32 {q4,q5}, [r0]!
-- vst1.32 {q6,q7}, [r0]!
-- subs r12, r12, #1
-- pld [r0, #0]
-- pld [r0, #128]
-- bne neon_f2b_copy_128_a_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4-q7}
--#else
-- vpop {q4-q7}
--#endif
-- ands r2, r2, #0x7f
-- beq neon_f2b_finish
-- cmp r2, #32
-- bge neon_f2b_copy_32_a
-- b neon_f2b_copy_finish_a
--neon_f2b_copy_32_a:
-- mov r12, r2, lsr #5
--neon_f2b_copy_32_a_loop:
-- vld1.32 {q0,q1}, [r1]!
-- subs r12, r12, #1
-- pld [r1, #0]
-- vst1.32 {q0,q1}, [r0]!
-- bne neon_f2b_copy_32_a_loop
-- ands r2, r2, #0x1f
-- beq neon_f2b_finish
--neon_f2b_copy_finish_a:
--neon_f2b_copy_16_a:
-- movs r12, r2, lsr #4
-- beq neon_f2b_copy_8_a
--neon_f2b_copy_16_a_loop:
-- vld1.32 {q0}, [r1]!
-- subs r12, r12, #1
-- vst1.32 {q0}, [r0]!
-- bne neon_f2b_copy_16_a_loop
-- ands r2, r2, #0xf
-- beq neon_f2b_finish
--neon_f2b_copy_8_a:
-- cmp r2, #8
-- blt neon_f2b_copy_4_a
-- ldm r1!, {r4-r5}
-- subs r2, r2, #8
-- stm r0!, {r4-r5}
--neon_f2b_copy_4_a:
-- cmp r2, #4
-- blt neon_f2b_copy_1_a
-- ldr r4, [r1], #4
-- subs r2, r2, #4
-- str r4, [r0], #4
--neon_f2b_copy_1_a:
-- cmp r2, #0
-- beq neon_f2b_finish
--neon_f2b_copy_1_a_loop:
-- ldrb r12, [r1], #1
-- subs r2, r2, #1
-- strb r12, [r0], #1
-- bne neon_f2b_copy_1_a_loop
-- b neon_f2b_finish
--
-- /* #############################################################
-- * Front to Back copy - unaligned
-- */
--neon_f2b_nonaligned:
-- /*
-- * For sizes < 8, does it really make sense to do the whole shift
-- * party? Note that we DON'T want to call neon_f2b_copy_1_u,
-- * since we'll end up trying to pop r8-r11, and we DON'T want
-- * to do that...
-- */
-- cmp r2, #8
-- ble neon_f2b_copy_1_a
--
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r7-r9}
--#else
-- push {r7-r9}
--#endif
-- cmp r12, #2
-- ldrb r4, [r1], #1
-- ldrleb r5, [r1], #1
-- ldrltb r6, [r1], #1
-- rsb r8, r12, #4
-- sub r2, r2, r8
-- lsl r8, r8, #3
-- orrle r4, r4, r5, lsl #8
-- orrlt r4, r4, r6, lsl #16
-- rsb r9, r8, #32
-- /*
-- * r4 = overflow bits
-- * r8 = # of bits we copied into the r4 register to align source.
-- * r9 = 32 - r8
-- * r12 = Index counter for each size, so we determine how many times
-- * the given size will go into r2, then count down that # of
-- * times in r12.
-- */
-- cmp r2, #64
-- blt neon_f2b_unaligned_route
-- ands r12, r0, #0xf
-- beq neon_f2b_unaligned_route
-- cmp r3, #4
-- blt neon_f2b_unaligned_route
-- rsb r12, r12, #16
--neon_f2b_16_start_u:
-- sub r2, r2, r12
-- lsrs r6, r12, #2
--neon_f2b_align_16_4_u:
-- ldr r5, [r1], #4
-- subs r6, r6, #1
-- orr r4, r4, r5, lsl r8
-- str r4, [r0], #4
-- mov r4, r5, lsr r9
-- bne neon_f2b_align_16_4_u
--neon_f2b_unaligned_route:
-- cmp r2, r3
-- movle r12, r2
-- movgt r12, r3
-- cmp r12, #256
-- bge neon_f2b_copy_64_u
-- cmp r12, #64
-- bge neon_f2b_copy_32_u
-- cmp r12, #16
-- bge neon_f2b_copy_16_u
-- cmp r12, #8
-- bge neon_f2b_copy_8_u
-- cmp r12, #4
-- bge neon_f2b_copy_4_u
-- b neon_f2b_last_bits_u
--neon_f2b_copy_64_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
-- vstmdb sp!, {q5-q8}
--#else
-- vpush {q4}
-- vpush {q5-q8}
--#endif
-- vdup.u32 q8, r8
-- mov r12, r2, lsr #6
-- and r2, r2, #0x3f
--neon_f2b_copy_64_u_loop:
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_f2b_copy_64_u_b8
-- bpl neon_f2b_copy_64_u_b16
-- vshr.u64 q0, q4, #40
-- vshr.u64 q1, q5, #40
-- vshr.u64 q2, q6, #40
-- vshr.u64 q3, q7, #40
-- b neon_f2b_copy_64_unify
--neon_f2b_copy_64_u_b8:
-- vshr.u64 q0, q4, #56
-- vshr.u64 q1, q5, #56
-- vshr.u64 q2, q6, #56
-- vshr.u64 q3, q7, #56
-- b neon_f2b_copy_64_unify
--neon_f2b_copy_64_u_b16:
-- vshr.u64 q0, q4, #48
-- vshr.u64 q1, q5, #48
-- vshr.u64 q2, q6, #48
-- vshr.u64 q3, q7, #48
--neon_f2b_copy_64_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q4, q8, q4
-- vshl.u64 q5, q8, q5
-- vshl.u64 q6, q8, q6
-- vshl.u64 q7, q8, q7
--#else
-- vshl.u64 q4, q4, q8
-- vshl.u64 q5, q5, q8
-- vshl.u64 q6, q6, q8
-- vshl.u64 q7, q7, q8
--#endif
-- vmov r5, s14
-- vorr d9, d9, d0
-- vmov s14, r4
-- vorr d10, d10, d1
-- vorr d11, d11, d2
-- vorr d12, d12, d3
-- vorr d13, d13, d4
-- vorr d14, d14, d5
-- vorr d15, d15, d6
-- vorr d8, d8, d7
-- subs r12, r12, #1
-- pld [r1, #0]
-- pld [r1, #128]
-- mov r4, r5
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- pld [r0, #0]
-- pld [r0, #128]
-- bne neon_f2b_copy_64_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q5-q8}
-- vldmia sp!, {q4}
--#else
-- vpop {q5-q8}
-- vpop {q4}
--#endif
-- cmp r2, #32
-- bge neon_f2b_copy_32_u
-- b neon_f2b_copy_finish_u
--neon_f2b_copy_32_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
--#else
-- vpush {q4}
--#endif
-- vdup.u32 q4, r8
-- mov r12, r2, lsr #5
-- and r2, r2, #0x1f
--neon_f2b_copy_32_u_loop:
-- vld1.32 {q0, q1}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_f2b_copy_32_u_b8
-- bpl neon_f2b_copy_32_u_b16
-- vshr.u64 q2, q0, #40
-- vshr.u64 q3, q1, #40
-- b neon_f2b_copy_32_unify
--neon_f2b_copy_32_u_b8:
-- vshr.u64 q2, q0, #56
-- vshr.u64 q3, q1, #56
-- b neon_f2b_copy_32_unify
--neon_f2b_copy_32_u_b16:
-- vshr.u64 q2, q0, #48
-- vshr.u64 q3, q1, #48
--neon_f2b_copy_32_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q0, q4, q0
-- vshl.u64 q1, q4, q1
--#else
-- vshl.u64 q0, q0, q4
-- vshl.u64 q1, q1, q4
--#endif
-- vmov r5, s14
-- vorr d1, d1, d4
-- vmov s14, r4
-- vorr d2, d2, d5
-- vorr d3, d3, d6
-- vorr d0, d0, d7
-- subs r12, r12, #1
-- pld [r1, #0]
-- mov r4, r5
-- vst1.32 {q0, q1}, [r0]!
-- bne neon_f2b_copy_32_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4}
--#else
-- vpop {q4}
--#endif
--neon_f2b_copy_finish_u:
--neon_f2b_copy_16_u:
-- movs r12, r2, lsr #4
-- beq neon_f2b_copy_8_u
-- vdup.u32 q2, r8
-- and r2, r2, #0xf
--neon_f2b_copy_16_u_loop:
-- vld1.32 {q0}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_f2b_copy_16_u_b8
-- bpl neon_f2b_copy_16_u_b16
-- vshr.u64 q1, q0, #40
-- b neon_f2b_copy_16_unify
--neon_f2b_copy_16_u_b8:
-- vshr.u64 q1, q0, #56
-- b neon_f2b_copy_16_unify
--neon_f2b_copy_16_u_b16:
-- vshr.u64 q1, q0, #48
--neon_f2b_copy_16_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q0, q2, q0
--#else
-- vshl.u64 q0, q0, q2
--#endif
-- vmov r5, s6
-- vorr d1, d1, d2
-- vmov s6, r4
-- vorr d0, d0, d3
-- subs r12, r12, #1
-- mov r4, r5
-- vst1.32 {q0}, [r0]!
-- bne neon_f2b_copy_16_u_loop
--neon_f2b_copy_8_u:
-- cmp r2, #8
-- blt neon_f2b_copy_4_u
-- ldm r1!, {r6-r7}
-- subs r2, r2, #8
-- orr r4, r4, r6, lsl r8
-- mov r5, r6, lsr r9
-- orr r5, r5, r7, lsl r8
-- stm r0!, {r4-r5}
-- mov r4, r7, lsr r9
--neon_f2b_copy_4_u:
-- cmp r2, #4
-- blt neon_f2b_last_bits_u
-- ldr r5, [r1], #4
-- subs r2, r2, #4
-- orr r4, r4, r5, lsl r8
-- str r4, [r0], #4
-- mov r4, r5, lsr r9
--neon_f2b_last_bits_u:
-- lsr r8, r8, #0x3
--neon_f2b_last_bits_u_loop:
-- strb r4, [r0], #1
-- subs r8, r8, #1
-- lsr r4, r4, #8
-- bne neon_f2b_last_bits_u_loop
--neon_f2b_copy_1_u:
-- cmp r2, #0
-- beq neon_f2b_finish_u
--neon_f2b_copy_1_u_loop:
-- ldrb r12, [r1], #1
-- subs r2, r2, #1
-- strb r12, [r0], #1
-- bne neon_f2b_copy_1_u_loop
--neon_f2b_finish_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r7-r9}
--#else
-- pop {r7-r9}
--#endif
-- /* #############################################################
-- * Front to Back copy - finish
-- */
--neon_f2b_finish:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r4-r6}
--#else
-- pop {r4-r6}
--#endif
-- b neon_memmove_done
--
-- /* #############################################################
-- * Back to Front copy
-- */
--neon_back_to_front_copy:
-- /*
-- * Here, we'll want to shift to the end of the buffers. This
-- * actually points us one past where we need to go, but since
-- * we'll pre-decrement throughout, this will be fine.
-- */
-- add r0, r0, r2
-- add r1, r1, r2
-- cmp r2, #4
-- bgt neon_b2f_gt4
-- cmp r2, #0
--neon_b2f_smallcopy_loop:
-- beq neon_memmove_done
-- ldrb r12, [r1, #-1]!
-- subs r2, r2, #1
-- strb r12, [r0, #-1]!
-- b neon_b2f_smallcopy_loop
--neon_b2f_gt4:
-- pld [r0, #0]
-- pld [r1, #0]
-- /*
-- * The minimum of the overlap window size and the copy size
-- * is in r3.
-- */
-- sub r3, r0, r1
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r4-r5}
--#else
-- push {r4-r5}
--#endif
--
-- /*
-- * Check alignment. Since we'll pre-decrement as we step thru, we'll
-- * need to make sure we're on word-alignment.
-- */
--neon_b2f_check_align:
-- ands r12, r0, #0x3
-- beq neon_b2f_source_align_check
-- sub r2, r2, r12
--neon_b2f_shift_align:
-- ldrb r4, [r1, #-1]!
-- subs r12, r12, #1
-- strb r4, [r0, #-1]!
-- bne neon_b2f_shift_align
--neon_b2f_source_align_check:
-- ands r4, r1, #0x3
-- bne neon_b2f_nonaligned
--
--neon_b2f_try_16_align:
-- /* If we're >64, attempt to align on 16-bytes. Smaller amounts
-- * don't seem to be worth handling. */
-- cmp r2, #64
-- blt neon_b2f_align_route
-- ands r12, r0, #0xf
-- beq neon_b2f_align_route
-- /* In this case, r12 has the number of bytes to roll backward. */
--neon_b2f_16_start:
-- sub r2, r2, r12
-- lsrs r5, r12, #2
--neon_b2f_align_16_4:
-- ldr r4, [r1, #-4]!
-- subs r5, r5, #1
-- str r4, [r0, #-4]!
-- bne neon_b2f_align_16_4
--neon_b2f_align_route:
-- /*
-- * #############################################################
-- * Back to Front copy - aligned
-- */
-- cmp r2, r3
-- movle r12, r2
-- movgt r12, r3
-- cmp r12, #256
-- bge neon_b2f_copy_128_a
-- cmp r12, #64
-- bge neon_b2f_copy_32_a
-- cmp r12, #8
-- bge neon_b2f_copy_8_a
-- cmp r12, #4
-- bge neon_b2f_copy_4_a
-- b neon_b2f_copy_1_a
--neon_b2f_copy_128_a:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4-q7}
--#else
-- vpush {q4-q7}
--#endif
-- movs r12, r2, lsr #7
-- /*
-- * This irks me. There MUST be a better way to read these in and
-- * scan the register backward instead of making it go forward. Then
-- * we need to do two subtractions...
-- */
--neon_b2f_copy_128_a_loop:
-- sub r1, r1, #128
-- sub r0, r0, #128
-- vld1.32 {q0, q1}, [r1]!
-- vld1.32 {q2, q3}, [r1]!
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- pld [r1, #-128]
-- pld [r1, #-256]
-- vst1.32 {q0, q1}, [r0]!
-- vst1.32 {q2, q3}, [r0]!
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- subs r12, r12, #1
-- pld [r0, #-128]
-- pld [r0, #-256]
-- sub r1, r1, #128
-- sub r0, r0, #128
-- bne neon_b2f_copy_128_a_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4-q7}
--#else
-- vpop {q4-q7}
--#endif
-- ands r2, r2, #0x7f
-- beq neon_b2f_finish
-- cmp r2, #32
-- bge neon_b2f_copy_32_a
-- b neon_b2f_copy_finish_a
--neon_b2f_copy_32_a:
-- mov r12, r2, lsr #5
--neon_b2f_copy_32_a_loop:
-- sub r1, r1, #32
-- sub r0, r0, #32
-- vld1.32 {q0,q1}, [r1]
-- subs r12, r12, #1
-- vst1.32 {q0,q1}, [r0]
-- pld [r1, #0]
-- bne neon_b2f_copy_32_a_loop
-- ands r2, r2, #0x1f
-- beq neon_b2f_finish
--neon_b2f_copy_finish_a:
--neon_b2f_copy_8_a:
-- movs r12, r2, lsr #0x3
-- beq neon_b2f_copy_4_a
--neon_b2f_copy_8_a_loop:
-- ldmdb r1!, {r4-r5}
-- subs r12, r12, #1
-- stmdb r0!, {r4-r5}
-- bne neon_b2f_copy_8_a_loop
-- and r2, r2, #0x7
--neon_b2f_copy_4_a:
-- movs r12, r2, lsr #0x2
-- beq neon_b2f_copy_1_a
-- and r2, r2, #0x3
--neon_b2f_copy_4_a_loop:
-- ldr r4, [r1, #-4]!
-- subs r12, r12, #1
-- str r4, [r0, #-4]!
-- bne neon_b2f_copy_4_a_loop
--neon_b2f_copy_1_a:
-- cmp r2, #0
-- beq neon_b2f_finish
--neon_b2f_copy_1_a_loop:
-- ldrb r12, [r1, #-1]!
-- subs r2, r2, #1
-- strb r12, [r0, #-1]!
-- bne neon_b2f_copy_1_a_loop
--
-- /* #############################################################
-- * Back to Front copy - unaligned
-- */
--neon_b2f_nonaligned:
-- /*
-- * For sizes < 8, does it really make sense to do the whole shift
-- * party?
-- */
-- cmp r2, #8
-- ble neon_b2f_copy_1_a
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r6-r11}
--#else
-- push {r6-r11}
--#endif
-- /*
-- * r3 = max window size
-- * r4 = overflow bytes
-- * r5 = bytes we're reading into
-- * r6 = # bytes we're off.
-- * r10 = copy of r6
-- */
-- and r6, r1, #0x3
-- eor r4, r4, r4
-- mov r10, r6
--neon_b2f_realign:
-- ldrb r5, [r1, #-1]!
-- subs r6, r6, #1
-- orr r4, r5, r4, lsl #8
-- bne neon_b2f_realign
-- /*
-- * r10 = # of bits we copied into the r4 register to align source.
-- * r11 = 32 - r10
-- * r12 = Index counter for each size, so we determine how many times
-- * the given size will go into r2, then count down that # of
-- * times in r12.
-- */
-- sub r2, r2, r10
-- lsl r10, r10, #0x3
-- rsb r11, r10, #32
--
-- cmp r2, r3
-- movle r12, r2
-- movgt r12, r3
-- cmp r12, #256
-- bge neon_b2f_copy_64_u
-- cmp r12, #64
-- bge neon_b2f_copy_32_u
-- cmp r12, #8
-- bge neon_b2f_copy_8_u
-- cmp r12, #4
-- bge neon_b2f_copy_4_u
-- b neon_b2f_last_bits_u
--neon_b2f_copy_64_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4,q5}
-- vstmdb sp!, {q6-q8}
--#else
-- vpush {q4,q5}
-- vpush {q6-q8}
--#endif
-- add r7, r11, #32
-- movs r12, r2, lsr #6
-- vdup.u32 q8, r7
--neon_b2f_copy_64_u_loop:
-- sub r1, r1, #64
-- sub r0, r0, #64
-- vld1.32 {q0, q1}, [r1]!
-- vld1.32 {q2, q3}, [r1]
-- sub r1, r1, #32
-- vmov q4, q0
-- vmov q5, q1
-- vmov q6, q2
-- vmov q7, q3
-- vmov r5, s0
-- mov r4, r4, lsl r11
-- lsls r6, r10, #28
-- bcc neon_b2f_copy_64_u_b8
-- bpl neon_b2f_copy_64_u_b16
-- vshr.u64 q0, q0, #24
-- vshr.u64 q1, q1, #24
-- vshr.u64 q2, q2, #24
-- vshr.u64 q3, q3, #24
-- b neon_b2f_copy_64_unify
--neon_b2f_copy_64_u_b8:
-- vshr.u64 q0, q0, #8
-- vshr.u64 q1, q1, #8
-- vshr.u64 q2, q2, #8
-- vshr.u64 q3, q3, #8
-- b neon_b2f_copy_64_unify
--neon_b2f_copy_64_u_b16:
-- vshr.u64 q0, q0, #16
-- vshr.u64 q1, q1, #16
-- vshr.u64 q2, q2, #16
-- vshr.u64 q3, q3, #16
--neon_b2f_copy_64_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q4, q8, q4
-- vshl.u64 q5, q8, q5
-- vshl.u64 q6, q8, q6
-- vshl.u64 q7, q8, q7
--#else
-- vshl.u64 q4, q4, q8
-- vshl.u64 q5, q5, q8
-- vshl.u64 q6, q6, q8
-- vshl.u64 q7, q7, q8
--#endif
-- vmov s17, r4
-- vorr d7, d7, d8
-- vorr d6, d6, d15
-- vorr d5, d5, d14
-- vorr d4, d4, d13
-- vorr d3, d3, d12
-- vorr d2, d2, d11
-- vorr d1, d1, d10
-- vorr d0, d0, d9
-- mov r4, r5, lsl r11
-- subs r12, r12, #1
-- lsr r4, r4, r11
-- vst1.32 {q0, q1}, [r0]!
-- vst1.32 {q2, q3}, [r0]
-- pld [r1, #0]
-- sub r0, r0, #32
-- bne neon_b2f_copy_64_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q6-q8}
-- vldmia sp!, {q4,q5}
--#else
-- vpop {q6-q8}
-- vpop {q4,q5}
--#endif
-- ands r2, r2, #0x3f
-- cmp r2, #32
-- bge neon_b2f_copy_32_u
-- b neon_b2f_copy_finish_u
--neon_b2f_copy_32_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
--#else
-- vpush {q4}
--#endif
-- add r7, r11, #32
-- movs r12, r2, lsr #5
-- vdup.u32 q4, r7
-- and r2, r2, #0x1f
--neon_b2f_copy_32_u_loop:
-- sub r1, r1, #32
-- sub r0, r0, #32
-- vld1.32 {q0, q1}, [r1]
-- vmov q2, q0
-- vmov q3, q1
-- vmov r5, s0
-- mov r4, r4, lsl r11
-- lsls r6, r10, #28
-- bcc neon_b2f_copy_32_u_b8
-- bpl neon_b2f_copy_32_u_b16
-- vshr.u64 q0, q0, #24
-- vshr.u64 q1, q1, #24
-- b neon_b2f_copy_32_unify
--neon_b2f_copy_32_u_b8:
-- vshr.u64 q0, q0, #8
-- vshr.u64 q1, q1, #8
-- b neon_b2f_copy_32_unify
--neon_b2f_copy_32_u_b16:
-- vshr.u64 q0, q0, #16
-- vshr.u64 q1, q1, #16
--neon_b2f_copy_32_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q2, q4, q2
-- vshl.u64 q3, q4, q3
--#else
-- vshl.u64 q2, q2, q4
-- vshl.u64 q3, q3, q4
--#endif
-- vmov s9, r4
-- vorr d3, d3, d4
-- vorr d2, d2, d7
-- vorr d1, d1, d6
-- vorr d0, d0, d5
-- mov r4, r5, lsl r11
-- subs r12, r12, #1
-- lsr r4, r4, r11
-- vst1.32 {q0, q1}, [r0]
-- pld [r1, #0]
-- bne neon_b2f_copy_32_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4}
--#else
-- vpop {q4}
--#endif
--neon_b2f_copy_finish_u:
--neon_b2f_copy_8_u:
-- movs r12, r2, lsr #0x3
-- beq neon_b2f_copy_4_u
-- mov r5, r4, lsl r11
--neon_b2f_copy_8_u_loop:
-- ldmdb r1!, {r6-r7}
-- subs r12, r12, #1
-- orr r5, r5, r7, lsr r10
-- mov r4, r7, lsl r11
-- orr r4, r4, r6, lsr r10
-- stmdb r0!, {r4-r5}
-- mov r4, r6, lsl r11
-- lsr r4, r4, r11
-- mov r5, r4, lsl r11
-- bne neon_b2f_copy_8_u_loop
-- ands r2, r2, #0x7
--neon_b2f_copy_4_u:
-- movs r12, r2, lsr #0x2
-- beq neon_b2f_last_bits_u
-- mov r5, r4, lsl r11
--neon_b2f_copy_4_u_loop:
-- ldr r6, [r1, #-4]!
-- subs r12, r12, #1
-- orr r5, r5, r6, lsr r10
-- str r5, [r0, #-4]!
-- mov r4, r6, lsl r11
-- lsr r4, r4, r11
-- mov r5, r4, lsl r11
-- bne neon_b2f_copy_4_u_loop
-- and r2, r2, #0x3
--neon_b2f_last_bits_u:
--neon_b2f_last_bits_u_loop:
-- subs r10, r10, #8
-- mov r5, r4, lsr r10
-- strb r5, [r0, #-1]!
-- bne neon_b2f_last_bits_u_loop
--neon_b2f_copy_1_u:
-- cmp r2, #0
-- beq neon_b2f_finish_u
--neon_b2f_copy_1_u_loop:
-- ldrb r12, [r1, #-1]!
-- subs r2, r2, #1
-- strb r12, [r0, #-1]!
-- bne neon_b2f_copy_1_u_loop
--neon_b2f_finish_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r6-r11}
--#else
-- pop {r6-r11}
--#endif
--
--neon_b2f_finish:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r4-r5}
--#else
-- pop {r4-r5}
--#endif
--
--neon_memmove_done:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r0}
--#else
-- pop {r0}
--#endif
-- bx lr
--
-- .endfunc
-- .end
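
Note: the neon_memmove.S hunks end here. The label groups visible above (neon_f2b_* for the front-to-back path, neon_b2f_* and neon_back_to_front_copy for the pre-decrementing back-to-front path, with 128-byte NEON block loops) implement the usual overlap-aware copy that memmove requires. A minimal C sketch of that direction split follows, for readers without the deleted file; the function name is illustrative and this byte-wise fallback is not the routine the driver shipped.

/* Direction-aware copy: the overlap test implied by the deleted
 * assembly's front-to-back and back-to-front paths. Byte-wise
 * sketch only; the NEON version moves 128-byte blocks per loop. */
#include <stddef.h>
#include <stdint.h>

static void *fallback_memmove(void *dest, const void *src, size_t n)
{
    uint8_t *d = dest;
    const uint8_t *s = src;

    if (d == s || n == 0)
        return dest;
    if (d < s) {
        /* Destination starts below the source: copy forward so each
         * byte is read before it can be overwritten. */
        while (n--)
            *d++ = *s++;
    } else {
        /* Destination starts above the source: copy backward,
         * pre-decrementing as the assembly does. */
        d += n;
        s += n;
        while (n--)
            *--d = *--s;
    }
    return dest;
}
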
-diff --git git/src/neon_memsets.c git/src/neon_memsets.c
-deleted file mode 100755
-index 740fc1e..0000000
---- git/src/neon_memsets.c
-+++ /dev/null
-@@ -1,169 +0,0 @@
--/* neon_memsets.c
-- *
-- * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
-- *
-- * Redistribution and use in source and binary forms, with or without
-- * modification, are permitted provided that the following conditions are met:
-- * * Redistributions of source code must retain the above copyright
-- * notice, this list of conditions and the following disclaimer.
-- * * Redistributions in binary form must reproduce the above copyright
-- * notice, this list of conditions and the following disclaimer in the
-- * documentation and/or other materials provided with the distribution.
-- * * Neither the name of Code Aurora nor
-- * the names of its contributors may be used to endorse or promote
-- * products derived from this software without specific prior written
-- * permission.
-- *
-- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-- * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-- * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-- */
--
--#include "msm-swblits.h"
--
--void memset16(uint16_t dst[], uint16_t value, int count)
--{
-- if (count <= 0)
-- return;
--
-- asm volatile(
-- " pld [%[dst], #0] \n"
-- " cmp %[count], #4 \n"
-- " blt 6f \n"
-- " tst %[dst], #0x3 \n"
-- " strneh %[value], [%[dst]], #2 \n"
-- " subne %[count], %[count], #1 \n"
-- " vdup.u16 q8, %[value] \n"
-- " vmov q9, q8 \n"
-- " cmp %[count], #64 \n"
-- " bge 0f \n"
-- " cmp %[count], #32 \n"
-- " bge 2f \n"
-- " cmp %[count], #16 \n"
-- " bge 3f \n"
-- " cmp %[count], #8 \n"
-- " bge 4f \n"
-- " b 5f \n"
-- "0: \n"
-- " mov r12, %[count], lsr #6 \n"
-- "1: \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " subs r12, r12, #1 \n"
-- " bne 1b \n"
-- " ands %[count], %[count], #0x3f \n"
-- " beq 7f \n"
-- "2: \n"
-- " cmp %[count], #32 \n"
-- " blt 3f \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " subs %[count], %[count], #32 \n"
-- " beq 7f \n"
-- "3: \n"
-- " cmp %[count], #16 \n"
-- " blt 4f \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " subs %[count], %[count], #16 \n"
-- " beq 7f \n"
-- "4: \n"
-- " cmp %[count], #8 \n"
-- " blt 5f \n"
-- " vst1.16 {q8}, [%[dst]]! \n"
-- " subs %[count], %[count], #8 \n"
-- " beq 7f \n"
-- "5: \n"
-- " cmp %[count], #4 \n"
-- " blt 6f \n"
-- " vst1.16 {d16}, [%[dst]]! \n"
-- " subs %[count], %[count], #4 \n"
-- " beq 7f \n"
-- "6: \n"
-- " cmp %[count], #0 \n"
-- " blt 7f \n"
-- " lsls %[count], #31 \n"
-- " strmih %[value], [%[dst]], #2 \n"
-- " strcsh %[value], [%[dst]], #2 \n"
-- " strcsh %[value], [%[dst]], #2 \n"
-- "7: \n"
-- // Clobbered input registers
-- : [dst] "+r" (dst), [count] "+r" (count)
-- // Unclobbered input
-- : [value] "r" (value)
-- // Clobbered registers
-- : "q8", "q9", "r12", "cc", "memory"
-- );
--}
--
--void memset32(uint32_t dst[], uint32_t value, int count)
--{
-- asm volatile(
-- " pld [%[dst], #0] \n"
-- " cmp %[count], #4 \n"
-- " blt 5f \n"
-- " vdup.u32 q8, %[value] \n"
-- " vmov q9, q8 \n"
-- " cmp %[count], #32 \n"
-- " bge 0f \n"
-- " cmp %[count], #16 \n"
-- " bge 2f \n"
-- " cmp %[count], #8 \n"
-- " bge 3f \n"
-- " b 4f \n"
-- "0: \n"
-- " mov r12, %[count], lsr #5 \n"
-- "1: \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " pld [%[dst], #0] \n"
-- " subs r12, r12, #1 \n"
-- " bne 1b \n"
-- " ands %[count], %[count], #0x1f \n"
-- " beq 6f \n"
-- "2: \n"
-- " cmp %[count], #16 \n"
-- " blt 3f \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " subs %[count], %[count], #16 \n"
-- " beq 6f \n"
-- "3: \n"
-- " cmp %[count], #8 \n"
-- " blt 4f \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " subs %[count], %[count], #8 \n"
-- " beq 6f \n"
-- "4: \n"
-- " cmp %[count], #4 \n"
-- " blt 5f \n"
-- " vst1.32 {q8}, [%[dst]]! \n"
-- " subs %[count], %[count], #4 \n"
-- " beq 6f \n"
-- "5: \n"
-- " cmp %[count], #0 \n"
-- " beq 6f \n"
-- " lsls %[count], #31 \n"
-- " strmi %[value], [%[dst]], #4 \n"
-- " strcs %[value], [%[dst]], #4 \n"
-- " strcs %[value], [%[dst]], #4 \n"
-- "6: @end \n"
-- // Clobbered input registers
-- : [dst] "+r" (dst), [count] "+r" (count)
-- // Unclobbered input
-- : [value] "r" (value)
-- // Clobbered registers
-- : "q8", "q9", "r12", "cc", "memory"
-- );
--}
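
End of the no_neon.patch removal. The memset16()/memset32() bodies deleted just above are NEON inline assembly that fill 16-bit and 32-bit pixel runs with wide vector stores. Functionally they reduce to the loops below; this is a plain-C sketch for reference, not what the driver built, and it omits the alignment and small-count special cases the assembly handles.

/* Plain-C equivalents of the deleted NEON memset16()/memset32().
 * Illustrative only: no vectorization, no alignment handling. */
#include <stdint.h>

void memset16(uint16_t *dst, uint16_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;
}

void memset32(uint32_t *dst, uint32_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;
}
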
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch
deleted file mode 100644
index 97ad380e27..0000000000
--- a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch
+++ /dev/null
@@ -1,36 +0,0 @@
-commit 18515a56822fcd9c0a71240edce97ea5623b0448
-Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
-Date: Wed Feb 10 16:29:55 2010 +0100
-
- Modify Makefile.am
- Removed dependencies for neon
-
-diff --git git/src/Makefile.am git/src/Makefile.am
-index 8ab1856..08da5a5 100755
---- a/src/Makefile.am
-+++ b/src/Makefile.am
-@@ -12,13 +12,7 @@ MSM_DRI_SRCS += msm-drm.c msm-dri2.c
- msm_drv_la_LIBADD += $(DRI2_LIBS)
- endif
-
--NEON_CFLAGS=-march=armv7-a -mfpu=neon -mfloat-abi=softfp
--NEON_CCASFLAGS=$(NEON_CFLAGS) -mthumb-interwork
--NEON_ASFLAGS=-k -mcpu=cortex-a8 $(NEON_CCASFLAGS)
--
--AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ $(NEON_CFLAGS) -Wall -Werror
--AM_ASFLAGS = $(NEON_ASFLAGS)
--AM_CCASFLAGS = $(NEON_CCASFLAGS)
-+AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ -Wall -Werror
-
- msm_drv_la_LTLIBRARIES = msm_drv.la
- msm_drv_la_LDFLAGS = -module -avoid-version
-@@ -37,9 +31,6 @@ msm_drv_la_SOURCES = \
- msm-swfill.c \
- msm-hwrender.c \
- msm-pixmap.c \
-- neon_memsets.c \
-- neon_memcpy.S \
-- neon_memmove.S \
- $(MSM_DRI_SRCS)
-
-
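
no_neon_flags.patch above only strips the hard-coded -march=armv7-a/-mfpu=neon build flags from src/Makefile.am. Code compiled with those flags unconditionally will still fault at runtime on hardware or kernels without NEON support, which is why dropping them outright is the simplest fix; an alternative approach (not what this recipe does) is to keep the NEON objects and gate them behind a runtime probe. A minimal sketch of such a probe, assuming an ARM /proc/cpuinfo with a Features line; the helper name is made up for illustration.

/* Hypothetical runtime check: returns 1 if /proc/cpuinfo advertises
 * "neon" in its Features line, 0 otherwise. Not part of xf86-video-msm. */
#include <stdio.h>
#include <string.h>

static int cpu_has_neon(void)
{
    char line[512];
    FILE *f = fopen("/proc/cpuinfo", "r");
    int found = 0;

    if (!f)
        return 0;
    while (fgets(line, sizeof(line), f)) {
        /* ARM cpuinfo lists CPU extensions on a space-separated
         * "Features" line; look for a standalone "neon" entry. */
        if (strncmp(line, "Features", 8) == 0 && strstr(line, " neon")) {
            found = 1;
            break;
        }
    }
    fclose(f);
    return found;
}
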
diff --git a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch
deleted file mode 100644
index 90dd31f605..0000000000
--- a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch
+++ /dev/null
@@ -1,116 +0,0 @@
-commit cc83ba5835d5b55347fd0c0775156493b0cf3a15
-Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
-Date: Thu Feb 11 16:26:52 2010 +0100
-
- Renaming variables for getting Xorg (xf86-video-msm) to work
- under linux-leviathan (htcdream):
- cd src
- sed 's/fixed_info/fix/' -i *.h
- sed 's/fixed_info/fix/' -i *.c
-
-diff --git git/src/msm-dri.c git/src/msm-dri.c
-index a51d3bd..a74368b 100644
---- git/src/msm-dri.c
-+++ git/src/msm-dri.c
-@@ -151,10 +151,10 @@ MSMDRIScreenInit(ScreenPtr pScreen)
- pDRIInfo->ddxDriverMinorVersion = 0;
- pDRIInfo->ddxDriverPatchVersion = 0;
-
-- pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fixed_info.smem_start;
-+ pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fix.smem_start;
-
-- pDRIInfo->frameBufferSize = pMsm->fixed_info.smem_len;
-- pDRIInfo->frameBufferStride = pMsm->fixed_info.line_length;
-+ pDRIInfo->frameBufferSize = pMsm->fix.smem_len;
-+ pDRIInfo->frameBufferStride = pMsm->fix.line_length;
-
- /* FIXME: How many drawables can we do (should we do)? */
-
-diff --git git/src/msm-driver.c git/src/msm-driver.c
-index 803197f..15378f8 100755
---- git/src/msm-driver.c
-+++ git/src/msm-driver.c
-@@ -399,7 +399,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
-
- /* Get the fixed info (par) structure */
-
-- if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fixed_info)) {
-+ if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fix)) {
- xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
- "Unable to read hardware info from %s: %s\n",
- dev, strerror(errno));
-@@ -410,7 +410,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
- /* Parse the ID and figure out what version of the MDP and what
- * panel ID we have */
-
-- if (sscanf(pMsm->fixed_info.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
-+ if (sscanf(pMsm->fix.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
-
- xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
- "Unable to determine the MDP and panel type\n");
-@@ -435,7 +435,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
- * the fbdev driver to allocate memory. In the mean time, we
- * just reuse the framebuffer memory */
-
-- pScrn->videoRam = pMsm->fixed_info.smem_len;
-+ pScrn->videoRam = pMsm->fix.smem_len;
-
- /* Get the current screen setting */
- if (ioctl(pMsm->fd, FBIOGET_VSCREENINFO, &pMsm->mode_info)) {
-@@ -671,8 +671,8 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
- /* The framebuffer driver should always report the line length,
- * but in case it doesn't, we can calculate it ourselves */
-
-- if (pMsm->fixed_info.line_length) {
-- pScrn->displayWidth = pMsm->fixed_info.line_length;
-+ if (pMsm->fix.line_length) {
-+ pScrn->displayWidth = pMsm->fix.line_length;
- } else {
- pScrn->displayWidth = pMsm->mode_info.xres_virtual *
- pMsm->mode_info.bits_per_pixel / 8;
-@@ -811,7 +811,7 @@ MSMCloseScreen(int scrnIndex, ScreenPtr pScreen)
- #endif
-
- /* Unmap the framebuffer memory */
-- munmap(pMsm->fbmem, pMsm->fixed_info.smem_len);
-+ munmap(pMsm->fbmem, pMsm->fix.smem_len);
-
- pScreen->CloseScreen = pMsm->CloseScreen;
-
-@@ -857,7 +857,7 @@ MSMScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
- #endif // defined (MSMFB_GET_PAGE_PROTECTION) && defined (MSMFB_SET_PAGE_PROTECTION)
-
- /* Map the framebuffer memory */
-- pMsm->fbmem = mmap(NULL, pMsm->fixed_info.smem_len,
-+ pMsm->fbmem = mmap(NULL, pMsm->fix.smem_len,
- PROT_READ | PROT_WRITE, MAP_SHARED, pMsm->fd, 0);
-
- /* If we can't map the memory, then this is a short trip */
-diff --git git/src/msm-exa.c git/src/msm-exa.c
-index 301923f..ce16a93 100755
---- git/src/msm-exa.c
-+++ git/src/msm-exa.c
-@@ -740,8 +740,8 @@ MSMSetupExa(ScreenPtr pScreen)
- pExa->flags = EXA_OFFSCREEN_PIXMAPS;
-
- pExa->offScreenBase =
-- (pMsm->fixed_info.line_length * pMsm->mode_info.yres);
-- pExa->memorySize = pMsm->fixed_info.smem_len;
-+ (pMsm->fix.line_length * pMsm->mode_info.yres);
-+ pExa->memorySize = pMsm->fix.smem_len;
-
- /* Align pixmap offsets along page boundaries */
- pExa->pixmapOffsetAlign = 4096;
-diff --git git/src/msm.h git/src/msm.h
-index e1e2bc7..520d390 100755
---- git/src/msm.h
-+++ git/src/msm.h
-@@ -85,7 +85,7 @@ typedef struct _MSMRec
- int fd;
-
- /* Fixed and var strutures from the framebuffer */
-- struct fb_fix_screeninfo fixed_info;
-+ struct fb_fix_screeninfo fix;
- struct fb_var_screeninfo mode_info;
-
- /* Pointer to the mapped framebuffer memory */