author      Frans Meulenbroeks <fransmeulenbroeks@gmail.com>    2010-10-10 13:08:14 +0200
committer   Frans Meulenbroeks <fransmeulenbroeks@gmail.com>    2010-10-10 13:08:14 +0200
commit      067e6a9f69e91cf1d1fe46c972189791da1a8a7a (patch)
tree        5055c1acdf03f0eb1a26a2344ab0cb00f9cc475f /recipes/xorg-driver
parent      65abd0ee8631cad3fcd984a56bbcafb084cbbb54 (diff)
download    openembedded-067e6a9f69e91cf1d1fe46c972189791da1a8a7a.tar.gz
xorg-driver: moved unused files to obsolete dir
Signed-off-by: Frans Meulenbroeks <fransmeulenbroeks@gmail.com>
Diffstat (limited to 'recipes/xorg-driver')
-rw-r--r--  recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch      22
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/no_neon.patch                  2901
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch              36
-rw-r--r--  recipes/xorg-driver/xf86-video-msm/renaming_variables.patch        116
4 files changed, 0 insertions, 3075 deletions
diff --git a/recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch b/recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch
deleted file mode 100644
index e989717d3b..0000000000
--- a/recipes/xorg-driver/xf86-input-tslib/dontfloodevents006.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-Index: xf86-input-tslib-0.0.6/src/tslib.c
-===================================================================
---- xf86-input-tslib-0.0.6.orig/src/tslib.c 2009-11-29 20:03:29.734794324 +0000
-+++ xf86-input-tslib-0.0.6/src/tslib.c 2009-11-29 20:29:24.066794215 +0000
-@@ -205,7 +205,7 @@
- */
- switch (priv->state) {
- case BUTTON_EMULATION_OFF :
-- if(priv->lastp != samp.pressure) {
-+ if(!!priv->lastp != !!samp.pressure) {
- priv->lastp = samp.pressure;
- xf86PostButtonEvent(local->dev, TRUE,
- 1, !!samp.pressure, 0, 2,
-@@ -512,7 +512,7 @@
- s = xf86CheckStrOption(dev->commonOptions, "path", NULL);
- if (!s)
- s = xf86CheckStrOption(dev->commonOptions, "Device", NULL);
--
-+
- priv->ts = ts_open(s, 1);
- xfree(s);
-
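The deleted dontfloodevents006.patch above changed the tslib button-emulation test from comparing raw pressure values to comparing their boolean forms with !!, so a button event is posted only when the pressure crosses zero (touch down or lift off), not on every pressure fluctuation. A minimal standalone sketch of that guard follows; handle_sample and the printf harness are hypothetical, while the real driver keeps the previous value in priv->lastp and calls xf86PostButtonEvent().

    #include <stdio.h>

    /* Previous pressure sample; the driver stores this in priv->lastp. */
    static int last_pressure = 0;

    static void handle_sample(int pressure)
    {
        /* !! collapses pressure to 0 or 1, so the comparison differs only on a
         * 0 -> non-zero or non-zero -> 0 transition, which keeps small pressure
         * changes while the pen stays down from flooding the event queue. */
        if (!!last_pressure != !!pressure)
            printf("button %s\n", pressure ? "press" : "release");
        last_pressure = pressure;
    }

    int main(void)
    {
        int samples[] = { 0, 120, 130, 140, 0, 0, 90 };
        for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; i++)
            handle_sample(samples[i]);
        return 0;   /* prints press, release, press: three events for seven samples */
    }
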
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
deleted file mode 100644
index c0aa92e76a..0000000000
--- a/recipes/xorg-driver/xf86-video-msm/no_neon.patch
+++ /dev/null
@@ -1,2901 +0,0 @@
-commit d8910bf773fbecf7cdea359d4b530a3672e27180
-Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
-Date: Wed Feb 10 16:18:39 2010 +0100
-
- Removed neon because it's not available in our kernel
- and so it's causing trouble (Illegal instruction)
-
-diff --git git/src/msm-swblits.h git/src/msm-swblits.h
-index f89f00e..a40b24b 100755
---- git/src/msm-swblits.h
-+++ git/src/msm-swblits.h
-@@ -38,16 +38,6 @@
- #include <stdint.h>
- #include <stdlib.h>
-
--/* Neon intrinsics are part of the ARM or GCC compiler used. */
--/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */
--#include <arm_neon.h>
--
--/* These are NEON-optimized functions linked to by various tests. */
--extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes);
--extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes);
--extern void memset16(uint16_t *dst, uint16_t value, int count);
--extern void memset32(uint32_t *dst, uint32_t value, int count);
--
- /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */
- #define BITS_PER_BYTE (8)
- #define BYTES_PER_16BPP_PIXEL (2)
-diff --git git/src/msm-swfill.c git/src/msm-swfill.c
-index 108fd94..3dd1ef2 100755
---- git/src/msm-swfill.c
-+++ git/src/msm-swfill.c
-@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou
- }
- }
-
--
-+/*
- static inline void
- memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
- {
-@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
- // Quickly fill remaining pixels (up to 7).
- memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
- }
--
-+*/
-
- static inline void
- memset16_Test(uint16_t *dst, uint16_t src, int count)
-@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count)
-
- // Copy remaining pixels using Neon and non-Neon instructions.
- // NOTE: This assumes that dst is aligned optimally for Neon instructions.
-- memset16_AssumesNeonAlignment((void *) dst, src, count);
-+ //memset16_AssumesNeonAlignment((void *) dst, src, count);
-+ memset((void *) dst, src, count);
- }
- }
-
-@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp
- if (w < 32) {
- // For narrow rectangles, block signals only once for the entire rectangles.
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
-+ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
-+ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else {
- // For wider rectangles, block and unblock signals for every row.
-- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- }
-
-diff --git git/src/msm-swrender.c git/src/msm-swrender.c
-index a7a9abc..835dc03 100755
---- git/src/msm-swrender.c
-+++ git/src/msm-swrender.c
-@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src
- }
- }
- break;
-- case 7: if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
-- return TRUE;
-- } else {
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
-- return TRUE;
-- }
-- break;
-- case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
-- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
-- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
-- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
-- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
-- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
-- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
-- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
-- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
-- return TRUE;
-- }
-- else {
-- uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
-- uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
-- uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
-- uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T);
-- uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T);
-- uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T);
-- uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T);
-- uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T);
-- *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1;
-- *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2;
-- *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3;
-- *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4;
-- *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5;
-- *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6;
-- *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7;
-- *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8;
-- return TRUE;
-- }
-- break;
-- case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
-- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
-- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
-- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
-- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
-- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
-- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
-- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
-- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
-- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
-- uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T);
-- uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T);
-- uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T);
-- uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
-- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
-- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
-- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
-- *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5;
-- *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6;
-- *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7;
-- *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8;
-- return TRUE;
-- }
-- else {
-- // Don't bother unrolling loops here, since that won't help for more than around 8 operations.
-- // Instead, just call multiple fixed functions.
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
-- } else {
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
-- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
-- }
-- return TRUE;
-- }
-- break;
-- case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
-- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
-- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
-- return TRUE;
-- }
-- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
-- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
-- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
-- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
-- uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T);
-- uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T);
-- uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T);
-- uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
-- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
-- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
-- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
-- *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5;
-- *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6;
-- *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7;
-- *(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8;
-- return TRUE;
-- }
-- break;
-- case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T));
-- uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T));
-- uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T));
-- uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
-- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
-- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
-- vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5);
-- vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6);
-- vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7);
-- vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8);
-- return TRUE;
-- }
-- break;
- }
-
- return FALSE;
-@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr
- }
- return TRUE;
- break;
-- case 7: if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- break;
-- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
-- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
-- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- *(uint16_t *) (dst+0*dpitch+0) = src1a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
-- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
-- *(uint16_t *) (dst+1*dpitch+0) = src1b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
-- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
-- return TRUE;
-- }
-- else {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
-- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a;
-- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a;
-- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a;
-- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a;
-- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a;
-- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a;
-- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a;
-- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a;
-- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b;
-- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b;
-- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b;
-- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b;
-- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b;
-- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b;
-- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b;
-- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b;
-- return TRUE;
-- }
-- break;
-- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- *(uint32_t *) (dst+0*dpitch+0) = src1a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a;
-- *(uint32_t *) (dst+1*dpitch+0) = src1b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
-- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a;
-- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a;
-- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a;
-- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a;
-- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
-- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b;
-- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b;
-- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b;
-- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b;
-- return TRUE;
-- }
-- else {
-- // Don't bother unrolling loops, since that won't help for more than around 8 operations.
-- // Instead, just call multiple fixed functions.
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- break;
-- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T);
-- uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T);
-- uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T);
-- uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T);
-- uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T);
-- uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T);
-- uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
-- *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a;
-- *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a;
-- *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a;
-- *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
-- *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b;
-- *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b;
-- *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b;
-- *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- break;
-- case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T));
-- uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T));
-- uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T));
-- uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T));
-- uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T));
-- uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T));
-- uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b);
-- return TRUE;
-- }//HERE
-- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- break;
- }
-
- return FALSE;
-@@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr
- }
- return TRUE;
- break;
-- case 7: if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- break;
-- // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons?
-- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases...
-- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c;
-- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c;
-- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d;
-- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
-- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
-- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
-- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c;
-- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c;
-- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c;
-- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c;
-- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d;
-- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d;
-- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d;
-- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
-- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
-- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- *(uint16_t *) (dst+0*dpitch+0) = src1a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
-- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
-- *(uint16_t *) (dst+1*dpitch+0) = src1b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
-- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
-- *(uint16_t *) (dst+2*dpitch+0) = src1c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c;
-- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c;
-- *(uint16_t *) (dst+3*dpitch+0) = src1d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d;
-- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d;
-- return TRUE;
-- }
-- else {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
-- uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T);
-- uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
-- uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
-- uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T);
-- uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T);
-- uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T);
-- uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T);
-- uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T);
-- uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T);
-- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a;
-- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a;
-- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a;
-- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a;
-- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a;
-- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a;
-- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a;
-- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a;
-- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b;
-- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b;
-- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b;
-- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b;
-- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b;
-- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b;
-- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b;
-- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b;
-- *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c;
-- *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c;
-- *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c;
-- *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T) = src4c;
-- *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T) = src5c;
-- *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T) = src6c;
-- *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T) = src7c;
-- *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T) = src8c;
-- *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d;
-- *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d;
-- *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d;
-- *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T) = src4d;
-- *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T) = src5d;
-- *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T) = src6d;
-- *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T) = src7d;
-- *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T) = src8d;
-- return TRUE;
-- }
-- break;
-- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T);
-- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
-- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
-- uint64_t src3d = *(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T);
-- uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T);
-- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
-- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
-- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
-- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
-- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c;
-- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c;
-- *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T) = src3c;
-- *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T) = src4c;
-- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d;
-- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d;
-- *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T) = src3d;
-- *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T) = src4d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
-- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
-- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- uint32_t src1c = *(uint32_t *) (src+2*spitch+0);
-- uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- uint32_t src1d = *(uint32_t *) (src+3*spitch+0);
-- uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-- uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-- uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-- *(uint32_t *) (dst+0*dpitch+0) = src1a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a;
-- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a;
-- *(uint32_t *) (dst+1*dpitch+0) = src1b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b;
-- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b;
-- *(uint32_t *) (dst+2*dpitch+0) = src1c;
-- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2c;
-- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3c;
-- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5c;
-- *(uint32_t *) (dst+3*dpitch+0) = src1d;
-- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2d;
-- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3d;
-- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
-- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
-- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T);
-- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
-- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
-- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
-- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
-- uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T);
-- uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T);
-- uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T);
-- uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T);
-- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
-- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a;
-- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a;
-- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a;
-- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a;
-- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
-- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b;
-- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b;
-- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b;
-- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b;
-- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c;
-- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c;
-- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c;
-- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c;
-- *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T) = src5c;
-- *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T) = src6c;
-- *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T) = src7c;
-- *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T) = src8c;
-- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d;
-- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d;
-- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d;
-- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d;
-- *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T) = src5d;
-- *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T) = src6d;
-- *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T) = src7d;
-- *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T) = src8d;
-- return TRUE;
-- }
-- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
-- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-- uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-- uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-- uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
-- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-- uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-- uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-- uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-- uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
-- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-- uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-- uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-- uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-- uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
-- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-- uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-- uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-- uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-- uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-- *(uint16_t *) (dst+0*dpitch+0) = src1a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7a;
-- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8a;
-- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9a;
-- *(uint16_t *) (dst+1*dpitch+0) = src1b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7b;
-- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8b;
-- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9b;
-- *(uint16_t *) (dst+2*dpitch+0) = src1c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7c;
-- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8c;
-- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9c;
-- *(uint16_t *) (dst+3*dpitch+0) = src1d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7d;
-- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8d;
-- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9d;
-- return TRUE;
-- }
-- else {
-- // Don't bother unrolling loops, since that won't help for more than around 8 operations.
-- // Instead, just call multiple fixed functions.
-- if (xdir >= 0) {
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- } else {
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-- }
-- return TRUE;
-- }
-- break;
-- // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons?
-- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here...
-- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T));
-- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
-- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
-- uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T));
-- uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T));
-- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
-- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
-- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c);
-- vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d);
-- vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d);
-- return TRUE;
-- }
-- break;
-- }
-+ }
-
- return FALSE;
- }
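The fast paths removed above all branch on SW_CHECK_PITCHED_ALIGNMENT before picking a load/store width; that macro's definition is not part of this patch, so what follows is only a plausible C sketch of such a check, under a hypothetical helper name. The idea is that a wide access at a fixed byte offset is safe on every row only when both base pointers and both pitches keep that offset aligned.

#include <stdint.h>

/* Hypothetical stand-in for SW_CHECK_PITCHED_ALIGNMENT (not the driver's
 * actual macro): nonzero when dst+offset and src+offset are aligned to
 * 'align' bytes and both pitches are multiples of 'align', so the same
 * wide load/store stays aligned on every row of the rectangle. */
static int pitched_alignment_ok(const unsigned char *dst,
                                const unsigned char *src,
                                int dpitch, int spitch,
                                uintptr_t align, uintptr_t offset)
{
    return ((uintptr_t)(dst + offset) % align) == 0 &&
           ((uintptr_t)(src + offset) % align) == 0 &&
           ((uintptr_t)dpitch % align) == 0 &&
           ((uintptr_t)spitch % align) == 0;
}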
-@@ -1924,10 +872,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
- if (rowsOverlap)
- {
- if (w > 64) {
-- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- else if (w == 64) {
-- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- else {
- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
-@@ -1936,10 +886,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
- else
- {
- if (w > 64) {
-- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- else if (w == 64) {
-- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
-+ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
- }
- else {
- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
-@@ -1973,7 +925,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
- if (xdir >= 0 || !rowsOverlap) {
- if (w >= 128) {
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- neon_memcpy(dst, src, w);
-+ //neon_memcpy(dst, src, w);
-+ memcpy(dst, src, w);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else
-@@ -1982,7 +935,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
- else {
- if (w >= 128) {
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- neon_memmove(dst, src, w);
-+ //neon_memmove(dst, src, w);
-+ memmove(dst, src, w);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else
-@@ -2029,7 +983,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
- if (xdir >= 0 || !rowsOverlap) {
- if (w >= 42) {
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
-+ //neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
-+ memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else
-@@ -2038,7 +993,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
- else {
- if (w >= 42) {
- BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-- neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
-+ //neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
-+ memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
- UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
- }
- else
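The C hunks above swap the NEON copy routines for the C library's memcpy and memmove while keeping the existing rule for choosing between them: memmove when a source row and its destination row may share bytes, memcpy otherwise. A minimal sketch of that pattern follows; copy_rows and rows_overlap are hypothetical names standing in for the driver's DRAW_MULTIPLE_ROWS_* machinery, not its actual code.

#include <stddef.h>
#include <string.h>

/* Hypothetical helper: nonzero when the first source and destination rows
 * share any bytes, in which case only memmove copies correctly. */
static int rows_overlap(const unsigned char *dst, const unsigned char *src,
                        size_t row_bytes)
{
    return dst < src + row_bytes && src < dst + row_bytes;
}

/* Copy 'rows' rows of 'row_bytes' bytes each, stepping by the pitches. */
static void copy_rows(unsigned char *dst, const unsigned char *src,
                      size_t row_bytes, int rows, int dpitch, int spitch)
{
    int overlap = rows_overlap(dst, src, row_bytes);
    int y;

    for (y = 0; y < rows; y++) {
        if (overlap)
            memmove(dst, src, row_bytes);   /* rows may alias: memmove is safe */
        else
            memcpy(dst, src, row_bytes);    /* disjoint rows: memcpy is enough */
        dst += dpitch;
        src += spitch;
    }
}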
-diff --git git/src/neon_memcpy.S git/src/neon_memcpy.S
-deleted file mode 100644
-index 5ecc5ce..0000000
---- git/src/neon_memcpy.S
-+++ /dev/null
-@@ -1,549 +0,0 @@
--/***************************************************************************
-- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions are met:
-- * Redistributions of source code must retain the above copyright
-- notice, this list of conditions and the following disclaimer.
-- * Redistributions in binary form must reproduce the above copyright
-- notice, this list of conditions and the following disclaimer in the
-- documentation and/or other materials provided with the distribution.
-- * Neither the name of Code Aurora nor the names of its contributors may
-- be used to endorse or promote products derived from this software
-- without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-- POSSIBILITY OF SUCH DAMAGE.
-- ***************************************************************************/
--
--/***************************************************************************
-- Neon memcpy: Attempts to do a memcpy with Neon registers if possible,
-- Inputs:
-- dest: The destination buffer
-- src: The source buffer
-- n: The size of the buffer to transfer
-- Outputs:
--
--***************************************************************************/
--
--/*
-- * General note:
-- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
-- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
-- * the correct object code for VPOP, resulting in horrific stack crashes.
-- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
-- * and VPOP->VLDMIA. We can revert this back once we update our toolchain.
-- *
-- * Also, VSHL swaps the source register and the shift-amount register
-- * around in 2006-q3. I've coded this incorrectly so it turns out correct
-- * in the object code, but we'll need to undo that later...
-- */
--
-- .code 32
-- .align 4
-- .globl neon_memcpy
-- .func
--
--neon_memcpy:
-- /*
-- * First, make sure we're not copying < 4 bytes. If so, we'll
-- * just handle it here.
-- */
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r0}
--#else
-- push {r0}
--#endif
-- cmp r2, #4
-- bgt neon_gt_4
-- /* Copy 0-4 bytes, if needed, and return.*/
-- cmp r2, #0
--neon_smallcopy_loop:
-- beq neon_smallcopy_done
-- ldrb r12, [r1], #1
-- subs r2, r2, #1
-- strb r12, [r0], #1
-- b neon_smallcopy_loop
--neon_smallcopy_done:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r0}
--#else
-- pop {r0}
--#endif
-- bx lr
--
-- /* Copy 4 or more bytes*/
--neon_gt_4:
-- /* Preload what we can...*/
-- pld [r0,#0]
-- pld [r1,#0]
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r4-r5}
--#else
-- push {r4-r5}
--#endif
--
--neon_check_align:
-- /* Check normal word alignment for target. */
-- ands r12, r0, #0x3
-- beq source_alignment_check
--
-- /*
-- * Target is not aligned. Step through until we get that
-- * word-aligned. This works better than a loop, according
-- * to our pipeline modeler.
-- */
-- cmp r12, #2
-- ldrb r3, [r1], #1
-- ldrleb r4, [r1], #1
-- ldrltb r5, [r1], #1
-- rsb r12, r12, #4
-- sub r2, r2, r12
-- strb r3, [r0], #1
-- strleb r4, [r0], #1
-- strltb r5, [r0], #1
--
--source_alignment_check:
-- ands r12, r1, #0x3
-- bne neon_memcpy_nonaligned /* Source is not word aligned.*/
--neon_try_16_align:
-- cmp r2, #64
-- blt neon_align_route
-- /* This is where we try 16-byte alignment. */
-- ands r12, r0, #0xf
-- beq neon_align_route
-- rsb r12, r12, #16
--neon_16_start:
-- sub r2, r2, r12
-- lsrs r3, r12, #2
--neon_align_16_4:
-- ldr r4, [r1], #4
-- subs r3, r3, #1
-- str r4, [r0], #4
-- bne neon_align_16_4
--neon_align_route:
-- /* In this case, both source and target are word-aligned. */
-- cmp r2, #32768
-- bge neon_copy_128p_a
-- cmp r2, #256
-- bge neon_copy_128_a
-- cmp r2, #64
-- bge neon_copy_32_a
-- b neon_copy_finish_a
-- nop
--neon_copy_128p_a:
-- /* We'll copy blocks 128-bytes at a time, but try to call pld to
-- * load in the next page, if possible.
-- */
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4-q7}
--#else
-- vpush {q4-q7}
--#endif
-- mov r12, r2, lsr #7
--neon_copy_128p_loop_a:
-- vld1.32 {q0, q1}, [r1]!
-- vld1.32 {q2, q3}, [r1]!
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- pld [r1, #0]
-- pld [r1, #1024]
-- vst1.32 {q0, q1}, [r0]!
-- vst1.32 {q2, q3}, [r0]!
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- subs r12, r12, #1
-- bne neon_copy_128p_loop_a
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4-q7}
--#else
-- vpop {q4-q7}
--#endif
-- ands r2, r2, #0x7f
-- beq neon_end
-- cmp r2, #32
-- blt neon_copy_finish_a
-- b neon_copy_32_a
-- /* Copy blocks of 128-bytes (word-aligned) at a time*/
--neon_copy_128_a:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4-q7}
--#else
-- vpush {q4-q7}
--#endif
-- /*
-- * Move to a 1-s based countdown to determine when to loop. That
-- * allows the subs to set the Z flag without having to explicitly
-- * call cmp to a value.
-- */
-- mov r12, r2, lsr #7
--neon_copy_128_loop_a:
-- vld1.32 {q0, q1}, [r1]!
-- vld1.32 {q2, q3}, [r1]!
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- pld [r1, #0]
-- pld [r1, #128]
-- vst1.32 {q0, q1}, [r0]!
-- vst1.32 {q2, q3}, [r0]!
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- subs r12, r12, #1
-- pld [r0, #0]
-- pld [r0, #128]
-- bne neon_copy_128_loop_a
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4-q7}
--#else
-- vpop {q4-q7}
--#endif
-- ands r2, r2, #0x7f
-- beq neon_end
-- cmp r2, #32
-- blt neon_copy_finish_a
-- /* Copy blocks of 32-bytes (word aligned) at a time*/
--neon_copy_32_a:
-- mov r12, r2, lsr #5
--neon_copy_32_loop_a:
-- vld1.32 {q0,q1}, [r1]!
-- subs r12, r12, #1
-- pld [r1,#0]
-- vst1.32 {q0,q1}, [r0]!
-- bne neon_copy_32_loop_a
-- ands r2, r2, #0x1f
-- beq neon_end
--neon_copy_finish_a:
--neon_copy_16_a:
-- movs r12, r2, lsr #4
-- beq neon_copy_8_a
--neon_copy_16_a_loop:
-- vld1.32 {q0}, [r1]!
-- subs r12, r12, #1
-- vst1.32 {q0}, [r0]!
-- bne neon_copy_16_a_loop
-- ands r2, r2, #0xf
-- beq neon_end
--neon_copy_8_a:
-- cmp r2, #8
-- blt neon_copy_4_a
-- ldm r1!, {r4-r5}
-- subs r2, r2, #8
-- stm r0!, {r4-r5}
-- /* Copy 4-bytes of word-aligned data at a time*/
--neon_copy_4_a:
-- cmp r2, #4
-- blt neon_copy_finish
-- ldr r4, [r1], #4
-- subs r2, r2, #4
-- str r4, [r0], #4
-- b neon_copy_finish
--
-- /*
-- * Handle unaligned data. The basic concept here is that we'll
-- * try to pull out enough data from the source to get that word-
-- * aligned, then do our writes word-aligned, storing the difference
-- * in a register, and shifting the data as needed.
-- */
--neon_memcpy_nonaligned:
-- /*
-- * If this is <8 bytes, it makes more sense to just copy it
-- * quickly instead of incurring all kinds of overhead.
-- */
-- cmp r2, #8 /* Let's try this...*/
-- ble neon_copy_finish
-- /*
-- * This is where we'll pull out either 1, 2, or 3 bytes of data
-- * from the source as needed to align it, then store off those
-- * bytes in r4. When we read in the (now) aligned data from the
-- * source, we'll shift the bytes and AND in the r4 data, then write
-- * to the target aligned.
-- *
-- * The conditional ldr calls work slightly faster than the
-- * previous method, confirmed by our pipeline modeler.
-- */
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r6-r9}
--#else
-- push {r6-r9}
--#endif
-- cmp r12, #2
-- ldrb r4, [r1], #1
-- ldrleb r5, [r1], #1
-- ldrltb r6, [r1], #1
-- rsb r8, r12, #4
-- sub r2, r2, r8
-- lsl r8, r8, #3
-- orrle r4, r4, r5, lsl #8
-- orrlt r4, r4, r6, lsl #16
-- rsb r9, r8, #32
--
-- cmp r2, #64
-- blt neon_unaligned_route
-- ands r12, r0, #0xf
-- beq neon_unaligned_route
-- rsb r12, r12, #16
--neon_16_start_u:
-- sub r2, r2, r12
-- lsrs r6, r12, #2
--neon_align_16_4_u:
-- ldr r5, [r1], #4
-- subs r6, r6, #1
-- orr r4, r4, r5, lsl r8
-- str r4, [r0], #4
-- mov r4, r5, lsr r9
-- bne neon_align_16_4_u
--neon_unaligned_route:
-- /* Decide which loop block to branch to.*/
-- cmp r2, #256
-- bge neon_copy_64_u
-- cmp r2, #64
-- bge neon_copy_32_u
-- b neon_copy_finish_u
-- /* Copy data in 64-byte blocks.*/
--neon_copy_64_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
-- vstmdb sp!, {q5-q8}
--#else
-- vpush {q4}
-- vpush {q5-q8}
--#endif
-- /* We'll need this for the q register shift later.*/
-- vdup.u32 q8, r8
-- /*
-- * As above, we determine how many times we can go through the
-- * 64-byte copy loop, then countdown.
-- */
-- mov r12, r2, lsr #6
-- and r2, r2, #0x3f
--neon_copy_64_u_loop:
-- /* Load 64-bytes into q4-q7.*/
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- /*
-- * Shift q0-q3 right so everything but the data we need due to the
-- * alignment falls off the right-hand side. The branching
-- * is needed, since vshr requires the shift to be an immediate
-- * value.
-- */
-- lsls r5, r8, #28
-- bcc neon_copy_64_u_b8
-- bpl neon_copy_64_u_b16
-- vshr.u64 q0, q4, #40
-- vshr.u64 q1, q5, #40
-- vshr.u64 q2, q6, #40
-- vshr.u64 q3, q7, #40
-- b neon_copy_64_unify
--neon_copy_64_u_b8:
-- vshr.u64 q0, q4, #56
-- vshr.u64 q1, q5, #56
-- vshr.u64 q2, q6, #56
-- vshr.u64 q3, q7, #56
-- b neon_copy_64_unify
--neon_copy_64_u_b16:
-- vshr.u64 q0, q4, #48
-- vshr.u64 q1, q5, #48
-- vshr.u64 q2, q6, #48
-- vshr.u64 q3, q7, #48
--neon_copy_64_unify:
-- /*
-- * Shift q4-q7 left by r8 bits to take the alignment into
-- * account.
-- */
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q4, q8, q4
-- vshl.u64 q5, q8, q5
-- vshl.u64 q6, q8, q6
-- vshl.u64 q7, q8, q7
--#else
-- vshl.u64 q4, q4, q8
-- vshl.u64 q5, q5, q8
-- vshl.u64 q6, q6, q8
-- vshl.u64 q7, q7, q8
--#endif
-- /*
-- * The data in s14 will be needed for the next loop iteration. Move
-- * that to r5.
-- */
-- vmov r5, s14
-- /* We'll vorr the shifted data with the data that needs to move back.*/
-- vorr d9, d9, d0
-- /* Copy the data from the previous loop into s14.*/
-- vmov s14, r4
-- vorr d10, d10, d1
-- vorr d11, d11, d2
-- vorr d12, d12, d3
-- vorr d13, d13, d4
-- vorr d14, d14, d5
-- vorr d15, d15, d6
-- vorr d8, d8, d7
-- subs r12, r12, #1
-- pld [r1, #0]
-- pld [r1, #128]
-- /* Save off the r5 data into r4 for the next iteration.*/
-- mov r4, r5
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- pld [r0, #0]
-- pld [r0, #128]
-- bne neon_copy_64_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q5-q8}
-- vldmia sp!, {q4}
--#else
-- vpop {q5-q8}
-- vpop {q4}
--#endif
-- cmp r2, #32
-- bge neon_copy_32_u
-- b neon_copy_finish_u
--neon_copy_32_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
--#else
-- vpush {q4}
--#endif
-- vdup.u32 q4, r8
-- mov r12, r2, lsr #5
-- and r2, r2, #0x1f
--neon_copy_32_u_loop:
-- vld1.32 {q0, q1}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_copy_32_u_b8
-- bpl neon_copy_32_u_b16
-- vshr.u64 q2, q0, #40
-- vshr.u64 q3, q1, #40
-- b neon_copy_32_unify
--neon_copy_32_u_b8:
-- vshr.u64 q2, q0, #56
-- vshr.u64 q3, q1, #56
-- b neon_copy_32_unify
--neon_copy_32_u_b16:
-- vshr.u64 q2, q0, #48
-- vshr.u64 q3, q1, #48
--neon_copy_32_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q0, q4, q0
-- vshl.u64 q1, q4, q1
--#else
-- vshl.u64 q0, q0, q4
-- vshl.u64 q1, q1, q4
--#endif
-- vmov r5, s14
-- vorr d1, d1, d4
-- vmov s14, r4
-- vorr d2, d2, d5
-- vorr d3, d3, d6
-- vorr d0, d0, d7
-- subs r12, r12, #1
-- pld [r1, #0]
-- mov r4, r5
-- vst1.32 {q0, q1}, [r0]!
-- bne neon_copy_32_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4}
--#else
-- vpop {q4}
--#endif
--neon_copy_finish_u:
--neon_copy_16_u:
-- movs r12, r2, lsr #4
-- beq neon_copy_8_u
-- vdup.u32 q2, r8
-- and r2, r2, #0xf
--neon_copy_16_u_loop:
-- vld1.32 {q0}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_copy_16_u_b8
-- bpl neon_copy_16_u_b16
-- vshr.u64 q1, q0, #40
-- b neon_copy_16_unify
--neon_copy_16_u_b8:
-- vshr.u64 q1, q0, #56
-- b neon_copy_16_unify
--neon_copy_16_u_b16:
-- vshr.u64 q1, q0, #48
--neon_copy_16_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q0, q2, q0
--#else
-- vshl.u64 q0, q0, q2
--#endif
-- vmov r5, s6
-- vorr d1, d1, d2
-- vmov s6, r4
-- vorr d0, d0, d3
-- subs r12, r12, #1
-- mov r4, r5
-- vst1.32 {q0}, [r0]!
-- bne neon_copy_16_u_loop
--neon_copy_8_u:
-- cmp r2, #8
-- blt neon_copy_4_u
-- ldm r1!, {r6-r7}
-- subs r2, r2, #8
-- orr r4, r4, r6, lsl r8
-- mov r5, r6, lsr r9
-- orr r5, r5, r7, lsl r8
-- stm r0!, {r4-r5}
-- mov r4, r7, lsr r9
--neon_copy_4_u:
-- cmp r2, #4
-- blt neon_copy_last_bits_u
-- ldr r5, [r1], #4
-- subs r2, r2, #4
-- orr r4, r4, r5, lsl r8
-- str r4, [r0], #4
-- mov r4, r5, lsr r9
--neon_copy_last_bits_u:
-- /*
-- * Remember, r8 contains the size of the data in r4 in bits,
-- * so to get to bytes we'll need to shift 3 places
-- */
-- lsr r8, r8, #0x3
-- /* Write out the bytes stored in r4.*/
--neon_copy_last_bits_u_loop:
-- strb r4, [r0], #1
-- subs r8, r8, #1
-- lsrne r4, r4, #8
-- bne neon_copy_last_bits_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r6-r9}
--#else
-- pop {r6-r9}
--#endif
--neon_copy_finish:
-- cmp r2, #0
-- beq neon_end
-- /*
-- * This just copies the data from source to target one byte
-- * at a time. For some small values, this makes more sense.
-- * Note that since this code copies data a byte at a time,
-- * both the aligned and unaligned paths can use it.
-- */
--neon_copy_finish_loop:
-- ldrb r4, [r1], #1
-- subs r2, r2, #1
-- strb r4, [r0], #1
-- bne neon_copy_finish_loop
--neon_end:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r4-r5}
-- ldmia sp!, {r0}
--#else
-- pop {r4-r5}
-- pop {r0}
--#endif
-- bx lr
--
-- .endfunc
-- .end
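The unaligned path of the deleted neon_memcpy.S above is explained in its own comments: pull one to three bytes from the source so later loads are word aligned, then build each output word from the saved bytes and the next aligned word using shifts and ORs. A rough C sketch of that scheme is given below; it assumes a little-endian target (as on these ARM parts), uses hypothetical names, and leaves the final one-to-three carried bytes unwritten, so it only shows the inner loop.

#include <stdint.h>
#include <string.h>

/* Sketch of the shift-and-merge loop for a source misaligned by 1..3 bytes:
 * 'lead' bytes are gathered up front, then every aligned 32-bit word is
 * split between the current output word and a carry for the next one. */
static void merge_copy_words(unsigned char *dst, const unsigned char *src,
                             size_t nwords)
{
    unsigned misalign = (uintptr_t)src & 3;   /* assumed to be 1, 2 or 3 here */
    unsigned lead = 4 - misalign;             /* bytes read to align the source */
    unsigned lo_bits = 8 * lead;              /* 8, 16 or 24; never 0 or 32 */
    unsigned hi_bits = 32 - lo_bits;
    uint32_t carry = 0;
    uint32_t word, out;
    size_t i;

    memcpy(&carry, src, lead);                /* leading bytes: low part (LE) */
    src += lead;

    for (i = 0; i < nwords; i++) {
        memcpy(&word, src, 4);                /* source is now word aligned */
        out = carry | (word << lo_bits);      /* merge saved bytes with new word */
        memcpy(dst, &out, 4);
        carry = word >> hi_bits;              /* spill-over for the next word */
        src += 4;
        dst += 4;
    }
    /* 'lead' bytes remain in 'carry'; the real routine writes them out last. */
}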
-diff --git git/src/neon_memmove.S git/src/neon_memmove.S
-deleted file mode 100644
-index 1bfe597..0000000
---- git/src/neon_memmove.S
-+++ /dev/null
-@@ -1,939 +0,0 @@
--/***************************************************************************
-- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions are met:
-- * Redistributions of source code must retain the above copyright
-- notice, this list of conditions and the following disclaimer.
-- * Redistributions in binary form must reproduce the above copyright
-- notice, this list of conditions and the following disclaimer in the
-- documentation and/or other materials provided with the distribution.
-- * Neither the name of Code Aurora nor the names of its contributors may
-- be used to endorse or promote products derived from this software
-- without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-- POSSIBILITY OF SUCH DAMAGE.
-- ***************************************************************************/
--
--/***************************************************************************
-- * Neon memmove: Attempts to do a memmove with Neon registers if possible,
-- * Inputs:
-- * dest: The destination buffer
-- * src: The source buffer
-- * n: The size of the buffer to transfer
-- * Outputs:
-- *
-- ***************************************************************************/
--
--/*
-- * General note:
-- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
-- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
-- * the correct object code for VPOP, resulting in horrific stack crashes.
-- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
-- * and VPOP->VLDMIA. We can revert this back once we update our toolchain.
-- *
-- * Also, VSHL swaps the source register and the shift-amount register
-- * around in 2006-q3. I've coded this incorrectly so it turns out correct
-- * in the object code, but we'll need to undo that later...
-- */
-- .code 32
-- .align 4
-- .globl neon_memmove
-- .func
--
--neon_memmove:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r0}
--#else
-- push {r0}
--#endif
--
-- /*
-- * The requirements for memmove state that the function should
-- * operate as if data were being copied from the source to a
-- * buffer, then to the destination. This is to allow a user
-- * to copy data from a source and target that overlap.
-- *
-- * We can't just do byte copies front-to-back automatically, since
-- * there's a good chance we may have an overlap (why else would someone
-- * intentionally use memmove then?).
-- *
-- * We'll break this into two parts. Front-to-back, or back-to-front
-- * copies.
-- */
--neon_memmove_cmf:
-- cmp r0, r1
-- blt neon_front_to_back_copy
-- bgt neon_back_to_front_copy
-- b neon_memmove_done
--
-- /* #############################################################
-- * Front to Back copy
-- */
--neon_front_to_back_copy:
-- /*
-- * For small copies, just do a quick memcpy. We can do this for
-- * front-to-back copies, aligned or unaligned, since we're only
-- * doing 1 byte at a time...
-- */
-- cmp r2, #4
-- bgt neon_f2b_gt4
-- cmp r2, #0
--neon_f2b_smallcopy_loop:
-- beq neon_memmove_done
-- ldrb r12, [r1], #1
-- subs r2, r2, #1
-- strb r12, [r0], #1
-- b neon_f2b_smallcopy_loop
--neon_f2b_gt4:
-- /* Preload what we can...*/
-- pld [r0,#0]
-- pld [r1,#0]
-- /* The window size is in r3. */
-- sub r3, r1, r0
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r4-r6}
--#else
-- push {r4-r6}
--#endif
--
--neon_f2b_check_align:
-- /* Check alignment. */
-- ands r12, r0, #0x3
-- beq neon_f2b_source_align_check
-- cmp r12, #2
-- ldrb r4, [r1], #1
-- ldrleb r5, [r1], #1
-- ldrltb r6, [r1], #1
-- rsb r12, r12, #4
-- sub r2, r2, r12
-- strb r4, [r0], #1
-- strleb r5, [r0], #1
-- strltb r6, [r0], #1
--
--neon_f2b_source_align_check:
-- ands r12, r1, #0x3
-- bne neon_f2b_nonaligned
--
--neon_f2b_try_16_align:
-- /* If we're >64, attempt to align on 16-bytes. Smaller amounts
-- * don't seem to be worth handling. */
-- cmp r2, #64
-- blt neon_f2b_align_route
-- /* This is where we try 16-byte alignment. */
-- ands r12, r0, #0xf
-- beq neon_f2b_align_route
-- rsb r12, r12, #16
--neon_f2b_16_start:
-- sub r2, r2, r12
-- lsrs r5, r12, #2
--neon_f2b_align_16_4:
-- ldr r4, [r1], #4
-- subs r5, r5, #1
-- str r4, [r0], #4
-- bne neon_f2b_align_16_4
--neon_f2b_align_route:
-- /* #############################################################
-- * Front to Back copy - aligned
-- */
-- /*
-- * Note that we can't just route based on the size in r2. If that's
-- * larger than the overlap window in r3, we could potentially
-- * (and likely!) destroy data we're copying.
-- */
-- cmp r2, r3
-- movle r12, r2
-- movgt r12, r3
-- cmp r12, #256
-- bge neon_f2b_copy_128_a
-- cmp r12, #64
-- bge neon_f2b_copy_32_a
-- cmp r12, #16
-- bge neon_f2b_copy_16_a
-- cmp r12, #8
-- bge neon_f2b_copy_8_a
-- cmp r12, #4
-- bge neon_f2b_copy_4_a
-- b neon_f2b_copy_1_a
--neon_f2b_copy_128_a:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4-q7}
--#else
-- vpush {q4-q7}
--#endif
-- mov r12, r2, lsr #7
--neon_f2b_copy_128_a_loop:
-- vld1.32 {q0,q1}, [r1]!
-- vld1.32 {q2,q3}, [r1]!
-- vld1.32 {q4,q5}, [r1]!
-- vld1.32 {q6,q7}, [r1]!
-- pld [r1, #0]
-- pld [r1, #128]
-- vst1.32 {q0,q1}, [r0]!
-- vst1.32 {q2,q3}, [r0]!
-- vst1.32 {q4,q5}, [r0]!
-- vst1.32 {q6,q7}, [r0]!
-- subs r12, r12, #1
-- pld [r0, #0]
-- pld [r0, #128]
-- bne neon_f2b_copy_128_a_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4-q7}
--#else
-- vpop {q4-q7}
--#endif
-- ands r2, r2, #0x7f
-- beq neon_f2b_finish
-- cmp r2, #32
-- bge neon_f2b_copy_32_a
-- b neon_f2b_copy_finish_a
--neon_f2b_copy_32_a:
-- mov r12, r2, lsr #5
--neon_f2b_copy_32_a_loop:
-- vld1.32 {q0,q1}, [r1]!
-- subs r12, r12, #1
-- pld [r1, #0]
-- vst1.32 {q0,q1}, [r0]!
-- bne neon_f2b_copy_32_a_loop
-- ands r2, r2, #0x1f
-- beq neon_f2b_finish
--neon_f2b_copy_finish_a:
--neon_f2b_copy_16_a:
-- movs r12, r2, lsr #4
-- beq neon_f2b_copy_8_a
--neon_f2b_copy_16_a_loop:
-- vld1.32 {q0}, [r1]!
-- subs r12, r12, #1
-- vst1.32 {q0}, [r0]!
-- bne neon_f2b_copy_16_a_loop
-- ands r2, r2, #0xf
-- beq neon_f2b_finish
--neon_f2b_copy_8_a:
-- cmp r2, #8
-- blt neon_f2b_copy_4_a
-- ldm r1!, {r4-r5}
-- subs r2, r2, #8
-- stm r0!, {r4-r5}
--neon_f2b_copy_4_a:
-- cmp r2, #4
-- blt neon_f2b_copy_1_a
-- ldr r4, [r1], #4
-- subs r2, r2, #4
-- str r4, [r0], #4
--neon_f2b_copy_1_a:
-- cmp r2, #0
-- beq neon_f2b_finish
--neon_f2b_copy_1_a_loop:
-- ldrb r12, [r1], #1
-- subs r2, r2, #1
-- strb r12, [r0], #1
-- bne neon_f2b_copy_1_a_loop
-- b neon_f2b_finish
--
-- /* #############################################################
-- * Front to Back copy - unaligned
-- */
--neon_f2b_nonaligned:
-- /*
-- * For sizes < 8, does it really make sense to do the whole shift
-- * party? Note that we DON'T want to call neon_f2b_copy_1_u,
-- * since we'll end up trying to pop r8-r11, and we DON'T want
-- * to do that...
-- */
-- cmp r2, #8
-- ble neon_f2b_copy_1_a
--
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r7-r9}
--#else
-- push {r7-r9}
--#endif
-- cmp r12, #2
-- ldrb r4, [r1], #1
-- ldrleb r5, [r1], #1
-- ldrltb r6, [r1], #1
-- rsb r8, r12, #4
-- sub r2, r2, r8
-- lsl r8, r8, #3
-- orrle r4, r4, r5, lsl #8
-- orrlt r4, r4, r6, lsl #16
-- rsb r9, r8, #32
-- /*
-- * r4 = overflow bits
-- * r8 = # of bits we copied into the r4 register to align source.
-- * r9 = 32 - r8
-- * r12 = Index counter for each size, so we determine how many times
-- * the given size will go into r2, then count down that # of
-- * times in r12.
-- */
-- cmp r2, #64
-- blt neon_f2b_unaligned_route
-- ands r12, r0, #0xf
-- beq neon_f2b_unaligned_route
-- cmp r3, #4
-- blt neon_f2b_unaligned_route
-- rsb r12, r12, #16
--neon_f2b_16_start_u:
-- sub r2, r2, r12
-- lsrs r6, r12, #2
--neon_f2b_align_16_4_u:
-- ldr r5, [r1], #4
-- subs r6, r6, #1
-- orr r4, r4, r5, lsl r8
-- str r4, [r0], #4
-- mov r4, r5, lsr r9
-- bne neon_f2b_align_16_4_u
--neon_f2b_unaligned_route:
-- cmp r2, r3
-- movle r12, r2
-- movgt r12, r3
-- cmp r12, #256
-- bge neon_f2b_copy_64_u
-- cmp r12, #64
-- bge neon_f2b_copy_32_u
-- cmp r12, #16
-- bge neon_f2b_copy_16_u
-- cmp r12, #8
-- bge neon_f2b_copy_8_u
-- cmp r12, #4
-- bge neon_f2b_copy_4_u
-- b neon_f2b_last_bits_u
--neon_f2b_copy_64_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
-- vstmdb sp!, {q5-q8}
--#else
-- vpush {q4}
-- vpush {q5-q8}
--#endif
-- vdup.u32 q8, r8
-- mov r12, r2, lsr #6
-- and r2, r2, #0x3f
--neon_f2b_copy_64_u_loop:
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_f2b_copy_64_u_b8
-- bpl neon_f2b_copy_64_u_b16
-- vshr.u64 q0, q4, #40
-- vshr.u64 q1, q5, #40
-- vshr.u64 q2, q6, #40
-- vshr.u64 q3, q7, #40
-- b neon_f2b_copy_64_unify
--neon_f2b_copy_64_u_b8:
-- vshr.u64 q0, q4, #56
-- vshr.u64 q1, q5, #56
-- vshr.u64 q2, q6, #56
-- vshr.u64 q3, q7, #56
-- b neon_f2b_copy_64_unify
--neon_f2b_copy_64_u_b16:
-- vshr.u64 q0, q4, #48
-- vshr.u64 q1, q5, #48
-- vshr.u64 q2, q6, #48
-- vshr.u64 q3, q7, #48
--neon_f2b_copy_64_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q4, q8, q4
-- vshl.u64 q5, q8, q5
-- vshl.u64 q6, q8, q6
-- vshl.u64 q7, q8, q7
--#else
-- vshl.u64 q4, q4, q8
-- vshl.u64 q5, q5, q8
-- vshl.u64 q6, q6, q8
-- vshl.u64 q7, q7, q8
--#endif
-- vmov r5, s14
-- vorr d9, d9, d0
-- vmov s14, r4
-- vorr d10, d10, d1
-- vorr d11, d11, d2
-- vorr d12, d12, d3
-- vorr d13, d13, d4
-- vorr d14, d14, d5
-- vorr d15, d15, d6
-- vorr d8, d8, d7
-- subs r12, r12, #1
-- pld [r1, #0]
-- pld [r1, #128]
-- mov r4, r5
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- pld [r0, #0]
-- pld [r0, #128]
-- bne neon_f2b_copy_64_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q5-q8}
-- vldmia sp!, {q4}
--#else
-- vpop {q5-q8}
-- vpop {q4}
--#endif
-- cmp r2, #32
-- bge neon_f2b_copy_32_u
-- b neon_f2b_copy_finish_u
--neon_f2b_copy_32_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
--#else
-- vpush {q4}
--#endif
-- vdup.u32 q4, r8
-- mov r12, r2, lsr #5
-- and r2, r2, #0x1f
--neon_f2b_copy_32_u_loop:
-- vld1.32 {q0, q1}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_f2b_copy_32_u_b8
-- bpl neon_f2b_copy_32_u_b16
-- vshr.u64 q2, q0, #40
-- vshr.u64 q3, q1, #40
-- b neon_f2b_copy_32_unify
--neon_f2b_copy_32_u_b8:
-- vshr.u64 q2, q0, #56
-- vshr.u64 q3, q1, #56
-- b neon_f2b_copy_32_unify
--neon_f2b_copy_32_u_b16:
-- vshr.u64 q2, q0, #48
-- vshr.u64 q3, q1, #48
--neon_f2b_copy_32_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q0, q4, q0
-- vshl.u64 q1, q4, q1
--#else
-- vshl.u64 q0, q0, q4
-- vshl.u64 q1, q1, q4
--#endif
-- vmov r5, s14
-- vorr d1, d1, d4
-- vmov s14, r4
-- vorr d2, d2, d5
-- vorr d3, d3, d6
-- vorr d0, d0, d7
-- subs r12, r12, #1
-- pld [r1, #0]
-- mov r4, r5
-- vst1.32 {q0, q1}, [r0]!
-- bne neon_f2b_copy_32_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4}
--#else
-- vpop {q4}
--#endif
--neon_f2b_copy_finish_u:
--neon_f2b_copy_16_u:
-- movs r12, r2, lsr #4
-- beq neon_f2b_copy_8_u
-- vdup.u32 q2, r8
-- and r2, r2, #0xf
--neon_f2b_copy_16_u_loop:
-- vld1.32 {q0}, [r1]!
-- lsls r5, r8, #28
-- bcc neon_f2b_copy_16_u_b8
-- bpl neon_f2b_copy_16_u_b16
-- vshr.u64 q1, q0, #40
-- b neon_f2b_copy_16_unify
--neon_f2b_copy_16_u_b8:
-- vshr.u64 q1, q0, #56
-- b neon_f2b_copy_16_unify
--neon_f2b_copy_16_u_b16:
-- vshr.u64 q1, q0, #48
--neon_f2b_copy_16_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q0, q2, q0
--#else
-- vshl.u64 q0, q0, q2
--#endif
-- vmov r5, s6
-- vorr d1, d1, d2
-- vmov s6, r4
-- vorr d0, d0, d3
-- subs r12, r12, #1
-- mov r4, r5
-- vst1.32 {q0}, [r0]!
-- bne neon_f2b_copy_16_u_loop
--neon_f2b_copy_8_u:
-- cmp r2, #8
-- blt neon_f2b_copy_4_u
-- ldm r1!, {r6-r7}
-- subs r2, r2, #8
-- orr r4, r4, r6, lsl r8
-- mov r5, r6, lsr r9
-- orr r5, r5, r7, lsl r8
-- stm r0!, {r4-r5}
-- mov r4, r7, lsr r9
--neon_f2b_copy_4_u:
-- cmp r2, #4
-- blt neon_f2b_last_bits_u
-- ldr r5, [r1], #4
-- subs r2, r2, #4
-- orr r4, r4, r5, lsl r8
-- str r4, [r0], #4
-- mov r4, r5, lsr r9
--neon_f2b_last_bits_u:
-- lsr r8, r8, #0x3
--neon_f2b_last_bits_u_loop:
-- strb r4, [r0], #1
-- subs r8, r8, #1
-- lsr r4, r4, #8
-- bne neon_f2b_last_bits_u_loop
--neon_f2b_copy_1_u:
-- cmp r2, #0
-- beq neon_f2b_finish_u
--neon_f2b_copy_1_u_loop:
-- ldrb r12, [r1], #1
-- subs r2, r2, #1
-- strb r12, [r0], #1
-- bne neon_f2b_copy_1_u_loop
--neon_f2b_finish_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r7-r9}
--#else
-- pop {r7-r9}
--#endif
-- /* #############################################################
-- * Front to Back copy - finish
-- */
--neon_f2b_finish:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r4-r6}
--#else
-- pop {r4-r6}
--#endif
-- b neon_memmove_done
--
-- /* #############################################################
-- * Back to Front copy
-- */
--neon_back_to_front_copy:
-- /*
-- * Here, we'll want to shift to the end of the buffers. This
-- * actually points us one past where we need to go, but since
-- * we'll pre-decrement throughout, this will be fine.
-- */
-- add r0, r0, r2
-- add r1, r1, r2
-- cmp r2, #4
-- bgt neon_b2f_gt4
-- cmp r2, #0
--neon_b2f_smallcopy_loop:
-- beq neon_memmove_done
-- ldrb r12, [r1, #-1]!
-- subs r2, r2, #1
-- strb r12, [r0, #-1]!
-- b neon_b2f_smallcopy_loop
--neon_b2f_gt4:
-- pld [r0, #0]
-- pld [r1, #0]
-- /*
-- * The minimum of the overlap window size and the copy size
-- * is in r3.
-- */
-- sub r3, r0, r1
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r4-r5}
--#else
-- push {r4-r5}
--#endif
--
-- /*
-- * Check alignment. Since we'll pre-decrement as we step thru, we'll
-- * need to make sure we're on word-alignment.
-- */
--neon_b2f_check_align:
-- ands r12, r0, #0x3
-- beq neon_b2f_source_align_check
-- sub r2, r2, r12
--neon_b2f_shift_align:
-- ldrb r4, [r1, #-1]!
-- subs r12, r12, #1
-- strb r4, [r0, #-1]!
-- bne neon_b2f_shift_align
--neon_b2f_source_align_check:
-- ands r4, r1, #0x3
-- bne neon_b2f_nonaligned
--
--neon_b2f_try_16_align:
-- /* If we're >64, attempt to align on 16-bytes. Smaller amounts
-- * don't seem to be worth handling. */
-- cmp r2, #64
-- blt neon_b2f_align_route
-- ands r12, r0, #0xf
-- beq neon_b2f_align_route
-- /* In this case, r12 has the number of bytes to roll backward. */
--neon_b2f_16_start:
-- sub r2, r2, r12
-- lsrs r5, r12, #2
--neon_b2f_align_16_4:
-- ldr r4, [r1, #-4]!
-- subs r5, r5, #1
-- str r4, [r0, #-4]!
-- bne neon_b2f_align_16_4
--neon_b2f_align_route:
-- /*
-- * #############################################################
-- * Back to Front copy - aligned
-- */
-- cmp r2, r3
-- movle r12, r2
-- movgt r12, r3
-- cmp r12, #256
-- bge neon_b2f_copy_128_a
-- cmp r12, #64
-- bge neon_b2f_copy_32_a
-- cmp r12, #8
-- bge neon_b2f_copy_8_a
-- cmp r12, #4
-- bge neon_b2f_copy_4_a
-- b neon_b2f_copy_1_a
--neon_b2f_copy_128_a:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4-q7}
--#else
-- vpush {q4-q7}
--#endif
-- movs r12, r2, lsr #7
-- /*
-- * This irks me. There MUST be a better way to read these in and
-- * scan the register backward instead of making it go forward. Then
-- * we need to do two subtractions...
-- */
--neon_b2f_copy_128_a_loop:
-- sub r1, r1, #128
-- sub r0, r0, #128
-- vld1.32 {q0, q1}, [r1]!
-- vld1.32 {q2, q3}, [r1]!
-- vld1.32 {q4, q5}, [r1]!
-- vld1.32 {q6, q7}, [r1]!
-- pld [r1, #-128]
-- pld [r1, #-256]
-- vst1.32 {q0, q1}, [r0]!
-- vst1.32 {q2, q3}, [r0]!
-- vst1.32 {q4, q5}, [r0]!
-- vst1.32 {q6, q7}, [r0]!
-- subs r12, r12, #1
-- pld [r0, #-128]
-- pld [r0, #-256]
-- sub r1, r1, #128
-- sub r0, r0, #128
-- bne neon_b2f_copy_128_a_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4-q7}
--#else
-- vpop {q4-q7}
--#endif
-- ands r2, r2, #0x7f
-- beq neon_b2f_finish
-- cmp r2, #32
-- bge neon_b2f_copy_32_a
-- b neon_b2f_copy_finish_a
--neon_b2f_copy_32_a:
-- mov r12, r2, lsr #5
--neon_b2f_copy_32_a_loop:
-- sub r1, r1, #32
-- sub r0, r0, #32
-- vld1.32 {q0,q1}, [r1]
-- subs r12, r12, #1
-- vst1.32 {q0,q1}, [r0]
-- pld [r1, #0]
-- bne neon_b2f_copy_32_a_loop
-- ands r2, r2, #0x1f
-- beq neon_b2f_finish
--neon_b2f_copy_finish_a:
--neon_b2f_copy_8_a:
-- movs r12, r2, lsr #0x3
-- beq neon_b2f_copy_4_a
--neon_b2f_copy_8_a_loop:
-- ldmdb r1!, {r4-r5}
-- subs r12, r12, #1
-- stmdb r0!, {r4-r5}
-- bne neon_b2f_copy_8_a_loop
-- and r2, r2, #0x7
--neon_b2f_copy_4_a:
-- movs r12, r2, lsr #0x2
-- beq neon_b2f_copy_1_a
-- and r2, r2, #0x3
--neon_b2f_copy_4_a_loop:
-- ldr r4, [r1, #-4]!
-- subs r12, r12, #1
-- str r4, [r0, #-4]!
-- bne neon_b2f_copy_4_a_loop
--neon_b2f_copy_1_a:
-- cmp r2, #0
-- beq neon_b2f_finish
--neon_b2f_copy_1_a_loop:
-- ldrb r12, [r1, #-1]!
-- subs r2, r2, #1
-- strb r12, [r0, #-1]!
-- bne neon_b2f_copy_1_a_loop
--
-- /* #############################################################
-- * Back to Front copy - unaligned
-- */
--neon_b2f_nonaligned:
-- /*
-- * For sizes < 8, does it really make sense to do the whole shift
-- * party?
-- */
-- cmp r2, #8
-- ble neon_b2f_copy_1_a
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- stmdb sp!, {r6-r11}
--#else
-- push {r6-r11}
--#endif
-- /*
-- * r3 = max window size
-- * r4 = overflow bytes
-- * r5 = bytes we're reading into
-- * r6 = # bytes we're off.
-- * r10 = copy of r6
-- */
-- and r6, r1, #0x3
-- eor r4, r4, r4
-- mov r10, r6
--neon_b2f_realign:
-- ldrb r5, [r1, #-1]!
-- subs r6, r6, #1
-- orr r4, r5, r4, lsl #8
-- bne neon_b2f_realign
-- /*
-- * r10 = # of bits we copied into the r4 register to align source.
-- * r11 = 32 - r10
-- * r12 = Index counter for each size, so we determine how many times
-- * the given size will go into r2, then count down that # of
-- * times in r12.
-- */
-- sub r2, r2, r10
-- lsl r10, r10, #0x3
-- rsb r11, r10, #32
--
-- cmp r2, r3
-- movle r12, r2
-- movgt r12, r3
-- cmp r12, #256
-- bge neon_b2f_copy_64_u
-- cmp r12, #64
-- bge neon_b2f_copy_32_u
-- cmp r12, #8
-- bge neon_b2f_copy_8_u
-- cmp r12, #4
-- bge neon_b2f_copy_4_u
-- b neon_b2f_last_bits_u
--neon_b2f_copy_64_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4,q5}
-- vstmdb sp!, {q6-q8}
--#else
-- vpush {q4,q5}
-- vpush {q6-q8}
--#endif
-- add r7, r11, #32
-- movs r12, r2, lsr #6
-- vdup.u32 q8, r7
--neon_b2f_copy_64_u_loop:
-- sub r1, r1, #64
-- sub r0, r0, #64
-- vld1.32 {q0, q1}, [r1]!
-- vld1.32 {q2, q3}, [r1]
-- sub r1, r1, #32
-- vmov q4, q0
-- vmov q5, q1
-- vmov q6, q2
-- vmov q7, q3
-- vmov r5, s0
-- mov r4, r4, lsl r11
-- lsls r6, r10, #28
-- bcc neon_b2f_copy_64_u_b8
-- bpl neon_b2f_copy_64_u_b16
-- vshr.u64 q0, q0, #24
-- vshr.u64 q1, q1, #24
-- vshr.u64 q2, q2, #24
-- vshr.u64 q3, q3, #24
-- b neon_b2f_copy_64_unify
--neon_b2f_copy_64_u_b8:
-- vshr.u64 q0, q0, #8
-- vshr.u64 q1, q1, #8
-- vshr.u64 q2, q2, #8
-- vshr.u64 q3, q3, #8
-- b neon_b2f_copy_64_unify
--neon_b2f_copy_64_u_b16:
-- vshr.u64 q0, q0, #16
-- vshr.u64 q1, q1, #16
-- vshr.u64 q2, q2, #16
-- vshr.u64 q3, q3, #16
--neon_b2f_copy_64_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q4, q8, q4
-- vshl.u64 q5, q8, q5
-- vshl.u64 q6, q8, q6
-- vshl.u64 q7, q8, q7
--#else
-- vshl.u64 q4, q4, q8
-- vshl.u64 q5, q5, q8
-- vshl.u64 q6, q6, q8
-- vshl.u64 q7, q7, q8
--#endif
-- vmov s17, r4
-- vorr d7, d7, d8
-- vorr d6, d6, d15
-- vorr d5, d5, d14
-- vorr d4, d4, d13
-- vorr d3, d3, d12
-- vorr d2, d2, d11
-- vorr d1, d1, d10
-- vorr d0, d0, d9
-- mov r4, r5, lsl r11
-- subs r12, r12, #1
-- lsr r4, r4, r11
-- vst1.32 {q0, q1}, [r0]!
-- vst1.32 {q2, q3}, [r0]
-- pld [r1, #0]
-- sub r0, r0, #32
-- bne neon_b2f_copy_64_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q6-q8}
-- vldmia sp!, {q4,q5}
--#else
-- vpop {q6-q8}
-- vpop {q4,q5}
--#endif
-- ands r2, r2, #0x3f
-- cmp r2, #32
-- bge neon_b2f_copy_32_u
-- b neon_b2f_copy_finish_u
--neon_b2f_copy_32_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vstmdb sp!, {q4}
--#else
-- vpush {q4}
--#endif
-- add r7, r11, #32
-- movs r12, r2, lsr #5
-- vdup.u32 q4, r7
-- and r2, r2, #0x1f
--neon_b2f_copy_32_u_loop:
-- sub r1, r1, #32
-- sub r0, r0, #32
-- vld1.32 {q0, q1}, [r1]
-- vmov q2, q0
-- vmov q3, q1
-- vmov r5, s0
-- mov r4, r4, lsl r11
-- lsls r6, r10, #28
-- bcc neon_b2f_copy_32_u_b8
-- bpl neon_b2f_copy_32_u_b16
-- vshr.u64 q0, q0, #24
-- vshr.u64 q1, q1, #24
-- b neon_b2f_copy_32_unify
--neon_b2f_copy_32_u_b8:
-- vshr.u64 q0, q0, #8
-- vshr.u64 q1, q1, #8
-- b neon_b2f_copy_32_unify
--neon_b2f_copy_32_u_b16:
-- vshr.u64 q0, q0, #16
-- vshr.u64 q1, q1, #16
--neon_b2f_copy_32_unify:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vshl.u64 q2, q4, q2
-- vshl.u64 q3, q4, q3
--#else
-- vshl.u64 q2, q2, q4
-- vshl.u64 q3, q3, q4
--#endif
-- vmov s9, r4
-- vorr d3, d3, d4
-- vorr d2, d2, d7
-- vorr d1, d1, d6
-- vorr d0, d0, d5
-- mov r4, r5, lsl r11
-- subs r12, r12, #1
-- lsr r4, r4, r11
-- vst1.32 {q0, q1}, [r0]
-- pld [r1, #0]
-- bne neon_b2f_copy_32_u_loop
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- vldmia sp!, {q4}
--#else
-- vpop {q4}
--#endif
--neon_b2f_copy_finish_u:
--neon_b2f_copy_8_u:
-- movs r12, r2, lsr #0x3
-- beq neon_b2f_copy_4_u
-- mov r5, r4, lsl r11
--neon_b2f_copy_8_u_loop:
-- ldmdb r1!, {r6-r7}
-- subs r12, r12, #1
-- orr r5, r5, r7, lsr r10
-- mov r4, r7, lsl r11
-- orr r4, r4, r6, lsr r10
-- stmdb r0!, {r4-r5}
-- mov r4, r6, lsl r11
-- lsr r4, r4, r11
-- mov r5, r4, lsl r11
-- bne neon_b2f_copy_8_u_loop
-- ands r2, r2, #0x7
--neon_b2f_copy_4_u:
-- movs r12, r2, lsr #0x2
-- beq neon_b2f_last_bits_u
-- mov r5, r4, lsl r11
--neon_b2f_copy_4_u_loop:
-- ldr r6, [r1, #-4]!
-- subs r12, r12, #1
-- orr r5, r5, r6, lsr r10
-- str r5, [r0, #-4]!
-- mov r4, r6, lsl r11
-- lsr r4, r4, r11
-- mov r5, r4, lsl r11
-- bne neon_b2f_copy_4_u_loop
-- and r2, r2, #0x3
--neon_b2f_last_bits_u:
--neon_b2f_last_bits_u_loop:
-- subs r10, r10, #8
-- mov r5, r4, lsr r10
-- strb r5, [r0, #-1]!
-- bne neon_b2f_last_bits_u_loop
--neon_b2f_copy_1_u:
-- cmp r2, #0
-- beq neon_b2f_finish_u
--neon_b2f_copy_1_u_loop:
-- ldrb r12, [r1, #-1]!
-- subs r2, r2, #1
-- strb r12, [r0, #-1]!
-- bne neon_b2f_copy_1_u_loop
--neon_b2f_finish_u:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r6-r11}
--#else
-- pop {r6-r11}
--#endif
--
--neon_b2f_finish:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r4-r5}
--#else
-- pop {r4-r5}
--#endif
--
--neon_memmove_done:
--#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-- ldmia sp!, {r0}
--#else
-- pop {r0}
--#endif
-- bx lr
--
-- .endfunc
-- .end
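
Note: the neon_memmove.S hunks end here. The label groups visible above (neon_f2b_* for the front-to-back path, neon_b2f_* and neon_back_to_front_copy for the pre-decrementing back-to-front path, with 128-byte NEON block loops) implement the usual overlap-aware copy that memmove requires. A minimal C sketch of that direction split follows, for readers without the deleted file; the function name is illustrative and this byte-wise fallback is not the routine the driver shipped.

/* Direction-aware copy: the overlap test implied by the deleted
 * assembly's front-to-back and back-to-front paths. Byte-wise
 * sketch only; the NEON version moves 128-byte blocks per loop. */
#include <stddef.h>
#include <stdint.h>

static void *fallback_memmove(void *dest, const void *src, size_t n)
{
    uint8_t *d = dest;
    const uint8_t *s = src;

    if (d == s || n == 0)
        return dest;
    if (d < s) {
        /* Destination starts below the source: copy forward so each
         * byte is read before it can be overwritten. */
        while (n--)
            *d++ = *s++;
    } else {
        /* Destination starts above the source: copy backward,
         * pre-decrementing as the assembly does. */
        d += n;
        s += n;
        while (n--)
            *--d = *--s;
    }
    return dest;
}
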
-diff --git git/src/neon_memsets.c git/src/neon_memsets.c
-deleted file mode 100755
-index 740fc1e..0000000
---- git/src/neon_memsets.c
-+++ /dev/null
-@@ -1,169 +0,0 @@
--/* neon_memsets.c
-- *
-- * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
-- *
-- * Redistribution and use in source and binary forms, with or without
-- * modification, are permitted provided that the following conditions are met:
-- * * Redistributions of source code must retain the above copyright
-- * notice, this list of conditions and the following disclaimer.
-- * * Redistributions in binary form must reproduce the above copyright
-- * notice, this list of conditions and the following disclaimer in the
-- * documentation and/or other materials provided with the distribution.
-- * * Neither the name of Code Aurora nor
-- * the names of its contributors may be used to endorse or promote
-- * products derived from this software without specific prior written
-- * permission.
-- *
-- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-- * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-- * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-- */
--
--#include "msm-swblits.h"
--
--void memset16(uint16_t dst[], uint16_t value, int count)
--{
-- if (count <= 0)
-- return;
--
-- asm volatile(
-- " pld [%[dst], #0] \n"
-- " cmp %[count], #4 \n"
-- " blt 6f \n"
-- " tst %[dst], #0x3 \n"
-- " strneh %[value], [%[dst]], #2 \n"
-- " subne %[count], %[count], #1 \n"
-- " vdup.u16 q8, %[value] \n"
-- " vmov q9, q8 \n"
-- " cmp %[count], #64 \n"
-- " bge 0f \n"
-- " cmp %[count], #32 \n"
-- " bge 2f \n"
-- " cmp %[count], #16 \n"
-- " bge 3f \n"
-- " cmp %[count], #8 \n"
-- " bge 4f \n"
-- " b 5f \n"
-- "0: \n"
-- " mov r12, %[count], lsr #6 \n"
-- "1: \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " subs r12, r12, #1 \n"
-- " bne 1b \n"
-- " ands %[count], %[count], #0x3f \n"
-- " beq 7f \n"
-- "2: \n"
-- " cmp %[count], #32 \n"
-- " blt 3f \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " subs %[count], %[count], #32 \n"
-- " beq 7f \n"
-- "3: \n"
-- " cmp %[count], #16 \n"
-- " blt 4f \n"
-- " vst1.16 {q8, q9}, [%[dst]]! \n"
-- " subs %[count], %[count], #16 \n"
-- " beq 7f \n"
-- "4: \n"
-- " cmp %[count], #8 \n"
-- " blt 5f \n"
-- " vst1.16 {q8}, [%[dst]]! \n"
-- " subs %[count], %[count], #8 \n"
-- " beq 7f \n"
-- "5: \n"
-- " cmp %[count], #4 \n"
-- " blt 6f \n"
-- " vst1.16 {d16}, [%[dst]]! \n"
-- " subs %[count], %[count], #4 \n"
-- " beq 7f \n"
-- "6: \n"
-- " cmp %[count], #0 \n"
-- " blt 7f \n"
-- " lsls %[count], #31 \n"
-- " strmih %[value], [%[dst]], #2 \n"
-- " strcsh %[value], [%[dst]], #2 \n"
-- " strcsh %[value], [%[dst]], #2 \n"
-- "7: \n"
-- // Clobbered input registers
-- : [dst] "+r" (dst), [count] "+r" (count)
-- // Unclobbered input
-- : [value] "r" (value)
-- // Clobbered registers
-- : "q8", "q9", "r12", "cc", "memory"
-- );
--}
--
--void memset32(uint32_t dst[], uint32_t value, int count)
--{
-- asm volatile(
-- " pld [%[dst], #0] \n"
-- " cmp %[count], #4 \n"
-- " blt 5f \n"
-- " vdup.u32 q8, %[value] \n"
-- " vmov q9, q8 \n"
-- " cmp %[count], #32 \n"
-- " bge 0f \n"
-- " cmp %[count], #16 \n"
-- " bge 2f \n"
-- " cmp %[count], #8 \n"
-- " bge 3f \n"
-- " b 4f \n"
-- "0: \n"
-- " mov r12, %[count], lsr #5 \n"
-- "1: \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " pld [%[dst], #0] \n"
-- " subs r12, r12, #1 \n"
-- " bne 1b \n"
-- " ands %[count], %[count], #0x1f \n"
-- " beq 6f \n"
-- "2: \n"
-- " cmp %[count], #16 \n"
-- " blt 3f \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " subs %[count], %[count], #16 \n"
-- " beq 6f \n"
-- "3: \n"
-- " cmp %[count], #8 \n"
-- " blt 4f \n"
-- " vst1.32 {q8, q9}, [%[dst]]! \n"
-- " subs %[count], %[count], #8 \n"
-- " beq 6f \n"
-- "4: \n"
-- " cmp %[count], #4 \n"
-- " blt 5f \n"
-- " vst1.32 {q8}, [%[dst]]! \n"
-- " subs %[count], %[count], #4 \n"
-- " beq 6f \n"
-- "5: \n"
-- " cmp %[count], #0 \n"
-- " beq 6f \n"
-- " lsls %[count], #31 \n"
-- " strmi %[value], [%[dst]], #4 \n"
-- " strcs %[value], [%[dst]], #4 \n"
-- " strcs %[value], [%[dst]], #4 \n"
-- "6: @end \n"
-- // Clobbered input registers
-- : [dst] "+r" (dst), [count] "+r" (count)
-- // Unclobbered input
-- : [value] "r" (value)
-- // Clobbered registers
-- : "q8", "q9", "r12", "cc", "memory"
-- );
--}
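
End of the no_neon.patch removal. The memset16()/memset32() bodies deleted just above are NEON inline assembly that fill 16-bit and 32-bit pixel runs with wide vector stores. Functionally they reduce to the loops below; this is a plain-C sketch for reference, not what the driver built, and it omits the alignment and small-count special cases the assembly handles.

/* Plain-C equivalents of the deleted NEON memset16()/memset32().
 * Illustrative only: no vectorization, no alignment handling. */
#include <stdint.h>

void memset16(uint16_t *dst, uint16_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;
}

void memset32(uint32_t *dst, uint32_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;
}
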
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch
deleted file mode 100644
index 97ad380e27..0000000000
--- a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch
+++ /dev/null
@@ -1,36 +0,0 @@
-commit 18515a56822fcd9c0a71240edce97ea5623b0448
-Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
-Date: Wed Feb 10 16:29:55 2010 +0100
-
- Modify Makefile.am
- Removed dependencies for neon
-
-diff --git git/src/Makefile.am git/src/Makefile.am
-index 8ab1856..08da5a5 100755
---- a/src/Makefile.am
-+++ b/src/Makefile.am
-@@ -12,13 +12,7 @@ MSM_DRI_SRCS += msm-drm.c msm-dri2.c
- msm_drv_la_LIBADD += $(DRI2_LIBS)
- endif
-
--NEON_CFLAGS=-march=armv7-a -mfpu=neon -mfloat-abi=softfp
--NEON_CCASFLAGS=$(NEON_CFLAGS) -mthumb-interwork
--NEON_ASFLAGS=-k -mcpu=cortex-a8 $(NEON_CCASFLAGS)
--
--AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ $(NEON_CFLAGS) -Wall -Werror
--AM_ASFLAGS = $(NEON_ASFLAGS)
--AM_CCASFLAGS = $(NEON_CCASFLAGS)
-+AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ -Wall -Werror
-
- msm_drv_la_LTLIBRARIES = msm_drv.la
- msm_drv_la_LDFLAGS = -module -avoid-version
-@@ -37,9 +31,6 @@ msm_drv_la_SOURCES = \
- msm-swfill.c \
- msm-hwrender.c \
- msm-pixmap.c \
-- neon_memsets.c \
-- neon_memcpy.S \
-- neon_memmove.S \
- $(MSM_DRI_SRCS)
-
-
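
no_neon_flags.patch above only strips the hard-coded -march=armv7-a/-mfpu=neon build flags from src/Makefile.am. Code compiled with those flags unconditionally will still fault at runtime on hardware or kernels without NEON support, which is why dropping them outright is the simplest fix; an alternative approach (not what this recipe does) is to keep the NEON objects and gate them behind a runtime probe. A minimal sketch of such a probe, assuming an ARM /proc/cpuinfo with a Features line; the helper name is made up for illustration.

/* Hypothetical runtime check: returns 1 if /proc/cpuinfo advertises
 * "neon" in its Features line, 0 otherwise. Not part of xf86-video-msm. */
#include <stdio.h>
#include <string.h>

static int cpu_has_neon(void)
{
    char line[512];
    FILE *f = fopen("/proc/cpuinfo", "r");
    int found = 0;

    if (!f)
        return 0;
    while (fgets(line, sizeof(line), f)) {
        /* ARM cpuinfo lists CPU extensions on a space-separated
         * "Features" line; look for a standalone "neon" entry. */
        if (strncmp(line, "Features", 8) == 0 && strstr(line, " neon")) {
            found = 1;
            break;
        }
    }
    fclose(f);
    return found;
}
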
diff --git a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch
deleted file mode 100644
index 90dd31f605..0000000000
--- a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch
+++ /dev/null
@@ -1,116 +0,0 @@
-commit cc83ba5835d5b55347fd0c0775156493b0cf3a15
-Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
-Date: Thu Feb 11 16:26:52 2010 +0100
-
- Renaming variables for getting Xorg (xf86-video-msm) to work
- under linux-leviathan (htcdream):
- cd src
- sed 's/fixed_info/fix/' -i *.h
- sed 's/fixed_info/fix/' -i *.c
-
-diff --git git/src/msm-dri.c git/src/msm-dri.c
-index a51d3bd..a74368b 100644
---- git/src/msm-dri.c
-+++ git/src/msm-dri.c
-@@ -151,10 +151,10 @@ MSMDRIScreenInit(ScreenPtr pScreen)
- pDRIInfo->ddxDriverMinorVersion = 0;
- pDRIInfo->ddxDriverPatchVersion = 0;
-
-- pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fixed_info.smem_start;
-+ pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fix.smem_start;
-
-- pDRIInfo->frameBufferSize = pMsm->fixed_info.smem_len;
-- pDRIInfo->frameBufferStride = pMsm->fixed_info.line_length;
-+ pDRIInfo->frameBufferSize = pMsm->fix.smem_len;
-+ pDRIInfo->frameBufferStride = pMsm->fix.line_length;
-
- /* FIXME: How many drawables can we do (should we do)? */
-
-diff --git git/src/msm-driver.c git/src/msm-driver.c
-index 803197f..15378f8 100755
---- git/src/msm-driver.c
-+++ git/src/msm-driver.c
-@@ -399,7 +399,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
-
- /* Get the fixed info (par) structure */
-
-- if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fixed_info)) {
-+ if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fix)) {
- xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
- "Unable to read hardware info from %s: %s\n",
- dev, strerror(errno));
-@@ -410,7 +410,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
- /* Parse the ID and figure out what version of the MDP and what
- * panel ID we have */
-
-- if (sscanf(pMsm->fixed_info.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
-+ if (sscanf(pMsm->fix.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
-
- xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
- "Unable to determine the MDP and panel type\n");
-@@ -435,7 +435,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
- * the fbdev driver to allocate memory. In the mean time, we
- * just reuse the framebuffer memory */
-
-- pScrn->videoRam = pMsm->fixed_info.smem_len;
-+ pScrn->videoRam = pMsm->fix.smem_len;
-
- /* Get the current screen setting */
- if (ioctl(pMsm->fd, FBIOGET_VSCREENINFO, &pMsm->mode_info)) {
-@@ -671,8 +671,8 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
- /* The framebuffer driver should always report the line length,
- * but in case it doesn't, we can calculate it ourselves */
-
-- if (pMsm->fixed_info.line_length) {
-- pScrn->displayWidth = pMsm->fixed_info.line_length;
-+ if (pMsm->fix.line_length) {
-+ pScrn->displayWidth = pMsm->fix.line_length;
- } else {
- pScrn->displayWidth = pMsm->mode_info.xres_virtual *
- pMsm->mode_info.bits_per_pixel / 8;
-@@ -811,7 +811,7 @@ MSMCloseScreen(int scrnIndex, ScreenPtr pScreen)
- #endif
-
- /* Unmap the framebuffer memory */
-- munmap(pMsm->fbmem, pMsm->fixed_info.smem_len);
-+ munmap(pMsm->fbmem, pMsm->fix.smem_len);
-
- pScreen->CloseScreen = pMsm->CloseScreen;
-
-@@ -857,7 +857,7 @@ MSMScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
- #endif // defined (MSMFB_GET_PAGE_PROTECTION) && defined (MSMFB_SET_PAGE_PROTECTION)
-
- /* Map the framebuffer memory */
-- pMsm->fbmem = mmap(NULL, pMsm->fixed_info.smem_len,
-+ pMsm->fbmem = mmap(NULL, pMsm->fix.smem_len,
- PROT_READ | PROT_WRITE, MAP_SHARED, pMsm->fd, 0);
-
- /* If we can't map the memory, then this is a short trip */
-diff --git git/src/msm-exa.c git/src/msm-exa.c
-index 301923f..ce16a93 100755
---- git/src/msm-exa.c
-+++ git/src/msm-exa.c
-@@ -740,8 +740,8 @@ MSMSetupExa(ScreenPtr pScreen)
- pExa->flags = EXA_OFFSCREEN_PIXMAPS;
-
- pExa->offScreenBase =
-- (pMsm->fixed_info.line_length * pMsm->mode_info.yres);
-- pExa->memorySize = pMsm->fixed_info.smem_len;
-+ (pMsm->fix.line_length * pMsm->mode_info.yres);
-+ pExa->memorySize = pMsm->fix.smem_len;
-
- /* Align pixmap offsets along page boundaries */
- pExa->pixmapOffsetAlign = 4096;
-diff --git git/src/msm.h git/src/msm.h
-index e1e2bc7..520d390 100755
---- git/src/msm.h
-+++ git/src/msm.h
-@@ -85,7 +85,7 @@ typedef struct _MSMRec
- int fd;
-
- /* Fixed and var strutures from the framebuffer */
-- struct fb_fix_screeninfo fixed_info;
-+ struct fb_fix_screeninfo fix;
- struct fb_var_screeninfo mode_info;
-
- /* Pointer to the mapped framebuffer memory */