commit d8910bf773fbecf7cdea359d4b530a3672e27180
Author: David Lanzendörfer
Date:   Wed Feb 10 16:18:39 2010 +0100

    Removed neon because it's not available in our kernel and so it's causing trouble (Illegal instruction)

diff --git git/src/msm-swblits.h git/src/msm-swblits.h
index f89f00e..a40b24b 100755
--- git/src/msm-swblits.h
+++ git/src/msm-swblits.h
@@ -38,16 +38,6 @@
 #include
 #include
 
-/* Neon intrinsics are part of the ARM or GCC compiler used. */
-/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */
-#include <arm_neon.h>
-
-/* These are NEON-optimized functions linked to by various tests. */
-extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes);
-extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes);
-extern void memset16(uint16_t *dst, uint16_t value, int count);
-extern void memset32(uint32_t *dst, uint32_t value, int count);
-
 /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */
 #define BITS_PER_BYTE (8)
 #define BYTES_PER_16BPP_PIXEL (2)
diff --git git/src/msm-swfill.c git/src/msm-swfill.c
index 108fd94..3dd1ef2 100755
--- git/src/msm-swfill.c
+++ git/src/msm-swfill.c
@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou
     }
 }
 
-
+/*
 static inline void
 memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
 {
@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
     // Quickly fill remaining pixels (up to 7).
     memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
 }
-
+*/
 
 static inline void
 memset16_Test(uint16_t *dst, uint16_t src, int count)
@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count)
 
         // Copy remaining pixels using Neon and non-Neon instructions.
         // NOTE: This assumes that dst is aligned optimally for Neon instructions.
-        memset16_AssumesNeonAlignment((void *) dst, src, count);
+        //memset16_AssumesNeonAlignment((void *) dst, src, count);
+        memset((void *) dst, src, count);
     }
 }
@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp
     if (w < 32) {
         // For narrow rectangles, block signals only once for the entire rectangles.
         BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-        DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
+        //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
+        DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
        UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
     } else {
         // For wider rectangles, block and unblock signals for every row.
- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } } diff --git git/src/msm-swrender.c git/src/msm-swrender.c index a7a9abc..835dc03 100755 --- git/src/msm-swrender.c +++ git/src/msm-swrender.c @@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src } } break; - case 7: if (xdir >= 0) { - swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); - swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); - return TRUE; - } else { - swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); - swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); - return TRUE; - } - break; - case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { - uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); - return TRUE; - } - else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { - uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); - uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); - *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; - *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; - return TRUE; - } - else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { - uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); - uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); - uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); - uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); - *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; - *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; - *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; - *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; - return TRUE; - } - else { - uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); - uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T); - uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T); - uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T); - uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T); - uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T); - uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T); - uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T); - *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; - *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2; - *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3; - *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4; - *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5; - *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6; - *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7; - *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8; - return TRUE; - } - break; - case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { - uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); - vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); - return TRUE; - } - else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { - uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); - 
uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); - uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); - uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); - *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; - *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; - *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; - *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; - return TRUE; - } - else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { - uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); - uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); - uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); - uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); - uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T); - uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T); - uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T); - uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T); - *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; - *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; - *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; - *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; - *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5; - *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6; - *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7; - *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8; - return TRUE; - } - else { - // Don't bother unrolling loops here, since that won't help for more than around 8 operations. - // Instead, just call multiple fixed functions. - if (xdir >= 0) { - swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); - swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); - } else { - swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); - swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); - } - return TRUE; - } - break; - case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { - uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); - vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); - vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); - vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); - return TRUE; - } - else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { - uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); - uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); - uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); - uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); - uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T); - uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T); - uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T); - uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T); - *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; - *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; - *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; - *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; - *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5; - *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6; - *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7; - *(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8; - return 
TRUE; - } - break; - case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { - uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); - uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T)); - uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T)); - uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T)); - uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); - vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); - vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); - vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); - vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5); - vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6); - vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7); - vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8); - return TRUE; - } - break; } return FALSE; @@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr } return TRUE; break; - case 7: if (xdir >= 0) { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); - } - return TRUE; - break; - case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); - vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); - uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); - *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; - *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; - *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; - *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); - uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); - *(uint32_t *) 
(dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; - *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; - *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; - *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; - *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; - *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; - *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; - *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { - uint16_t src1a = *(uint16_t *) (src+0*spitch+0); - uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - uint16_t src1b = *(uint16_t *) (src+1*spitch+0); - uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - *(uint16_t *) (dst+0*dpitch+0) = src1a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; - *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; - *(uint16_t *) (dst+1*dpitch+0) = src1b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; - *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; - return TRUE; - } - else { - uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); - uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); - uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); - uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); - uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); - uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); - uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); - uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); - uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); - uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); - uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); - uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); - uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); - uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); - uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); - uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); - *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; - *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; - *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; - *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; - *(uint16_t *) 
(dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; - *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; - *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; - *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; - *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; - *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; - *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; - *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; - *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; - *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; - *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; - *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; - return TRUE; - } - break; - case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); - vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); - vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); - vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); - uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); - uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); - uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); - uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); - uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); - *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; - *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; - *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; - *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; - *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; - *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; - *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; - *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) { - uint32_t src1a = *(uint32_t *) (src+0*spitch+0); - uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); - uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); - uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); - uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); - uint32_t src1b = *(uint32_t *) (src+1*spitch+0); - uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); - uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); - uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); - uint32_t src5b = *(uint32_t *) 
(src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); - *(uint32_t *) (dst+0*dpitch+0) = src1a; - *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; - *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; - *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; - *(uint32_t *) (dst+1*dpitch+0) = src1b; - *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; - *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; - *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); - uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); - uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); - uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); - uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); - uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); - uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); - uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); - uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); - uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); - *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; - *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; - *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; - *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; - *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; - *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; - *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; - *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; - *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; - *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; - *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; - *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; - *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; - *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; - *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; - *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; - return TRUE; - } - else { - // Don't bother unrolling loops, since that won't help for more than around 8 operations. - // Instead, just call multiple fixed functions. 
- if (xdir >= 0) { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); - } - return TRUE; - } - break; - case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); - uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); - vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); - vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); - vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); - vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); - vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); - vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); - vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { - if (xdir >= 0) { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); - } - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); - uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); - uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); - uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T); - uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T); - uint64_t src7a = *(uint64_t *) 
(src+0*spitch+6*BYTES_PER_UINT64_T); - uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T); - uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); - uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); - uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); - uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T); - uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T); - uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T); - uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T); - *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; - *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; - *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; - *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; - *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a; - *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a; - *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a; - *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a; - *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; - *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; - *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; - *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; - *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b; - *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b; - *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b; - *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b; - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { - if (xdir >= 0) { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); - } - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { - if (xdir >= 0) { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 
(1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); - } - return TRUE; - } - else { - if (xdir >= 0) { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - } - return TRUE; - } - break; - case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); - uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T)); - uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T)); - uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T)); - uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T)); - uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); - uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T)); - uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T)); - uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T)); - uint32x4_t src8b = vld1q_u32((uint32_t 
*)(src+1*spitch+7*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); - vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); - vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); - vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); - vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a); - vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a); - vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a); - vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a); - vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); - vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); - vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); - vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); - vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b); - vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b); - vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b); - vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b); - return TRUE; - }//HERE - else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { - if (xdir >= 0) { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); - } - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { - if (xdir >= 0) { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + 
(4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); - } - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { - if (xdir >= 0) { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + 
(4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); - } - return TRUE; - } - break; } return FALSE; @@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr } return TRUE; break; - case 7: if (xdir >= 0) { - swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); - } - return TRUE; - break; - // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons? - // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases... - case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); - vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); - vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); - vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); - uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); - uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T); - uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T); - *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; - *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; - *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; - *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; - *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; - *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c; - *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; - *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d; - return TRUE; - } - else if 
(SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); - uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); - uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T); - uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T); - *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; - *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; - *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; - *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; - *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; - *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; - *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; - *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; - *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; - *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; - *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c; - *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c; - *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; - *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; - *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d; - *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d; - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { - uint16_t src1a = *(uint16_t *) (src+0*spitch+0); - uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - uint16_t src1b = *(uint16_t *) (src+1*spitch+0); - uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - uint16_t src1c = *(uint16_t *) (src+2*spitch+0); - uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - uint16_t src1d = *(uint16_t 
*) (src+3*spitch+0); - uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - *(uint16_t *) (dst+0*dpitch+0) = src1a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; - *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; - *(uint16_t *) (dst+1*dpitch+0) = src1b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; - *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; - *(uint16_t *) (dst+2*dpitch+0) = src1c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c; - *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c; - *(uint16_t *) (dst+3*dpitch+0) = src1d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d; - *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d; - return TRUE; - } - else { - uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); - uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); - uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); - uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); - uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); - uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); - uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); - uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); - uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); - uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); - uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); - uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); - uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); - uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); - uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); - uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); - uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T); - uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T); - uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T); - uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T); - uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T); - uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T); - uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T); - uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T); - uint16_t src1d = 
*(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T); - uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T); - uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T); - uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T); - uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T); - uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T); - uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T); - uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T); - *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; - *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; - *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; - *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; - *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; - *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; - *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; - *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; - *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; - *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; - *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; - *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; - *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; - *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; - *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; - *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; - *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c; - *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c; - *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c; - *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T) = src4c; - *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T) = src5c; - *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T) = src6c; - *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T) = src7c; - *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T) = src8c; - *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d; - *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d; - *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d; - *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T) = src4d; - *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T) = src5d; - *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T) = src6d; - *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T) = src7d; - *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T) = src8d; - return TRUE; - } - break; - case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); - vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); - vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); - vst1q_u32((uint32_t 
*)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); - vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); - vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c); - vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); - vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d); - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); - uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); - uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); - uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); - uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); - uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); - uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T); - uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T); - uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T); - uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); - uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T); - uint64_t src3d = *(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T); - uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T); - *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; - *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; - *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; - *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; - *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; - *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; - *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; - *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; - *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; - *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c; - *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T) = src3c; - *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T) = src4c; - *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; - *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d; - *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T) = src3d; - *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T) = src4d; - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { - uint32_t src1a = *(uint32_t *) (src+0*spitch+0); - uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); - uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); - uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); - uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); - uint32_t src1b = *(uint32_t *) (src+1*spitch+0); - uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); - uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); - uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); - uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); - uint32_t src1c = *(uint32_t *) (src+2*spitch+0); - uint64_t 
src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); - uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); - uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); - uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); - uint32_t src1d = *(uint32_t *) (src+3*spitch+0); - uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); - uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); - uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); - uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); - *(uint32_t *) (dst+0*dpitch+0) = src1a; - *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; - *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; - *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; - *(uint32_t *) (dst+1*dpitch+0) = src1b; - *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; - *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; - *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; - *(uint32_t *) (dst+2*dpitch+0) = src1c; - *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2c; - *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3c; - *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5c; - *(uint32_t *) (dst+3*dpitch+0) = src1d; - *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2d; - *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3d; - *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5d; - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); - uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); - uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); - uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); - uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); - uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); - uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); - uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); - uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); - uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); - uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2c = *(uint32_t *) 
(src+2*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T); - uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T); - uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T); - uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T); - uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T); - uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); - uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); - uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T); - uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T); - uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T); - uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T); - uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T); - uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T); - *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; - *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; - *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; - *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; - *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; - *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; - *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; - *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; - *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; - *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; - *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; - *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; - *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; - *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; - *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; - *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; - *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; - *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; - *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c; - *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c; - *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T) = src5c; - *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T) = src6c; - *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T) = src7c; - *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T) = src8c; - *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; - *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; - *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d; - *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d; - *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T) = src5d; - *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T) = src6d; - *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T) = src7d; - *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T) = src8d; - return TRUE; - } - else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { - uint16_t src1a = *(uint16_t *) (src+0*spitch+0); - uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - uint32_t src6a = *(uint32_t *) 
(src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); - uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); - uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); - uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); - uint16_t src1b = *(uint16_t *) (src+1*spitch+0); - uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); - uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); - uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); - uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); - uint16_t src1c = *(uint16_t *) (src+2*spitch+0); - uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); - uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); - uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); - uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); - uint16_t src1d = *(uint16_t *) (src+3*spitch+0); - uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); - uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); - uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); - uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); - uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); - uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); - uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); - uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); - *(uint16_t *) (dst+0*dpitch+0) = src1a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7a; - *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8a; - *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9a; - *(uint16_t *) (dst+1*dpitch+0) = src1b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; - *(uint32_t *) 
(dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7b; - *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8b; - *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9b; - *(uint16_t *) (dst+2*dpitch+0) = src1c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7c; - *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8c; - *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9c; - *(uint16_t *) (dst+3*dpitch+0) = src1d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7d; - *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8d; - *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9d; - return TRUE; - } - else { - // Don't bother unrolling loops, since that won't help for more than around 8 operations. - // Instead, just call multiple fixed functions. - if (xdir >= 0) { - swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - } else { - swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); - swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); - } - return TRUE; - } - break; - // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons? - // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here... 
- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { - uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); - uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); - uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T)); - uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); - uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T)); - uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T)); - uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T)); - vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); - vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); - vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); - vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); - vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); - vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); - vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); - vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); - vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); - vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c); - vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c); - vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c); - vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); - vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d); - vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d); - vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d); - return TRUE; - } - break; - } + } return FALSE; } @@ -1924,10 +872,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int if (rowsOverlap) { if (w > 64) { - DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } else if (w == 64) { - DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, 
UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } else { DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP); @@ -1936,10 +886,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int else { if (w > 64) { - DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } else if (w == 64) { - DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } else { DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP); @@ -1973,7 +925,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i if (xdir >= 0 || !rowsOverlap) { if (w >= 128) { BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); - neon_memcpy(dst, src, w); + //neon_memcpy(dst, src, w); + memcpy(dst, src, w); UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); } else @@ -1982,7 +935,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i else { if (w >= 128) { BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); - neon_memmove(dst, src, w); + //neon_memmove(dst, src, w); + memmove(dst, src, w); UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); } else @@ -2029,7 +983,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, if (xdir >= 0 || !rowsOverlap) { if (w >= 42) { BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); - neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); + //neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); + memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); } else @@ -2038,7 +993,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, else { if (w >= 42) { BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); - neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); + //neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); + memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); } else diff --git git/src/neon_memcpy.S git/src/neon_memcpy.S deleted file mode 100644 index 5ecc5ce..0000000 --- git/src/neon_memcpy.S +++ /dev/null @@ -1,549 +0,0 @@ -/*************************************************************************** - Copyright (c) 2009, Code Aurora Forum. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of Code Aurora nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. - ***************************************************************************/ - -/*************************************************************************** - Neon memcpy: Attempts to do a memcpy with Neon registers if possible, - Inputs: - dest: The destination buffer - src: The source buffer - n: The size of the buffer to transfer - Outputs: - -***************************************************************************/ - -/* - * General note: - * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP - * However, it looks like the 2006 CodeSourcery Assembler has issues generating - * the correct object code for VPOP, resulting in horrific stack crashes. - * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB, - * and VPOP->VLDMIA. We can revert this back once we update our toolchain. - * - * Also, VSHL swaps the source register and the shift-amount register - * around in 2006-q3. I've coded this incorrectly so it turns out correct - * in the object code, but we'll need to undo that later... - */ - - .code 32 - .align 4 - .globl neon_memcpy - .func - -neon_memcpy: - /* - * First, make sure we're not copying < 4 bytes. If so, we'll - * just handle it here. - */ -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - stmdb sp!, {r0} -#else - push {r0} -#endif - cmp r2, #4 - bgt neon_gt_4 - /* Copy 0-4 bytes, if needed, and return.*/ - cmp r2, #0 -neon_smallcopy_loop: - beq neon_smallcopy_done - ldrb r12, [r1], #1 - subs r2, r2, #1 - strb r12, [r0], #1 - b neon_smallcopy_loop -neon_smallcopy_done: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - ldmia sp!, {r0} -#else - pop {r0} -#endif - bx lr - - /* Copy 4 or more bytes*/ -neon_gt_4: - /* Preload what we can...*/ - pld [r0,#0] - pld [r1,#0] -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - stmdb sp!, {r4-r5} -#else - push {r4-r5} -#endif - -neon_check_align: - /* Check normal word alignment for target. */ - ands r12, r0, #0x3 - beq source_alignment_check - - /* - * Target is not aligned. Step through until we get that - * word-aligned. This works better than a loop, according - * to our pipeline modeler. - */ - cmp r12, #2 - ldrb r3, [r1], #1 - ldrleb r4, [r1], #1 - ldrltb r5, [r1], #1 - rsb r12, r12, #4 - sub r2, r2, r12 - strb r3, [r0], #1 - strleb r4, [r0], #1 - strltb r5, [r0], #1 - -source_alignment_check: - ands r12, r1, #0x3 - bne neon_memcpy_nonaligned /* Source is not word aligned.*/ -neon_try_16_align: - cmp r2, #64 - blt neon_align_route - /* This is where we try 16-byte alignment. 
*/ - ands r12, r0, #0xf - beq neon_align_route - rsb r12, r12, #16 -neon_16_start: - sub r2, r2, r12 - lsrs r3, r12, #2 -neon_align_16_4: - ldr r4, [r1], #4 - subs r3, r3, #1 - str r4, [r0], #4 - bne neon_align_16_4 -neon_align_route: - /* In this case, both source and target are word-aligned. */ - cmp r2, #32768 - bge neon_copy_128p_a - cmp r2, #256 - bge neon_copy_128_a - cmp r2, #64 - bge neon_copy_32_a - b neon_copy_finish_a - nop -neon_copy_128p_a: - /* We'll copy blocks 128-bytes at a time, but try to call pld to - * load in the next page, if possible. - */ -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4-q7} -#else - vpush {q4-q7} -#endif - mov r12, r2, lsr #7 -neon_copy_128p_loop_a: - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - vld1.32 {q4, q5}, [r1]! - vld1.32 {q6, q7}, [r1]! - pld [r1, #0] - pld [r1, #1024] - vst1.32 {q0, q1}, [r0]! - vst1.32 {q2, q3}, [r0]! - vst1.32 {q4, q5}, [r0]! - vst1.32 {q6, q7}, [r0]! - subs r12, r12, #1 - bne neon_copy_128p_loop_a -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q4-q7} -#else - vpop {q4-q7} -#endif - ands r2, r2, #0x7f - beq neon_end - cmp r2, #32 - blt neon_copy_finish_a - b neon_copy_32_a - /* Copy blocks of 128-bytes (word-aligned) at a time*/ -neon_copy_128_a: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4-q7} -#else - vpush {q4-q7} -#endif - /* - * Move to a 1-s based countdown to determine when to loop. That - * allows the subs to set the Z flag without having to explicitly - * call cmp to a value. - */ - mov r12, r2, lsr #7 -neon_copy_128_loop_a: - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - vld1.32 {q4, q5}, [r1]! - vld1.32 {q6, q7}, [r1]! - pld [r1, #0] - pld [r1, #128] - vst1.32 {q0, q1}, [r0]! - vst1.32 {q2, q3}, [r0]! - vst1.32 {q4, q5}, [r0]! - vst1.32 {q6, q7}, [r0]! - subs r12, r12, #1 - pld [r0, #0] - pld [r0, #128] - bne neon_copy_128_loop_a -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q4-q7} -#else - vpop {q4-q7} -#endif - ands r2, r2, #0x7f - beq neon_end - cmp r2, #32 - blt neon_copy_finish_a - /* Copy blocks of 32-bytes (word aligned) at a time*/ -neon_copy_32_a: - mov r12, r2, lsr #5 -neon_copy_32_loop_a: - vld1.32 {q0,q1}, [r1]! - subs r12, r12, #1 - pld [r1,#0] - vst1.32 {q0,q1}, [r0]! - bne neon_copy_32_loop_a - ands r2, r2, #0x1f - beq neon_end -neon_copy_finish_a: -neon_copy_16_a: - movs r12, r2, lsr #4 - beq neon_copy_8_a -neon_copy_16_a_loop: - vld1.32 {q0}, [r1]! - subs r12, r12, #1 - vst1.32 {q0}, [r0]! - bne neon_copy_16_a_loop - ands r2, r2, #0xf - beq neon_end -neon_copy_8_a: - cmp r2, #8 - blt neon_copy_4_a - ldm r1!, {r4-r5} - subs r2, r2, #8 - stm r0!, {r4-r5} - /* Copy 4-bytes of word-aligned data at a time*/ -neon_copy_4_a: - cmp r2, #4 - blt neon_copy_finish - ldr r4, [r1], #4 - subs r2, r2, #4 - str r4, [r0], #4 - b neon_copy_finish - - /* - * Handle unaligned data. The basic concept here is that we'll - * try to pull out enough data from the source to get that word- - * aligned, then do our writes word-aligned, storing the difference - * in a register, and shifting the data as needed. - */ -neon_memcpy_nonaligned: - /* - * If this is <8 bytes, it makes more sense to just copy it - * quickly instead of incurring all kinds of overhead. 
- */ - cmp r2, #8 /* Let's try this...*/ - ble neon_copy_finish - /* - * This is where we'll pull out either 1, 2, or 3 bytes of data - * from the source as needed to align it, then store off those - * bytes in r4. When we read in the (now) aligned data from the - * source, we'll shift the bytes and AND in the r4 data, then write - * to the target aligned. - * - * The conditional ldr calls work slightly faster than the - * previous method, confirmed by our pipeline modeler. - */ -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - stmdb sp!, {r6-r9} -#else - push {r6-r9} -#endif - cmp r12, #2 - ldrb r4, [r1], #1 - ldrleb r5, [r1], #1 - ldrltb r6, [r1], #1 - rsb r8, r12, #4 - sub r2, r2, r8 - lsl r8, r8, #3 - orrle r4, r4, r5, lsl #8 - orrlt r4, r4, r6, lsl #16 - rsb r9, r8, #32 - - cmp r2, #64 - blt neon_unaligned_route - ands r12, r0, #0xf - beq neon_unaligned_route - rsb r12, r12, #16 -neon_16_start_u: - sub r2, r2, r12 - lsrs r6, r12, #2 -neon_align_16_4_u: - ldr r5, [r1], #4 - subs r6, r6, #1 - orr r4, r4, r5, lsl r8 - str r4, [r0], #4 - mov r4, r5, lsr r9 - bne neon_align_16_4_u -neon_unaligned_route: - /* Decide which loop block to branch to.*/ - cmp r2, #256 - bge neon_copy_64_u - cmp r2, #64 - bge neon_copy_32_u - b neon_copy_finish_u - /* Copy data in 64-byte blocks.*/ -neon_copy_64_u: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4} - vstmdb sp!, {q5-q8} -#else - vpush {q4} - vpush {q5-q8} -#endif - /* We'll need this for the q register shift later.*/ - vdup.u32 q8, r8 - /* - * As above, we determine how many times we can go through the - * 64-byte copy loop, then countdown. - */ - mov r12, r2, lsr #6 - and r2, r2, #0x3f -neon_copy_64_u_loop: - /* Load 64-bytes into q4-q7.*/ - vld1.32 {q4, q5}, [r1]! - vld1.32 {q6, q7}, [r1]! - /* - * Shift q0-q3 right so everything but the data we need due to the - * alignment falls off the right-hand side. The branching - * is needed, since vshr requires the shift to be an immediate - * value. - */ - lsls r5, r8, #28 - bcc neon_copy_64_u_b8 - bpl neon_copy_64_u_b16 - vshr.u64 q0, q4, #40 - vshr.u64 q1, q5, #40 - vshr.u64 q2, q6, #40 - vshr.u64 q3, q7, #40 - b neon_copy_64_unify -neon_copy_64_u_b8: - vshr.u64 q0, q4, #56 - vshr.u64 q1, q5, #56 - vshr.u64 q2, q6, #56 - vshr.u64 q3, q7, #56 - b neon_copy_64_unify -neon_copy_64_u_b16: - vshr.u64 q0, q4, #48 - vshr.u64 q1, q5, #48 - vshr.u64 q2, q6, #48 - vshr.u64 q3, q7, #48 -neon_copy_64_unify: - /* - * Shift q4-q7 left by r8 bits to take the alignment into - * account. - */ -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vshl.u64 q4, q8, q4 - vshl.u64 q5, q8, q5 - vshl.u64 q6, q8, q6 - vshl.u64 q7, q8, q7 -#else - vshl.u64 q4, q4, q8 - vshl.u64 q5, q5, q8 - vshl.u64 q6, q6, q8 - vshl.u64 q7, q7, q8 -#endif - /* - * The data in s14 will be needed for the next loop iteration. Move - * that to r5. - */ - vmov r5, s14 - /* We'll vorr the shifted data with the data that needs to move back.*/ - vorr d9, d9, d0 - /* Copy the data from the previous loop into s14.*/ - vmov s14, r4 - vorr d10, d10, d1 - vorr d11, d11, d2 - vorr d12, d12, d3 - vorr d13, d13, d4 - vorr d14, d14, d5 - vorr d15, d15, d6 - vorr d8, d8, d7 - subs r12, r12, #1 - pld [r1, #0] - pld [r1, #128] - /* Save off the r5 data into r4 for the next iteration.*/ - mov r4, r5 - vst1.32 {q4, q5}, [r0]! - vst1.32 {q6, q7}, [r0]! 
- pld [r0, #0] - pld [r0, #128] - bne neon_copy_64_u_loop -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q5-q8} - vldmia sp!, {q4} -#else - vpop {q5-q8} - vpop {q4} -#endif - cmp r2, #32 - bge neon_copy_32_u - b neon_copy_finish_u -neon_copy_32_u: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4} -#else - vpush {q4} -#endif - vdup.u32 q4, r8 - mov r12, r2, lsr #5 - and r2, r2, #0x1f -neon_copy_32_u_loop: - vld1.32 {q0, q1}, [r1]! - lsls r5, r8, #28 - bcc neon_copy_32_u_b8 - bpl neon_copy_32_u_b16 - vshr.u64 q2, q0, #40 - vshr.u64 q3, q1, #40 - b neon_copy_32_unify -neon_copy_32_u_b8: - vshr.u64 q2, q0, #56 - vshr.u64 q3, q1, #56 - b neon_copy_32_unify -neon_copy_32_u_b16: - vshr.u64 q2, q0, #48 - vshr.u64 q3, q1, #48 -neon_copy_32_unify: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vshl.u64 q0, q4, q0 - vshl.u64 q1, q4, q1 -#else - vshl.u64 q0, q0, q4 - vshl.u64 q1, q1, q4 -#endif - vmov r5, s14 - vorr d1, d1, d4 - vmov s14, r4 - vorr d2, d2, d5 - vorr d3, d3, d6 - vorr d0, d0, d7 - subs r12, r12, #1 - pld [r1, #0] - mov r4, r5 - vst1.32 {q0, q1}, [r0]! - bne neon_copy_32_u_loop -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q4} -#else - vpop {q4} -#endif -neon_copy_finish_u: -neon_copy_16_u: - movs r12, r2, lsr #4 - beq neon_copy_8_u - vdup.u32 q2, r8 - and r2, r2, #0xf -neon_copy_16_u_loop: - vld1.32 {q0}, [r1]! - lsls r5, r8, #28 - bcc neon_copy_16_u_b8 - bpl neon_copy_16_u_b16 - vshr.u64 q1, q0, #40 - b neon_copy_16_unify -neon_copy_16_u_b8: - vshr.u64 q1, q0, #56 - b neon_copy_16_unify -neon_copy_16_u_b16: - vshr.u64 q1, q0, #48 -neon_copy_16_unify: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vshl.u64 q0, q2, q0 -#else - vshl.u64 q0, q0, q2 -#endif - vmov r5, s6 - vorr d1, d1, d2 - vmov s6, r4 - vorr d0, d0, d3 - subs r12, r12, #1 - mov r4, r5 - vst1.32 {q0}, [r0]! - bne neon_copy_16_u_loop -neon_copy_8_u: - cmp r2, #8 - blt neon_copy_4_u - ldm r1!, {r6-r7} - subs r2, r2, #8 - orr r4, r4, r6, lsl r8 - mov r5, r6, lsr r9 - orr r5, r5, r7, lsl r8 - stm r0!, {r4-r5} - mov r4, r7, lsr r9 -neon_copy_4_u: - cmp r2, #4 - blt neon_copy_last_bits_u - ldr r5, [r1], #4 - subs r2, r2, #4 - orr r4, r4, r5, lsl r8 - str r4, [r0], #4 - mov r4, r5, lsr r9 -neon_copy_last_bits_u: - /* - * Remember, r8 contains the size of the data in r4 in bits, - * so to get to bytes we'll need to shift 3 places - */ - lsr r8, r8, #0x3 - /* Write out the bytes stored in r4.*/ -neon_copy_last_bits_u_loop: - strb r4, [r0], #1 - subs r8, r8, #1 - lsrne r4, r4, #8 - bne neon_copy_last_bits_u_loop -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - ldmia sp!, {r6-r9} -#else - pop {r6-r9} -#endif -neon_copy_finish: - cmp r2, #0 - beq neon_end - /* - * This just copies the data from source to target one byte - * at a time. For some small values, this makes more sense. - * Note that since this code copies data a byte at a time, - * both the aligned and unaligned paths can use it. 
- */ -neon_copy_finish_loop: - ldrb r4, [r1], #1 - subs r2, r2, #1 - strb r4, [r0], #1 - bne neon_copy_finish_loop -neon_end: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - ldmia sp!, {r4-r5} - ldmia sp!, {r0} -#else - pop {r4-r5} - pop {r0} -#endif - bx lr - - .endfunc - .end diff --git git/src/neon_memmove.S git/src/neon_memmove.S deleted file mode 100644 index 1bfe597..0000000 --- git/src/neon_memmove.S +++ /dev/null @@ -1,939 +0,0 @@ -/*************************************************************************** - Copyright (c) 2009, Code Aurora Forum. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of Code Aurora nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. - ***************************************************************************/ - -/*************************************************************************** - * Neon memmove: Attempts to do a memmove with Neon registers if possible, - * Inputs: - * dest: The destination buffer - * src: The source buffer - * n: The size of the buffer to transfer - * Outputs: - * - ***************************************************************************/ - -/* - * General note: - * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP - * However, it looks like the 2006 CodeSourcery Assembler has issues generating - * the correct object code for VPOP, resulting in horrific stack crashes. - * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB, - * and VPOP->VLDMIA. We can revert this back once we update our toolchain. - * - * Also, VSHL swaps the source register and the shift-amount register - * around in 2006-q3. I've coded this incorrectly so it turns out correct - * in the object code, but we'll need to undo that later... - */ - .code 32 - .align 4 - .globl neon_memmove - .func - -neon_memmove: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - stmdb sp!, {r0} -#else - push {r0} -#endif - - /* - * The requirements for memmove state that the function should - * operate as if data were being copied from the source to a - * buffer, then to the destination. 
This is to allow a user - * to copy data from a source and target that overlap. - * - * We can't just do byte copies front-to-back automatically, since - * there's a good chance we may have an overlap (why else would someone - * intentionally use memmove then?). - * - * We'll break this into two parts. Front-to-back, or back-to-front - * copies. - */ -neon_memmove_cmf: - cmp r0, r1 - blt neon_front_to_back_copy - bgt neon_back_to_front_copy - b neon_memmove_done - - /* ############################################################# - * Front to Back copy - */ -neon_front_to_back_copy: - /* - * For small copies, just do a quick memcpy. We can do this for - * front-to-back copies, aligned or unaligned, since we're only - * doing 1 byte at a time... - */ - cmp r2, #4 - bgt neon_f2b_gt4 - cmp r2, #0 -neon_f2b_smallcopy_loop: - beq neon_memmove_done - ldrb r12, [r1], #1 - subs r2, r2, #1 - strb r12, [r0], #1 - b neon_f2b_smallcopy_loop -neon_f2b_gt4: - /* Preload what we can...*/ - pld [r0,#0] - pld [r1,#0] - /* The window size is in r3. */ - sub r3, r1, r0 -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - stmdb sp!, {r4-r6} -#else - push {r4-r6} -#endif - -neon_f2b_check_align: - /* Check alignment. */ - ands r12, r0, #0x3 - beq neon_f2b_source_align_check - cmp r12, #2 - ldrb r4, [r1], #1 - ldrleb r5, [r1], #1 - ldrltb r6, [r1], #1 - rsb r12, r12, #4 - sub r2, r2, r12 - strb r4, [r0], #1 - strleb r5, [r0], #1 - strltb r6, [r0], #1 - -neon_f2b_source_align_check: - ands r12, r1, #0x3 - bne neon_f2b_nonaligned - -neon_f2b_try_16_align: - /* If we're >64, attempt to align on 16-bytes. Smaller amounts - * don't seem to be worth handling. */ - cmp r2, #64 - blt neon_f2b_align_route - /* This is where we try 16-byte alignment. */ - ands r12, r0, #0xf - beq neon_f2b_align_route - rsb r12, r12, #16 -neon_f2b_16_start: - sub r2, r2, r12 - lsrs r5, r12, #2 -neon_f2b_align_16_4: - ldr r4, [r1], #4 - subs r5, r5, #1 - str r4, [r0], #4 - bne neon_f2b_align_16_4 -neon_f2b_align_route: - /* ############################################################# - * Front to Back copy - aligned - */ - /* - * Note that we can't just route based on the size in r2. If that's - * larger than the overlap window in r3, we could potentially - * (and likely!) destroy data we're copying. - */ - cmp r2, r3 - movle r12, r2 - movgt r12, r3 - cmp r12, #256 - bge neon_f2b_copy_128_a - cmp r12, #64 - bge neon_f2b_copy_32_a - cmp r12, #16 - bge neon_f2b_copy_16_a - cmp r12, #8 - bge neon_f2b_copy_8_a - cmp r12, #4 - bge neon_f2b_copy_4_a - b neon_f2b_copy_1_a -neon_f2b_copy_128_a: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4-q7} -#else - vpush {q4-q7} -#endif - mov r12, r2, lsr #7 -neon_f2b_copy_128_a_loop: - vld1.32 {q0,q1}, [r1]! - vld1.32 {q2,q3}, [r1]! - vld1.32 {q4,q5}, [r1]! - vld1.32 {q6,q7}, [r1]! - pld [r1, #0] - pld [r1, #128] - vst1.32 {q0,q1}, [r0]! - vst1.32 {q2,q3}, [r0]! - vst1.32 {q4,q5}, [r0]! - vst1.32 {q6,q7}, [r0]! - subs r12, r12, #1 - pld [r0, #0] - pld [r0, #128] - bne neon_f2b_copy_128_a_loop -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q4-q7} -#else - vpop {q4-q7} -#endif - ands r2, r2, #0x7f - beq neon_f2b_finish - cmp r2, #32 - bge neon_f2b_copy_32_a - b neon_f2b_copy_finish_a -neon_f2b_copy_32_a: - mov r12, r2, lsr #5 -neon_f2b_copy_32_a_loop: - vld1.32 {q0,q1}, [r1]! - subs r12, r12, #1 - pld [r1, #0] - vst1.32 {q0,q1}, [r0]! 
- bne neon_f2b_copy_32_a_loop - ands r2, r2, #0x1f - beq neon_f2b_finish -neon_f2b_copy_finish_a: -neon_f2b_copy_16_a: - movs r12, r2, lsr #4 - beq neon_f2b_copy_8_a -neon_f2b_copy_16_a_loop: - vld1.32 {q0}, [r1]! - subs r12, r12, #1 - vst1.32 {q0}, [r0]! - bne neon_f2b_copy_16_a_loop - ands r2, r2, #0xf - beq neon_f2b_finish -neon_f2b_copy_8_a: - cmp r2, #8 - blt neon_f2b_copy_4_a - ldm r1!, {r4-r5} - subs r2, r2, #8 - stm r0!, {r4-r5} -neon_f2b_copy_4_a: - cmp r2, #4 - blt neon_f2b_copy_1_a - ldr r4, [r1], #4 - subs r2, r2, #4 - str r4, [r0], #4 -neon_f2b_copy_1_a: - cmp r2, #0 - beq neon_f2b_finish -neon_f2b_copy_1_a_loop: - ldrb r12, [r1], #1 - subs r2, r2, #1 - strb r12, [r0], #1 - bne neon_f2b_copy_1_a_loop - b neon_f2b_finish - - /* ############################################################# - * Front to Back copy - unaligned - */ -neon_f2b_nonaligned: - /* - * For sizes < 8, does it really make sense to do the whole shift - * party? Note that we DON'T want to call neon_f2b_copy_1_u, - * since we'll end up trying to pop r8-r11, and we DON'T want - * to do that... - */ - cmp r2, #8 - ble neon_f2b_copy_1_a - -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - stmdb sp!, {r7-r9} -#else - push {r7-r9} -#endif - cmp r12, #2 - ldrb r4, [r1], #1 - ldrleb r5, [r1], #1 - ldrltb r6, [r1], #1 - rsb r8, r12, #4 - sub r2, r2, r8 - lsl r8, r8, #3 - orrle r4, r4, r5, lsl #8 - orrlt r4, r4, r6, lsl #16 - rsb r9, r8, #32 - /* - * r4 = overflow bits - * r8 = # of bits we copied into the r4 register to align source. - * r9 = 32 - r8 - * r12 = Index counter for each size, so we determine how many times - * the given size will go into r2, then count down that # of - * times in r12. - */ - cmp r2, #64 - blt neon_f2b_unaligned_route - ands r12, r0, #0xf - beq neon_f2b_unaligned_route - cmp r3, #4 - blt neon_f2b_unaligned_route - rsb r12, r12, #16 -neon_f2b_16_start_u: - sub r2, r2, r12 - lsrs r6, r12, #2 -neon_f2b_align_16_4_u: - ldr r5, [r1], #4 - subs r6, r6, #1 - orr r4, r4, r5, lsl r8 - str r4, [r0], #4 - mov r4, r5, lsr r9 - bne neon_f2b_align_16_4_u -neon_f2b_unaligned_route: - cmp r2, r3 - movle r12, r2 - movgt r12, r3 - cmp r12, #256 - bge neon_f2b_copy_64_u - cmp r12, #64 - bge neon_f2b_copy_32_u - cmp r12, #16 - bge neon_f2b_copy_16_u - cmp r12, #8 - bge neon_f2b_copy_8_u - cmp r12, #4 - bge neon_f2b_copy_4_u - b neon_f2b_last_bits_u -neon_f2b_copy_64_u: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4} - vstmdb sp!, {q5-q8} -#else - vpush {q4} - vpush {q5-q8} -#endif - vdup.u32 q8, r8 - mov r12, r2, lsr #6 - and r2, r2, #0x3f -neon_f2b_copy_64_u_loop: - vld1.32 {q4, q5}, [r1]! - vld1.32 {q6, q7}, [r1]! 
- lsls r5, r8, #28 - bcc neon_f2b_copy_64_u_b8 - bpl neon_f2b_copy_64_u_b16 - vshr.u64 q0, q4, #40 - vshr.u64 q1, q5, #40 - vshr.u64 q2, q6, #40 - vshr.u64 q3, q7, #40 - b neon_f2b_copy_64_unify -neon_f2b_copy_64_u_b8: - vshr.u64 q0, q4, #56 - vshr.u64 q1, q5, #56 - vshr.u64 q2, q6, #56 - vshr.u64 q3, q7, #56 - b neon_f2b_copy_64_unify -neon_f2b_copy_64_u_b16: - vshr.u64 q0, q4, #48 - vshr.u64 q1, q5, #48 - vshr.u64 q2, q6, #48 - vshr.u64 q3, q7, #48 -neon_f2b_copy_64_unify: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vshl.u64 q4, q8, q4 - vshl.u64 q5, q8, q5 - vshl.u64 q6, q8, q6 - vshl.u64 q7, q8, q7 -#else - vshl.u64 q4, q4, q8 - vshl.u64 q5, q5, q8 - vshl.u64 q6, q6, q8 - vshl.u64 q7, q7, q8 -#endif - vmov r5, s14 - vorr d9, d9, d0 - vmov s14, r4 - vorr d10, d10, d1 - vorr d11, d11, d2 - vorr d12, d12, d3 - vorr d13, d13, d4 - vorr d14, d14, d5 - vorr d15, d15, d6 - vorr d8, d8, d7 - subs r12, r12, #1 - pld [r1, #0] - pld [r1, #128] - mov r4, r5 - vst1.32 {q4, q5}, [r0]! - vst1.32 {q6, q7}, [r0]! - pld [r0, #0] - pld [r0, #128] - bne neon_f2b_copy_64_u_loop -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q5-q8} - vldmia sp!, {q4} -#else - vpop {q5-q8} - vpop {q4} -#endif - cmp r2, #32 - bge neon_f2b_copy_32_u - b neon_f2b_copy_finish_u -neon_f2b_copy_32_u: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4} -#else - vpush {q4} -#endif - vdup.u32 q4, r8 - mov r12, r2, lsr #5 - and r2, r2, #0x1f -neon_f2b_copy_32_u_loop: - vld1.32 {q0, q1}, [r1]! - lsls r5, r8, #28 - bcc neon_f2b_copy_32_u_b8 - bpl neon_f2b_copy_32_u_b16 - vshr.u64 q2, q0, #40 - vshr.u64 q3, q1, #40 - b neon_f2b_copy_32_unify -neon_f2b_copy_32_u_b8: - vshr.u64 q2, q0, #56 - vshr.u64 q3, q1, #56 - b neon_f2b_copy_32_unify -neon_f2b_copy_32_u_b16: - vshr.u64 q2, q0, #48 - vshr.u64 q3, q1, #48 -neon_f2b_copy_32_unify: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vshl.u64 q0, q4, q0 - vshl.u64 q1, q4, q1 -#else - vshl.u64 q0, q0, q4 - vshl.u64 q1, q1, q4 -#endif - vmov r5, s14 - vorr d1, d1, d4 - vmov s14, r4 - vorr d2, d2, d5 - vorr d3, d3, d6 - vorr d0, d0, d7 - subs r12, r12, #1 - pld [r1, #0] - mov r4, r5 - vst1.32 {q0, q1}, [r0]! - bne neon_f2b_copy_32_u_loop -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q4} -#else - vpop {q4} -#endif -neon_f2b_copy_finish_u: -neon_f2b_copy_16_u: - movs r12, r2, lsr #4 - beq neon_f2b_copy_8_u - vdup.u32 q2, r8 - and r2, r2, #0xf -neon_f2b_copy_16_u_loop: - vld1.32 {q0}, [r1]! - lsls r5, r8, #28 - bcc neon_f2b_copy_16_u_b8 - bpl neon_f2b_copy_16_u_b16 - vshr.u64 q1, q0, #40 - b neon_f2b_copy_16_unify -neon_f2b_copy_16_u_b8: - vshr.u64 q1, q0, #56 - b neon_f2b_copy_16_unify -neon_f2b_copy_16_u_b16: - vshr.u64 q1, q0, #48 -neon_f2b_copy_16_unify: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vshl.u64 q0, q2, q0 -#else - vshl.u64 q0, q0, q2 -#endif - vmov r5, s6 - vorr d1, d1, d2 - vmov s6, r4 - vorr d0, d0, d3 - subs r12, r12, #1 - mov r4, r5 - vst1.32 {q0}, [r0]! 
- bne neon_f2b_copy_16_u_loop -neon_f2b_copy_8_u: - cmp r2, #8 - blt neon_f2b_copy_4_u - ldm r1!, {r6-r7} - subs r2, r2, #8 - orr r4, r4, r6, lsl r8 - mov r5, r6, lsr r9 - orr r5, r5, r7, lsl r8 - stm r0!, {r4-r5} - mov r4, r7, lsr r9 -neon_f2b_copy_4_u: - cmp r2, #4 - blt neon_f2b_last_bits_u - ldr r5, [r1], #4 - subs r2, r2, #4 - orr r4, r4, r5, lsl r8 - str r4, [r0], #4 - mov r4, r5, lsr r9 -neon_f2b_last_bits_u: - lsr r8, r8, #0x3 -neon_f2b_last_bits_u_loop: - strb r4, [r0], #1 - subs r8, r8, #1 - lsr r4, r4, #8 - bne neon_f2b_last_bits_u_loop -neon_f2b_copy_1_u: - cmp r2, #0 - beq neon_f2b_finish_u -neon_f2b_copy_1_u_loop: - ldrb r12, [r1], #1 - subs r2, r2, #1 - strb r12, [r0], #1 - bne neon_f2b_copy_1_u_loop -neon_f2b_finish_u: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - ldmia sp!, {r7-r9} -#else - pop {r7-r9} -#endif - /* ############################################################# - * Front to Back copy - finish - */ -neon_f2b_finish: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - ldmia sp!, {r4-r6} -#else - pop {r4-r6} -#endif - b neon_memmove_done - - /* ############################################################# - * Back to Front copy - */ -neon_back_to_front_copy: - /* - * Here, we'll want to shift to the end of the buffers. This - * actually points us one past where we need to go, but since - * we'll pre-decrement throughout, this will be fine. - */ - add r0, r0, r2 - add r1, r1, r2 - cmp r2, #4 - bgt neon_b2f_gt4 - cmp r2, #0 -neon_b2f_smallcopy_loop: - beq neon_memmove_done - ldrb r12, [r1, #-1]! - subs r2, r2, #1 - strb r12, [r0, #-1]! - b neon_b2f_smallcopy_loop -neon_b2f_gt4: - pld [r0, #0] - pld [r1, #0] - /* - * The minimum of the overlap window size and the copy size - * is in r3. - */ - sub r3, r0, r1 -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - stmdb sp!, {r4-r5} -#else - push {r4-r5} -#endif - - /* - * Check alignment. Since we'll pre-decrement as we step thru, we'll - * need to make sure we're on word-alignment. - */ -neon_b2f_check_align: - ands r12, r0, #0x3 - beq neon_b2f_source_align_check - sub r2, r2, r12 -neon_b2f_shift_align: - ldrb r4, [r1, #-1]! - subs r12, r12, #1 - strb r4, [r0, #-1]! - bne neon_b2f_shift_align -neon_b2f_source_align_check: - ands r4, r1, #0x3 - bne neon_b2f_nonaligned - -neon_b2f_try_16_align: - /* If we're >64, attempt to align on 16-bytes. Smaller amounts - * don't seem to be worth handling. */ - cmp r2, #64 - blt neon_b2f_align_route - ands r12, r0, #0xf - beq neon_b2f_align_route - /* In this case, r12 has the number of bytes to roll backward. */ -neon_b2f_16_start: - sub r2, r2, r12 - lsrs r5, r12, #2 -neon_b2f_align_16_4: - ldr r4, [r1, #-4]! - subs r5, r5, #1 - str r4, [r0, #-4]! - bne neon_b2f_align_16_4 -neon_b2f_align_route: - /* - * ############################################################# - * Back to Front copy - aligned - */ - cmp r2, r3 - movle r12, r2 - movgt r12, r3 - cmp r12, #256 - bge neon_b2f_copy_128_a - cmp r12, #64 - bge neon_b2f_copy_32_a - cmp r12, #8 - bge neon_b2f_copy_8_a - cmp r12, #4 - bge neon_b2f_copy_4_a - b neon_b2f_copy_1_a -neon_b2f_copy_128_a: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4-q7} -#else - vpush {q4-q7} -#endif - movs r12, r2, lsr #7 - /* - * This irks me. There MUST be a better way to read these in and - * scan the register backward instead of making it go forward. 
Then - * we need to do two subtractions... - */ -neon_b2f_copy_128_a_loop: - sub r1, r1, #128 - sub r0, r0, #128 - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - vld1.32 {q4, q5}, [r1]! - vld1.32 {q6, q7}, [r1]! - pld [r1, #-128] - pld [r1, #-256] - vst1.32 {q0, q1}, [r0]! - vst1.32 {q2, q3}, [r0]! - vst1.32 {q4, q5}, [r0]! - vst1.32 {q6, q7}, [r0]! - subs r12, r12, #1 - pld [r0, #-128] - pld [r0, #-256] - sub r1, r1, #128 - sub r0, r0, #128 - bne neon_b2f_copy_128_a_loop -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q4-q7} -#else - vpop {q4-q7} -#endif - ands r2, r2, #0x7f - beq neon_b2f_finish - cmp r2, #32 - bge neon_b2f_copy_32_a - b neon_b2f_copy_finish_a -neon_b2f_copy_32_a: - mov r12, r2, lsr #5 -neon_b2f_copy_32_a_loop: - sub r1, r1, #32 - sub r0, r0, #32 - vld1.32 {q0,q1}, [r1] - subs r12, r12, #1 - vst1.32 {q0,q1}, [r0] - pld [r1, #0] - bne neon_b2f_copy_32_a_loop - ands r2, r2, #0x1f - beq neon_b2f_finish -neon_b2f_copy_finish_a: -neon_b2f_copy_8_a: - movs r12, r2, lsr #0x3 - beq neon_b2f_copy_4_a -neon_b2f_copy_8_a_loop: - ldmdb r1!, {r4-r5} - subs r12, r12, #1 - stmdb r0!, {r4-r5} - bne neon_b2f_copy_8_a_loop - and r2, r2, #0x7 -neon_b2f_copy_4_a: - movs r12, r2, lsr #0x2 - beq neon_b2f_copy_1_a - and r2, r2, #0x3 -neon_b2f_copy_4_a_loop: - ldr r4, [r1, #-4]! - subs r12, r12, #1 - str r4, [r0, #-4]! - bne neon_b2f_copy_4_a_loop -neon_b2f_copy_1_a: - cmp r2, #0 - beq neon_b2f_finish -neon_b2f_copy_1_a_loop: - ldrb r12, [r1, #-1]! - subs r2, r2, #1 - strb r12, [r0, #-1]! - bne neon_b2f_copy_1_a_loop - - /* ############################################################# - * Back to Front copy - unaligned - */ -neon_b2f_nonaligned: - /* - * For sizes < 8, does it really make sense to do the whole shift - * party? - */ - cmp r2, #8 - ble neon_b2f_copy_1_a -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - stmdb sp!, {r6-r11} -#else - push {r6-r11} -#endif - /* - * r3 = max window size - * r4 = overflow bytes - * r5 = bytes we're reading into - * r6 = # bytes we're off. - * r10 = copy of r6 - */ - and r6, r1, #0x3 - eor r4, r4, r4 - mov r10, r6 -neon_b2f_realign: - ldrb r5, [r1, #-1]! - subs r6, r6, #1 - orr r4, r5, r4, lsl #8 - bne neon_b2f_realign - /* - * r10 = # of bits we copied into the r4 register to align source. - * r11 = 32 - r10 - * r12 = Index counter for each size, so we determine how many times - * the given size will go into r2, then count down that # of - * times in r12. - */ - sub r2, r2, r10 - lsl r10, r10, #0x3 - rsb r11, r10, #32 - - cmp r2, r3 - movle r12, r2 - movgt r12, r3 - cmp r12, #256 - bge neon_b2f_copy_64_u - cmp r12, #64 - bge neon_b2f_copy_32_u - cmp r12, #8 - bge neon_b2f_copy_8_u - cmp r12, #4 - bge neon_b2f_copy_4_u - b neon_b2f_last_bits_u -neon_b2f_copy_64_u: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4,q5} - vstmdb sp!, {q6-q8} -#else - vpush {q4,q5} - vpush {q6-q8} -#endif - add r7, r11, #32 - movs r12, r2, lsr #6 - vdup.u32 q8, r7 -neon_b2f_copy_64_u_loop: - sub r1, r1, #64 - sub r0, r0, #64 - vld1.32 {q0, q1}, [r1]! 
- vld1.32 {q2, q3}, [r1] - sub r1, r1, #32 - vmov q4, q0 - vmov q5, q1 - vmov q6, q2 - vmov q7, q3 - vmov r5, s0 - mov r4, r4, lsl r11 - lsls r6, r10, #28 - bcc neon_b2f_copy_64_u_b8 - bpl neon_b2f_copy_64_u_b16 - vshr.u64 q0, q0, #24 - vshr.u64 q1, q1, #24 - vshr.u64 q2, q2, #24 - vshr.u64 q3, q3, #24 - b neon_b2f_copy_64_unify -neon_b2f_copy_64_u_b8: - vshr.u64 q0, q0, #8 - vshr.u64 q1, q1, #8 - vshr.u64 q2, q2, #8 - vshr.u64 q3, q3, #8 - b neon_b2f_copy_64_unify -neon_b2f_copy_64_u_b16: - vshr.u64 q0, q0, #16 - vshr.u64 q1, q1, #16 - vshr.u64 q2, q2, #16 - vshr.u64 q3, q3, #16 -neon_b2f_copy_64_unify: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vshl.u64 q4, q8, q4 - vshl.u64 q5, q8, q5 - vshl.u64 q6, q8, q6 - vshl.u64 q7, q8, q7 -#else - vshl.u64 q4, q4, q8 - vshl.u64 q5, q5, q8 - vshl.u64 q6, q6, q8 - vshl.u64 q7, q7, q8 -#endif - vmov s17, r4 - vorr d7, d7, d8 - vorr d6, d6, d15 - vorr d5, d5, d14 - vorr d4, d4, d13 - vorr d3, d3, d12 - vorr d2, d2, d11 - vorr d1, d1, d10 - vorr d0, d0, d9 - mov r4, r5, lsl r11 - subs r12, r12, #1 - lsr r4, r4, r11 - vst1.32 {q0, q1}, [r0]! - vst1.32 {q2, q3}, [r0] - pld [r1, #0] - sub r0, r0, #32 - bne neon_b2f_copy_64_u_loop -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q6-q8} - vldmia sp!, {q4,q5} -#else - vpop {q6-q8} - vpop {q4,q5} -#endif - ands r2, r2, #0x3f - cmp r2, #32 - bge neon_b2f_copy_32_u - b neon_b2f_copy_finish_u -neon_b2f_copy_32_u: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vstmdb sp!, {q4} -#else - vpush {q4} -#endif - add r7, r11, #32 - movs r12, r2, lsr #5 - vdup.u32 q4, r7 - and r2, r2, #0x1f -neon_b2f_copy_32_u_loop: - sub r1, r1, #32 - sub r0, r0, #32 - vld1.32 {q0, q1}, [r1] - vmov q2, q0 - vmov q3, q1 - vmov r5, s0 - mov r4, r4, lsl r11 - lsls r6, r10, #28 - bcc neon_b2f_copy_32_u_b8 - bpl neon_b2f_copy_32_u_b16 - vshr.u64 q0, q0, #24 - vshr.u64 q1, q1, #24 - b neon_b2f_copy_32_unify -neon_b2f_copy_32_u_b8: - vshr.u64 q0, q0, #8 - vshr.u64 q1, q1, #8 - b neon_b2f_copy_32_unify -neon_b2f_copy_32_u_b16: - vshr.u64 q0, q0, #16 - vshr.u64 q1, q1, #16 -neon_b2f_copy_32_unify: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vshl.u64 q2, q4, q2 - vshl.u64 q3, q4, q3 -#else - vshl.u64 q2, q2, q4 - vshl.u64 q3, q3, q4 -#endif - vmov s9, r4 - vorr d3, d3, d4 - vorr d2, d2, d7 - vorr d1, d1, d6 - vorr d0, d0, d5 - mov r4, r5, lsl r11 - subs r12, r12, #1 - lsr r4, r4, r11 - vst1.32 {q0, q1}, [r0] - pld [r1, #0] - bne neon_b2f_copy_32_u_loop -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - vldmia sp!, {q4} -#else - vpop {q4} -#endif -neon_b2f_copy_finish_u: -neon_b2f_copy_8_u: - movs r12, r2, lsr #0x3 - beq neon_b2f_copy_4_u - mov r5, r4, lsl r11 -neon_b2f_copy_8_u_loop: - ldmdb r1!, {r6-r7} - subs r12, r12, #1 - orr r5, r5, r7, lsr r10 - mov r4, r7, lsl r11 - orr r4, r4, r6, lsr r10 - stmdb r0!, {r4-r5} - mov r4, r6, lsl r11 - lsr r4, r4, r11 - mov r5, r4, lsl r11 - bne neon_b2f_copy_8_u_loop - ands r2, r2, #0x7 -neon_b2f_copy_4_u: - movs r12, r2, lsr #0x2 - beq neon_b2f_last_bits_u - mov r5, r4, lsl r11 -neon_b2f_copy_4_u_loop: - ldr r6, [r1, #-4]! - subs r12, r12, #1 - orr r5, r5, r6, lsr r10 - str r5, [r0, #-4]! 
- mov r4, r6, lsl r11 - lsr r4, r4, r11 - mov r5, r4, lsl r11 - bne neon_b2f_copy_4_u_loop - and r2, r2, #0x3 -neon_b2f_last_bits_u: -neon_b2f_last_bits_u_loop: - subs r10, r10, #8 - mov r5, r4, lsr r10 - strb r5, [r0, #-1]! - bne neon_b2f_last_bits_u_loop -neon_b2f_copy_1_u: - cmp r2, #0 - beq neon_b2f_finish_u -neon_b2f_copy_1_u_loop: - ldrb r12, [r1, #-1]! - subs r2, r2, #1 - strb r12, [r0, #-1]! - bne neon_b2f_copy_1_u_loop -neon_b2f_finish_u: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - ldmia sp!, {r6-r11} -#else - pop {r6-r11} -#endif - -neon_b2f_finish: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - ldmia sp!, {r4-r5} -#else - pop {r4-r5} -#endif - -neon_memmove_done: -#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) - ldmia sp!, {r0} -#else - pop {r0} -#endif - bx lr - - .endfunc - .end diff --git git/src/neon_memsets.c git/src/neon_memsets.c deleted file mode 100755 index 740fc1e..0000000 --- git/src/neon_memsets.c +++ /dev/null @@ -1,169 +0,0 @@ -/* neon_memsets.c - * - * Copyright (c) 2009, Code Aurora Forum. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Code Aurora nor - * the names of its contributors may be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "msm-swblits.h" - -void memset16(uint16_t dst[], uint16_t value, int count) -{ - if (count <= 0) - return; - - asm volatile( - " pld [%[dst], #0] \n" - " cmp %[count], #4 \n" - " blt 6f \n" - " tst %[dst], #0x3 \n" - " strneh %[value], [%[dst]], #2 \n" - " subne %[count], %[count], #1 \n" - " vdup.u16 q8, %[value] \n" - " vmov q9, q8 \n" - " cmp %[count], #64 \n" - " bge 0f \n" - " cmp %[count], #32 \n" - " bge 2f \n" - " cmp %[count], #16 \n" - " bge 3f \n" - " cmp %[count], #8 \n" - " bge 4f \n" - " b 5f \n" - "0: \n" - " mov r12, %[count], lsr #6 \n" - "1: \n" - " vst1.16 {q8, q9}, [%[dst]]! \n" - " vst1.16 {q8, q9}, [%[dst]]! \n" - " vst1.16 {q8, q9}, [%[dst]]! \n" - " vst1.16 {q8, q9}, [%[dst]]! 
\n" - " subs r12, r12, #1 \n" - " bne 1b \n" - " ands %[count], %[count], #0x3f \n" - " beq 7f \n" - "2: \n" - " cmp %[count], #32 \n" - " blt 3f \n" - " vst1.16 {q8, q9}, [%[dst]]! \n" - " vst1.16 {q8, q9}, [%[dst]]! \n" - " subs %[count], %[count], #32 \n" - " beq 7f \n" - "3: \n" - " cmp %[count], #16 \n" - " blt 4f \n" - " vst1.16 {q8, q9}, [%[dst]]! \n" - " subs %[count], %[count], #16 \n" - " beq 7f \n" - "4: \n" - " cmp %[count], #8 \n" - " blt 5f \n" - " vst1.16 {q8}, [%[dst]]! \n" - " subs %[count], %[count], #8 \n" - " beq 7f \n" - "5: \n" - " cmp %[count], #4 \n" - " blt 6f \n" - " vst1.16 {d16}, [%[dst]]! \n" - " subs %[count], %[count], #4 \n" - " beq 7f \n" - "6: \n" - " cmp %[count], #0 \n" - " blt 7f \n" - " lsls %[count], #31 \n" - " strmih %[value], [%[dst]], #2 \n" - " strcsh %[value], [%[dst]], #2 \n" - " strcsh %[value], [%[dst]], #2 \n" - "7: \n" - // Clobbered input registers - : [dst] "+r" (dst), [count] "+r" (count) - // Unclobbered input - : [value] "r" (value) - // Clobbered registers - : "q8", "q9", "r12", "cc", "memory" - ); -} - -void memset32(uint32_t dst[], uint32_t value, int count) -{ - asm volatile( - " pld [%[dst], #0] \n" - " cmp %[count], #4 \n" - " blt 5f \n" - " vdup.u32 q8, %[value] \n" - " vmov q9, q8 \n" - " cmp %[count], #32 \n" - " bge 0f \n" - " cmp %[count], #16 \n" - " bge 2f \n" - " cmp %[count], #8 \n" - " bge 3f \n" - " b 4f \n" - "0: \n" - " mov r12, %[count], lsr #5 \n" - "1: \n" - " vst1.32 {q8, q9}, [%[dst]]! \n" - " vst1.32 {q8, q9}, [%[dst]]! \n" - " vst1.32 {q8, q9}, [%[dst]]! \n" - " vst1.32 {q8, q9}, [%[dst]]! \n" - " pld [%[dst], #0] \n" - " subs r12, r12, #1 \n" - " bne 1b \n" - " ands %[count], %[count], #0x1f \n" - " beq 6f \n" - "2: \n" - " cmp %[count], #16 \n" - " blt 3f \n" - " vst1.32 {q8, q9}, [%[dst]]! \n" - " vst1.32 {q8, q9}, [%[dst]]! \n" - " subs %[count], %[count], #16 \n" - " beq 6f \n" - "3: \n" - " cmp %[count], #8 \n" - " blt 4f \n" - " vst1.32 {q8, q9}, [%[dst]]! \n" - " subs %[count], %[count], #8 \n" - " beq 6f \n" - "4: \n" - " cmp %[count], #4 \n" - " blt 5f \n" - " vst1.32 {q8}, [%[dst]]! \n" - " subs %[count], %[count], #4 \n" - " beq 6f \n" - "5: \n" - " cmp %[count], #0 \n" - " beq 6f \n" - " lsls %[count], #31 \n" - " strmi %[value], [%[dst]], #4 \n" - " strcs %[value], [%[dst]], #4 \n" - " strcs %[value], [%[dst]], #4 \n" - "6: @end \n" - // Clobbered input registers - : [dst] "+r" (dst), [count] "+r" (count) - // Unclobbered input - : [value] "r" (value) - // Clobbered registers - : "q8", "q9", "r12", "cc", "memory" - ); -}