Diffstat (limited to 'packages/mplayer/files/mru-neon-h264-chrome.diff')
 -rw-r--r--   packages/mplayer/files/mru-neon-h264-chrome.diff   364
 1 file changed, 364 insertions, 0 deletions
diff --git a/packages/mplayer/files/mru-neon-h264-chrome.diff b/packages/mplayer/files/mru-neon-h264-chrome.diff
new file mode 100644
index 0000000000..cb6c4ff991
--- /dev/null
+++ b/packages/mplayer/files/mru-neon-h264-chrome.diff
@@ -0,0 +1,364 @@
+From: Mans Rullgard <mans@mansr.com>
+Date: Fri, 11 Jul 2008 01:20:07 +0000 (+0100)
+Subject: ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
+X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=d3aa8f93b8a0061e0c3ac12aeed055961abfc113
+
+ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
+---
+
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index 7fa02fa..36ba158 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -437,6 +437,7 @@ OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \
+
+ ASM_OBJS-$(HAVE_NEON) += armv4l/dsputil_neon_s.o \
+ armv4l/simple_idct_neon.o \
++ armv4l/h264dsp_neon.o \
+
+ OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \
+ sparc/simple_idct_vis.o \
+diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
+index 8a10dde..a6d86cd 100644
+--- a/libavcodec/armv4l/dsputil_neon.c
++++ b/libavcodec/armv4l/dsputil_neon.c
+@@ -42,6 +42,12 @@ void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+ void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+ void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
+
++void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
++void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
++
++void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
++void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
++
+ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+ {
+ c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+@@ -62,6 +68,12 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
++ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
++ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
++
++ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
++ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
++
+ c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_neon;
+ c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_neon;
+ }
+diff --git a/libavcodec/armv4l/h264dsp_neon.S b/libavcodec/armv4l/h264dsp_neon.S
+new file mode 100644
+index 0000000..28d9aa7
+--- /dev/null
++++ b/libavcodec/armv4l/h264dsp_neon.S
+@@ -0,0 +1,308 @@
++/*
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++ .fpu neon
++
++/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
++ .macro h264_chroma_mc8 avg=0
++ push {r4-r7, lr}
++ ldrd r4, [sp, #20]
++.if \avg
++ mov lr, r0
++.endif
++ pld [r1]
++ pld [r1, r2]
++
++ muls r7, r4, r5
++ rsb r6, r7, r5, lsl #3
++ rsb ip, r7, r4, lsl #3
++ sub r4, r7, r4, lsl #3
++ sub r4, r4, r5, lsl #3
++ add r4, r4, #64
++
++ dmb
++
++ beq 2f
++
++ add r5, r1, r2
++
++ vdup.8 d0, r4
++ lsl r4, r2, #1
++ vdup.8 d1, ip
++ vld1.64 {d4, d5}, [r1], r4
++ vdup.8 d2, r6
++ vld1.64 {d6, d7}, [r5], r4
++ vdup.8 d3, r7
++
++ vext.8 d5, d4, d5, #1
++ vext.8 d7, d6, d7, #1
++
++1: pld [r5]
++ vmull.u8 q8, d4, d0
++ vmlal.u8 q8, d5, d1
++ vld1.64 {d4, d5}, [r1], r4
++ vmlal.u8 q8, d6, d2
++ vext.8 d5, d4, d5, #1
++ vmlal.u8 q8, d7, d3
++ vmull.u8 q9, d6, d0
++ subs r3, r3, #2
++ vmlal.u8 q9, d7, d1
++ vmlal.u8 q9, d4, d2
++ vmlal.u8 q9, d5, d3
++ vrshrn.u16 d16, q8, #6
++ vld1.64 {d6, d7}, [r5], r4
++ pld [r1]
++ vrshrn.u16 d17, q9, #6
++.if \avg
++ vld1.64 {d20}, [lr,:64], r2
++ vld1.64 {d21}, [lr,:64], r2
++ vrhadd.u8 q8, q8, q10
++.endif
++ vext.8 d7, d6, d7, #1
++ vst1.64 {d16}, [r0,:64], r2
++ vst1.64 {d17}, [r0,:64], r2
++ bgt 1b
++
++ pop {r4-r7, pc}
++
++2: tst r6, r6
++ add ip, ip, r6
++ vdup.8 d0, r4
++ vdup.8 d1, ip
++
++ beq 4f
++
++ add r5, r1, r2
++ lsl r4, r2, #1
++ vld1.64 {d4}, [r1], r4
++ vld1.64 {d6}, [r5], r4
++
++3: pld [r5]
++ vmull.u8 q8, d4, d0
++ vmlal.u8 q8, d6, d1
++ vld1.64 {d4}, [r1], r4
++ vmull.u8 q9, d6, d0
++ vmlal.u8 q9, d4, d1
++ vld1.64 {d6}, [r5], r4
++ vrshrn.u16 d16, q8, #6
++ vrshrn.u16 d17, q9, #6
++.if \avg
++ vld1.64 {d20}, [lr,:64], r2
++ vld1.64 {d21}, [lr,:64], r2
++ vrhadd.u8 q8, q8, q10
++.endif
++ subs r3, r3, #2
++ pld [r1]
++ vst1.64 {d16}, [r0,:64], r2
++ vst1.64 {d17}, [r0,:64], r2
++ bgt 3b
++
++ pop {r4-r7, pc}
++
++4: vld1.64 {d4, d5}, [r1], r2
++ vld1.64 {d6, d7}, [r1], r2
++ vext.8 d5, d4, d5, #1
++ vext.8 d7, d6, d7, #1
++
++5: pld [r1]
++ subs r3, r3, #2
++ vmull.u8 q8, d4, d0
++ vmlal.u8 q8, d5, d1
++ vld1.64 {d4, d5}, [r1], r2
++ vmull.u8 q9, d6, d0
++ vmlal.u8 q9, d7, d1
++ pld [r1]
++ vext.8 d5, d4, d5, #1
++ vrshrn.u16 d16, q8, #6
++ vrshrn.u16 d17, q9, #6
++.if \avg
++ vld1.64 {d20}, [lr,:64], r2
++ vld1.64 {d21}, [lr,:64], r2
++ vrhadd.u8 q8, q8, q10
++.endif
++ vld1.64 {d6, d7}, [r1], r2
++ vext.8 d7, d6, d7, #1
++ vst1.64 {d16}, [r0,:64], r2
++ vst1.64 {d17}, [r0,:64], r2
++ bgt 5b
++
++ pop {r4-r7, pc}
++ .endm
++
++/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
++ .macro h264_chroma_mc4 avg=0
++ push {r4-r7, lr}
++ ldrd r4, [sp, #20]
++.if \avg
++ mov lr, r0
++.endif
++ pld [r1]
++ pld [r1, r2]
++
++ muls r7, r4, r5
++ rsb r6, r7, r5, lsl #3
++ rsb ip, r7, r4, lsl #3
++ sub r4, r7, r4, lsl #3
++ sub r4, r4, r5, lsl #3
++ add r4, r4, #64
++
++ dmb
++
++ beq 2f
++
++ add r5, r1, r2
++
++ vdup.8 d0, r4
++ lsl r4, r2, #1
++ vdup.8 d1, ip
++ vld1.64 {d4}, [r1], r4
++ vdup.8 d2, r6
++ vld1.64 {d6}, [r5], r4
++ vdup.8 d3, r7
++
++ vext.8 d5, d4, d5, #1
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d4, d5
++ vtrn.32 d6, d7
++
++ vtrn.32 d0, d1
++ vtrn.32 d2, d3
++
++1: pld [r5]
++ vmull.u8 q8, d4, d0
++ vmlal.u8 q8, d6, d2
++ vld1.64 {d4}, [r1], r4
++ vext.8 d5, d4, d5, #1
++ vtrn.32 d4, d5
++ vmull.u8 q9, d6, d0
++ vmlal.u8 q9, d4, d2
++ vld1.64 {d6}, [r5], r4
++ vadd.i16 d16, d16, d17
++ vadd.i16 d17, d18, d19
++ vrshrn.u16 d16, q8, #6
++ subs r3, r3, #2
++ pld [r1]
++.if \avg
++ vld1.32 {d20[0]}, [lr,:32], r2
++ vld1.32 {d20[1]}, [lr,:32], r2
++ vrhadd.u8 d16, d16, d20
++.endif
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d6, d7
++ vst1.32 {d16[0]}, [r0,:32], r2
++ vst1.32 {d16[1]}, [r0,:32], r2
++ bgt 1b
++
++ pop {r4-r7, pc}
++
++2: tst r6, r6
++ add ip, ip, r6
++ vdup.8 d0, r4
++ vdup.8 d1, ip
++ vtrn.32 d0, d1
++
++ beq 4f
++
++ vext.32 d1, d0, d1, #1
++ add r5, r1, r2
++ lsl r4, r2, #1
++ vld1.32 {d4[0]}, [r1], r4
++ vld1.32 {d4[1]}, [r5], r4
++
++3: pld [r5]
++ vmull.u8 q8, d4, d0
++ vld1.32 {d4[0]}, [r1], r4
++ vmull.u8 q9, d4, d1
++ vld1.32 {d4[1]}, [r5], r4
++ vadd.i16 d16, d16, d17
++ vadd.i16 d17, d18, d19
++ vrshrn.u16 d16, q8, #6
++.if \avg
++ vld1.32 {d20[0]}, [lr,:32], r2
++ vld1.32 {d20[1]}, [lr,:32], r2
++ vrhadd.u8 d16, d16, d20
++.endif
++ subs r3, r3, #2
++ pld [r1]
++ vst1.32 {d16[0]}, [r0,:32], r2
++ vst1.32 {d16[1]}, [r0,:32], r2
++ bgt 3b
++
++ pop {r4-r7, pc}
++
++4: vld1.64 {d4}, [r1], r2
++ vld1.64 {d6}, [r1], r2
++ vext.8 d5, d4, d5, #1
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d4, d5
++ vtrn.32 d6, d7
++
++5: vmull.u8 q8, d4, d0
++ vmull.u8 q9, d6, d0
++ subs r3, r3, #2
++ vld1.64 {d4}, [r1], r2
++ vext.8 d5, d4, d5, #1
++ vtrn.32 d4, d5
++ vadd.i16 d16, d16, d17
++ vadd.i16 d17, d18, d19
++ pld [r1]
++ vrshrn.u16 d16, q8, #6
++.if \avg
++ vld1.32 {d20[0]}, [lr,:32], r2
++ vld1.32 {d20[1]}, [lr,:32], r2
++ vrhadd.u8 d16, d16, d20
++.endif
++ vld1.64 {d6}, [r1], r2
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d6, d7
++ pld [r1]
++ vst1.32 {d16[0]}, [r0,:32], r2
++ vst1.32 {d16[1]}, [r0,:32], r2
++ bgt 5b
++
++ pop {r4-r7, pc}
++ .endm
++
++ .text
++ .align
++
++ .global ff_put_h264_chroma_mc8_neon
++ .func ff_put_h264_chroma_mc8_neon
++ff_put_h264_chroma_mc8_neon:
++ h264_chroma_mc8
++ .endfunc
++
++ .global ff_avg_h264_chroma_mc8_neon
++ .func ff_avg_h264_chroma_mc8_neon
++ff_avg_h264_chroma_mc8_neon:
++ h264_chroma_mc8 avg=1
++ .endfunc
++
++ .global ff_put_h264_chroma_mc4_neon
++ .func ff_put_h264_chroma_mc4_neon
++ff_put_h264_chroma_mc4_neon:
++ h264_chroma_mc4
++ .endfunc
++
++ .global ff_avg_h264_chroma_mc4_neon
++ .func ff_avg_h264_chroma_mc4_neon
++ff_avg_h264_chroma_mc4_neon:
++ h264_chroma_mc4 avg=1
++ .endfunc