Makefile | 4 + arm/dsputil_neon.c | 16 ++++ arm/dsputil_neon_s.S | 178 +++++++++++++++++++++++++++++++++++++------------ arm/simple_idct_neon.S | 17 ++++ arm/vp3dsp_neon.S | 94 +++++++++++++++++++++++++ 5 files changed, 265 insertions(+), 44 deletions(-) diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon.c ffmpeg-0.5/libavcodec/arm/dsputil_neon.c --- ffmpeg.old/libavcodec/arm/dsputil_neon.c 2009-01-31 00:13:19.000000000 +0100 +++ ffmpeg-0.5/libavcodec/arm/dsputil_neon.c 2009-05-30 11:27:54.000000000 +0200 @@ -41,6 +41,10 @@ void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); +void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); +void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); +void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); + void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); @@ -146,6 +150,9 @@ DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); +void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); + void ff_vector_fmul_neon(float *dst, const float *src, int len); void ff_vector_fmul_window_neon(float *dst, const float *src0, const float *src1, const float *win, @@ -176,6 +183,10 @@ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; + c->add_pixels_clamped = ff_add_pixels_clamped_neon; + c->put_pixels_clamped = ff_put_pixels_clamped_neon; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; @@ -247,6 +258,11 @@ c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; c->h264_idct_add8 = ff_h264_idct_add8_neon; + if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) { + c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon; + c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon; + } + c->vector_fmul = ff_vector_fmul_neon; c->vector_fmul_window = ff_vector_fmul_window_neon; diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon_s.S ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S --- ffmpeg.old/libavcodec/arm/dsputil_neon_s.S 2009-01-31 00:13:19.000000000 +0100 +++ ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S 2009-05-30 11:27:54.000000000 +0200 @@ -38,13 +38,13 @@ pld [r1, r2] pld [r1, r2, lsl #1] .if \avg - vld1.64 {d16,d17}, [ip], r2 + vld1.64 {d16,d17}, [ip,:128], r2 vrhadd.u8 q0, q0, q8 - vld1.64 {d18,d19}, [ip], r2 + vld1.64 {d18,d19}, [ip,:128], r2 vrhadd.u8 q1, q1, q9 - vld1.64 {d20,d21}, [ip], r2 + vld1.64 {d20,d21}, [ip,:128], r2 vrhadd.u8 q2, q2, q10 - vld1.64 {d22,d23}, [ip], r2 + vld1.64 {d22,d23}, [ip,:128], r2 vrhadd.u8 q3, q3, q11 .endif subs r3, r3, #4 @@ -73,35 +73,29 @@ .endm .macro pixels16_y2 vhadd=vrhadd.u8 - push {lr} - add ip, r1, r2 - lsl lr, r2, #1 - vld1.64 {d0, d1}, [r1], lr - vld1.64 {d2, d3}, [ip], lr + vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d2, d3}, [r1], r2 1: subs r3, r3, #2 \vhadd q2, q0, q1 - vld1.64 {d0, d1}, [r1], lr + vld1.64 {d0, d1}, [r1], r2 \vhadd q3, q0, q1 - vld1.64 {d2, d3}, [ip], lr + vld1.64 {d2, d3}, [r1], r2 pld [r1] - pld [ip] + pld [r1, r2] vst1.64 {d4, d5}, [r0,:128], r2 vst1.64 {d6, d7}, [r0,:128], r2 bne 1b - pop {pc} + bx lr .endm .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 - push {lr} - lsl lr, r2, #1 - add ip, r1, r2 - vld1.64 {d0-d2}, [r1], lr - vld1.64 {d4-d6}, [ip], lr + vld1.64 {d0-d2}, [r1], r2 + vld1.64 {d4-d6}, [r1], r2 .if \no_rnd vmov.i16 q13, #1 .endif pld [r1] - pld [ip] + pld [r1, r2] vext.8 q1, q0, q1, #1 vext.8 q3, q2, q3, #1 vaddl.u8 q8, d0, d2 @@ -109,7 +103,7 @@ vaddl.u8 q9, d4, d6 vaddl.u8 q11, d5, d7 1: subs r3, r3, #2 - vld1.64 {d0-d2}, [r1], lr + vld1.64 {d0-d2}, [r1], r2 vadd.u16 q12, q8, q9 pld [r1] .if \no_rnd @@ -123,11 +117,11 @@ .endif \vshrn d29, q1, #2 vaddl.u8 q8, d0, d30 - vld1.64 {d2-d4}, [ip], lr + vld1.64 {d2-d4}, [r1], r2 vaddl.u8 q10, d1, d31 vst1.64 {d28,d29}, [r0,:128], r2 vadd.u16 q12, q8, q9 - pld [ip] + pld [r1, r2] .if \no_rnd vadd.u16 q12, q12, q13 .endif @@ -142,7 +136,7 @@ vaddl.u8 q11, d3, d5 vst1.64 {d30,d31}, [r0,:128], r2 bgt 1b - pop {pc} + bx lr .endm .macro pixels8 @@ -180,41 +174,35 @@ .endm .macro pixels8_y2 vhadd=vrhadd.u8 - push {lr} - add ip, r1, r2 - lsl lr, r2, #1 - vld1.64 {d0}, [r1], lr - vld1.64 {d1}, [ip], lr + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 1: subs r3, r3, #2 \vhadd d4, d0, d1 - vld1.64 {d0}, [r1], lr + vld1.64 {d0}, [r1], r2 \vhadd d5, d0, d1 - vld1.64 {d1}, [ip], lr + vld1.64 {d1}, [r1], r2 pld [r1] - pld [ip] + pld [r1, r2] vst1.64 {d4}, [r0,:64], r2 vst1.64 {d5}, [r0,:64], r2 bne 1b - pop {pc} + bx lr .endm .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 - push {lr} - lsl lr, r2, #1 - add ip, r1, r2 - vld1.64 {d0, d1}, [r1], lr - vld1.64 {d2, d3}, [ip], lr + vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d2, d3}, [r1], r2 .if \no_rnd vmov.i16 q11, #1 .endif pld [r1] - pld [ip] + pld [r1, r2] vext.8 d4, d0, d1, #1 vext.8 d6, d2, d3, #1 vaddl.u8 q8, d0, d4 vaddl.u8 q9, d2, d6 1: subs r3, r3, #2 - vld1.64 {d0, d1}, [r1], lr + vld1.64 {d0, d1}, [r1], r2 pld [r1] vadd.u16 q10, q8, q9 vext.8 d4, d0, d1, #1 @@ -223,9 +211,9 @@ .endif vaddl.u8 q8, d0, d4 \vshrn d5, q10, #2 - vld1.64 {d2, d3}, [ip], lr + vld1.64 {d2, d3}, [r1], r2 vadd.u16 q10, q8, q9 - pld [ip] + pld [r1, r2] .if \no_rnd vadd.u16 q10, q10, q11 .endif @@ -235,7 +223,7 @@ vaddl.u8 q9, d2, d6 vst1.64 {d7}, [r0,:64], r2 bgt 1b - pop {pc} + bx lr .endm .macro pixfunc pfx name suf rnd_op args:vararg @@ -273,6 +261,112 @@ pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 +function ff_put_pixels_clamped_neon, export=1 + vld1.64 {d16-d19}, [r0,:128]! + vqmovun.s16 d0, q8 + vld1.64 {d20-d23}, [r0,:128]! + vqmovun.s16 d1, q9 + vld1.64 {d24-d27}, [r0,:128]! + vqmovun.s16 d2, q10 + vld1.64 {d28-d31}, [r0,:128]! + vqmovun.s16 d3, q11 + vst1.64 {d0}, [r1,:64], r2 + vqmovun.s16 d4, q12 + vst1.64 {d1}, [r1,:64], r2 + vqmovun.s16 d5, q13 + vst1.64 {d2}, [r1,:64], r2 + vqmovun.s16 d6, q14 + vst1.64 {d3}, [r1,:64], r2 + vqmovun.s16 d7, q15 + vst1.64 {d4}, [r1,:64], r2 + vst1.64 {d5}, [r1,:64], r2 + vst1.64 {d6}, [r1,:64], r2 + vst1.64 {d7}, [r1,:64], r2 + bx lr + .endfunc + +function ff_put_signed_pixels_clamped_neon, export=1 + vmov.u8 d31, #128 + vld1.64 {d16-d17}, [r0,:128]! + vqmovn.s16 d0, q8 + vld1.64 {d18-d19}, [r0,:128]! + vqmovn.s16 d1, q9 + vld1.64 {d16-d17}, [r0,:128]! + vqmovn.s16 d2, q8 + vld1.64 {d18-d19}, [r0,:128]! + vadd.u8 d0, d0, d31 + vld1.64 {d20-d21}, [r0,:128]! + vadd.u8 d1, d1, d31 + vld1.64 {d22-d23}, [r0,:128]! + vadd.u8 d2, d2, d31 + vst1.64 {d0}, [r1,:64], r2 + vqmovn.s16 d3, q9 + vst1.64 {d1}, [r1,:64], r2 + vqmovn.s16 d4, q10 + vst1.64 {d2}, [r1,:64], r2 + vqmovn.s16 d5, q11 + vld1.64 {d24-d25}, [r0,:128]! + vadd.u8 d3, d3, d31 + vld1.64 {d26-d27}, [r0,:128]! + vadd.u8 d4, d4, d31 + vadd.u8 d5, d5, d31 + vst1.64 {d3}, [r1,:64], r2 + vqmovn.s16 d6, q12 + vst1.64 {d4}, [r1,:64], r2 + vqmovn.s16 d7, q13 + vst1.64 {d5}, [r1,:64], r2 + vadd.u8 d6, d6, d31 + vadd.u8 d7, d7, d31 + vst1.64 {d6}, [r1,:64], r2 + vst1.64 {d7}, [r1,:64], r2 + bx lr + .endfunc + +function ff_add_pixels_clamped_neon, export=1 + mov r3, r1 + vld1.64 {d16}, [r1,:64], r2 + vld1.64 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vld1.64 {d17}, [r1,:64], r2 + vld1.64 {d2-d3}, [r0,:128]! + vqmovun.s16 d0, q0 + vld1.64 {d18}, [r1,:64], r2 + vaddw.u8 q1, q1, d17 + vld1.64 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.64 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.64 {d19}, [r1,:64], r2 + vld1.64 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vqmovun.s16 d4, q2 + vst1.64 {d2}, [r3,:64], r2 + vld1.64 {d16}, [r1,:64], r2 + vqmovun.s16 d6, q3 + vld1.64 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vst1.64 {d4}, [r3,:64], r2 + vld1.64 {d17}, [r1,:64], r2 + vld1.64 {d2-d3}, [r0,:128]! + vaddw.u8 q1, q1, d17 + vst1.64 {d6}, [r3,:64], r2 + vqmovun.s16 d0, q0 + vld1.64 {d18}, [r1,:64], r2 + vld1.64 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.64 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.64 {d19}, [r1,:64], r2 + vqmovun.s16 d4, q2 + vld1.64 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vst1.64 {d2}, [r3,:64], r2 + vqmovun.s16 d6, q3 + vst1.64 {d4}, [r3,:64], r2 + vst1.64 {d6}, [r3,:64], r2 + bx lr + .endfunc + function ff_float_to_int16_neon, export=1 subs r2, r2, #8 vld1.64 {d0-d1}, [r1,:128]! diff -Nurd ffmpeg.old/libavcodec/arm/simple_idct_neon.S ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S --- ffmpeg.old/libavcodec/arm/simple_idct_neon.S 2008-12-30 04:13:52.000000000 +0100 +++ ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S 2009-05-30 11:27:54.000000000 +0200 @@ -68,6 +68,19 @@ .text .align 6 +function idct_row4_pld_neon + pld [r0] + add r3, r0, r1, lsl #2 + pld [r0, r1] + pld [r0, r1, lsl #1] + pld [r3, -r1] + pld [r3] + pld [r3, r1] + add r3, r3, r1, lsl #1 + pld [r3] + pld [r3, r1] + .endfunc + function idct_row4_neon vmov.i32 q15, #(1<<(ROW_SHIFT-1)) vld1.64 {d2-d5}, [r2,:128]! @@ -252,7 +265,7 @@ function ff_simple_idct_put_neon, export=1 idct_start r2 - bl idct_row4_neon + bl idct_row4_pld_neon bl idct_row4_neon add r2, r2, #-128 bl idct_col4_neon @@ -307,7 +320,7 @@ function ff_simple_idct_add_neon, export=1 idct_start r2 - bl idct_row4_neon + bl idct_row4_pld_neon bl idct_row4_neon add r2, r2, #-128 bl idct_col4_neon diff -Nurd ffmpeg.old/libavcodec/arm/vp3dsp_neon.S ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S --- ffmpeg.old/libavcodec/arm/vp3dsp_neon.S 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S 2009-05-30 11:27:54.000000000 +0200 @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2009 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +.macro vp3_loop_filter + vsubl.u8 q3, d18, d17 + vsubl.u8 q2, d16, d19 + vadd.i16 q1, q3, q3 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q1, q2 + vrshr.s16 q0, q0, #3 + vmovl.u8 q9, d18 + vdup.u16 q15, r2 + + vabs.s16 q1, q0 + vshr.s16 q0, q0, #15 + vqsub.u16 q2, q15, q1 + vqsub.u16 q3, q2, q1 + vsub.i16 q1, q2, q3 + veor q1, q1, q0 + vsub.i16 q0, q1, q0 + + vaddw.u8 q2, q0, d17 + vsub.i16 q3, q9, q0 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q3 +.endm + +function ff_vp3_v_loop_filter_neon, export=1 + sub ip, r0, r1 + sub r0, r0, r1, lsl #1 + vld1.64 {d16}, [r0,:64], r1 + vld1.64 {d17}, [r0,:64], r1 + vld1.64 {d18}, [r0,:64], r1 + vld1.64 {d19}, [r0,:64], r1 + ldrb r2, [r2, #129*4] + + vp3_loop_filter + + vst1.64 {d0}, [ip,:64], r1 + vst1.64 {d1}, [ip,:64], r1 + bx lr +.endfunc + +function ff_vp3_h_loop_filter_neon, export=1 + sub ip, r0, #1 + sub r0, r0, #2 + vld1.32 {d16[]}, [r0], r1 + vld1.32 {d17[]}, [r0], r1 + vld1.32 {d18[]}, [r0], r1 + vld1.32 {d19[]}, [r0], r1 + vld1.32 {d16[1]}, [r0], r1 + vld1.32 {d17[1]}, [r0], r1 + vld1.32 {d18[1]}, [r0], r1 + vld1.32 {d19[1]}, [r0], r1 + ldrb r2, [r2, #129*4] + + vtrn.8 d16, d17 + vtrn.8 d18, d19 + vtrn.16 d16, d18 + vtrn.16 d17, d19 + + vp3_loop_filter + + vtrn.8 d0, d1 + + vst1.16 {d0[0]}, [ip], r1 + vst1.16 {d1[0]}, [ip], r1 + vst1.16 {d0[1]}, [ip], r1 + vst1.16 {d1[1]}, [ip], r1 + vst1.16 {d0[2]}, [ip], r1 + vst1.16 {d1[2]}, [ip], r1 + vst1.16 {d0[3]}, [ip], r1 + vst1.16 {d1[3]}, [ip], r1 + bx lr +.endfunc diff -Nurd ffmpeg.old/libavcodec/Makefile ffmpeg-0.5/libavcodec/Makefile --- ffmpeg.old/libavcodec/Makefile 2009-02-26 03:29:24.000000000 +0100 +++ ffmpeg-0.5/libavcodec/Makefile 2009-05-30 11:29:51.000000000 +0200 @@ -477,11 +477,15 @@ OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ arm/mpegvideo_iwmmxt.o \ +NEON-OBJS-$(CONFIG_THEORA_DECODER) += arm/vp3dsp_neon.o +NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o + OBJS-$(HAVE_NEON) += arm/dsputil_neon.o \ arm/dsputil_neon_s.o \ arm/h264dsp_neon.o \ arm/h264idct_neon.o \ arm/simple_idct_neon.o \ + $(NEON-OBJS-yes) OBJS-$(ARCH_BFIN) += bfin/dsputil_bfin.o \ bfin/fdct_bfin.o \