Add a NEON version of oc_enc_quantize().
author Tim Terriberry <tterribe@xiph.org>
Mon, 13 Dec 2010 06:27:49 +0000 (06:27 +0000)
committer Tim Terriberry <tterribe@xiph.org>
Mon, 13 Dec 2010 06:27:49 +0000 (06:27 +0000)
svn path=/trunk/theora/; revision=17749

lib/Makefile.am
lib/arm/armenc.c [new file with mode: 0644]
lib/arm/armenc.h [new file with mode: 0644]
lib/arm/armenquant.s [new file with mode: 0644]
lib/encint.h

index 1844621..6344519 100644 (file)
@@ -3,14 +3,16 @@ AM_CFLAGS = $(OGG_CFLAGS) $(CAIRO_CFLAGS)
 
 EXTRA_DIST = \
        encoder_disabled.c \
+       arm/arm2gnu.pl \
+       arm/armopts.s.in \
        arm/armcpu.c \
        arm/armbits.h \
        arm/armbits.s \
        arm/armfrag.s \
        arm/armidct.s \
        arm/armint.h \
-       arm/armopts.s.in \
-       arm/arm2gnu.pl \
+       arm/armenc.h \
+       arm/armenquant.s \
        c64x/c64xint.h \
        c64x/c64xdec.h \
        x86/mmxfrag.c \
@@ -61,7 +63,9 @@ encoder_shared_x86_sources = \
 
 encoder_shared_x86_64_sources =
 
-encoder_uniq_arm_sources =
+encoder_uniq_arm_sources = \
+       armenquant-gnu.S \
+       arm/armenc.c
 
 if CPU_arm
 BUILT_SOURCES = \
@@ -268,12 +272,14 @@ CLEANFILES = \
        armfrag-gnu.S \
        armidct-gnu.S \
        armloop-gnu.S \
-       armopts-gnu.S
+       armopts-gnu.S \
+       armenquant-gnu.S
 
 # automake doesn't do dependency tracking for asm files, that I can tell
 armfrag-gnu.S: armopts-gnu.S
 armidct-gnu.S: armopts-gnu.S
 armloop-gnu.S: armopts-gnu.S
+armenquant-gnu.S: armopts-gnu.S
 
 # convert ARM asm to GNU as format
 %-gnu.S: $(srcdir)/arm/%.s
diff --git a/lib/arm/armenc.c b/lib/arm/armenc.c
new file mode 100644 (file)
index 0000000..3e66334
--- /dev/null
@@ -0,0 +1,54 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armenc.h"
+
+#if defined(OC_ARM_ASM)
+
+/*Installs the ARM-specific encoder acceleration hooks.
+  The portable C defaults are installed first by oc_enc_accel_init_c(), and
+   are then overridden by whatever the build enables and the CPU flags in
+   _enc->state.cpu_flags report as available.
+  All vtable assignments are guarded by OC_ENC_USE_VTABLE, the macro that
+   controls the encoder's vtable (as opposed to OC_STATE_USE_VTABLE, which
+   belongs to the shared decoder state).
+  _enc: The encoder context to initialize.*/
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc){
+  ogg_uint32_t cpu_flags;
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
+  /*TODO: Add ARMv4 functions here.*/
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_ENC_USE_VTABLE)
+    /*TODO: Add EDSP functions here.*/
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_ENC_USE_VTABLE)
+    /*TODO: Add Media functions here.*/
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_ENC_USE_VTABLE)
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_neon;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_neon;
+    _enc->opt_vtable.quantize=oc_enc_quantize_neon;
+#    endif
+    /*The NEON quantizer reads its table with 128-bit aligned loads, so the
+       table storage has to be 16-byte aligned.*/
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+  }
+#   endif
+#  endif
+# endif
+}
+#endif
diff --git a/lib/arm/armenc.h b/lib/arm/armenc.h
new file mode 100644 (file)
index 0000000..54bdaae
--- /dev/null
@@ -0,0 +1,44 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armenc_H)
+# define _arm_armenc_H (1)
+# include "armint.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_arm
+#  define OC_ENC_USE_VTABLE (1)
+# endif
+
+# include "../encint.h"
+
+# if defined(OC_ARM_ASM)
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc);
+
+#  if defined(OC_ARM_ASM_EDSP)
+#   if defined(OC_ARM_ASM_MEDIA)
+#    if defined(OC_ARM_ASM_NEON)
+/*NEON versions of the encoder quantization hooks.
+  The routines themselves are written in assembly (armenquant.s).*/
+/*Fills the quantization table at _enquant from the 64 entries of _dequant.*/
+void oc_enc_enquant_table_init_neon(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+/*Runs _nqis-1 passes copying selected entries between the tables referenced
+   by _enquant.*/
+void oc_enc_enquant_table_fixup_neon(void *_enquant[3][3][2],int _nqis);
+/*Quantizes the 64 coefficients of _dct into _qdct using _dequant and the
+   table at _enquant.
+  Return: The index of the last non-zero value stored in _qdct.*/
+int oc_enc_quantize_neon(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif
diff --git a/lib/arm/armenquant.s b/lib/arm/armenquant.s
new file mode 100644 (file)
index 0000000..09fb2bd
--- /dev/null
@@ -0,0 +1,164 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+;
+;********************************************************************
+
+       AREA    |.text|, CODE, READONLY
+
+       GET     armopts.s
+
+ [ OC_ARM_ASM_NEON
+       EXPORT  oc_enc_enquant_table_init_neon
+       EXPORT  oc_enc_enquant_table_fixup_neon
+       EXPORT  oc_enc_quantize_neon
+
+; Builds the quantization table used by oc_enc_quantize_neon.
+; The table is first filled in by oc_enc_enquant_table_init_c and then
+;  rearranged in place, 128 bytes at a time, into the row-wise layout the
+;  NEON quantizer loads.
+oc_enc_enquant_table_init_neon PROC
+       ; r0 = void               *_enquant
+       ; r1 = const ogg_uint16_t  _dequant[64]
+       ; Save the table pointer along with the return address so it can be
+       ;  reloaded after the call.
+       STMFD r13!,{r0,r14}
+       ; Initialize the table using the C routine
+       BLX     oc_enc_enquant_table_init_c
+       ; Pop the saved _enquant back into r0; the return address stays on the
+       ;  stack until the final LDR PC.
+       LDR     r0, [r13],#4
+       ; Two passes of 16 D registers (128 bytes) cover the whole table.
+       MOV     r1, #2
+       ; Now partially de-interleave it, so that the first row is all
+       ;  multipliers, the second row is all shift factors, etc.
+       ; Also, negate the shifts for use by VSHL.
+oeeti_neon_lp
+       ; The flags set here are the ones tested by the BNE at the bottom of
+       ;  the loop; none of the NEON instructions in between modify them.
+       SUBS    r1, r1, #1
+       VLDMIA          r0, {D16-D31}
+       ; Each VUZP.16 leaves the even halfwords (multipliers) in the first
+       ;  register and the odd halfwords (shifts) in the second.
+       VUZP.16         Q8, Q9
+       VNEG.S16        Q9, Q9
+       VUZP.16         Q10,Q11
+       VNEG.S16        Q11,Q11
+       VUZP.16         Q12,Q13
+       VNEG.S16        Q13,Q13
+       VUZP.16         Q14,Q15
+       VNEG.S16        Q15,Q15
+       ; Write the rearranged block back over the original and advance r0 to
+       ;  the next 128 bytes.
+       VSTMIA          r0!,{D16-D31}
+       BNE     oeeti_neon_lp
+       ; Return by popping the saved r14 directly into PC.
+       LDR     PC, [r13],#4
+       ENDP
+
+; Copies the quantizer entry for coefficient 0 from each table to the table
+;  for the next value of the second array index.
+; This is done for both values of the third index and all three values of the
+;  first index, over _nqis-1 passes, so the entry from second index 0 ends up
+;  in every table in use.
+; The tables use the layout produced by oc_enc_enquant_table_init_neon and
+;  read by oc_enc_quantize_neon: for each group of 8 coefficients, 16 bytes of
+;  multipliers followed by 16 bytes of (negated) shifts.
+; The multiplier for coefficient 0 is thus at byte offset 0 and its shift at
+;  byte offset 16 (offset 32 would be the multiplier for coefficient 8).
+oc_enc_enquant_table_fixup_neon PROC
+       ; r0 = void *_enquant[3][3][2]
+       ; r1 = int   _nqis
+       STR     r14, [r13,#-4]!
+oeetf_neon_lp1
+       ; Run _nqis-1 passes; _nqis must be at least 1.
+       SUBS    r1, r1, #1
+       BEQ     oeetf_neon_end1
+       ; r14 counts the three values of the first array index.
+       MOV     r14,#3
+oeetf_neon_lp2
+       ; Source table [i][q][0] -> destination table [i][q+1][0].
+       LDR     r2, [r0]
+       ; These flags are consumed by the BNE at the end of the inner loop;
+       ;  nothing in between modifies them.
+       SUBS    r14,r14,#1
+       LDRH    r3, [r2]
+       LDRH    r12,[r2,#16]
+       LDR     r2, [r0,#8]
+       STRH    r3, [r2]
+       STRH    r12,[r2,#16]
+       ; Source table [i][q][1] -> destination table [i][q+1][1].
+       LDR     r2, [r0,#4]
+       LDRH    r3, [r2]
+       LDRH    r12,[r2,#16]
+       LDR     r2, [r0,#12]
+       ; Step to the next value of the first index (3*2 pointers = 24 bytes).
+       ADD     r0, r0, #24
+       STRH    r3, [r2]
+       STRH    r12,[r2,#16]
+       BNE     oeetf_neon_lp2
+       ; Rewind the 72 bytes just traversed, less 8, so the next pass starts
+       ;  one step further along the second index.
+       SUB     r0, r0, #64
+       B       oeetf_neon_lp1
+oeetf_neon_end1
+       LDR     PC, [r13],#4
+       ENDP
+
+; Quantizes a block of 64 DCT coefficients, 16 at a time, and returns the
+;  index of the last non-zero quantized value.
+; All four pointers are accessed with a 128-bit alignment qualifier, so they
+;  must be 16-byte aligned.
+; The non-zero bitfield for each group of 16 coefficients is transferred from
+;  NEON to the ARM registers one iteration late: each pass of the loop
+;  finishes compacting the bitfield produced by the previous pass.
+; On the first pass r4, r5 and r12 hold no meaningful data; whatever they
+;  contribute is shifted out of r12/r14 before the final results are read.
+; The value returned in r0 is 31-CLZ of the inverted low 32 flags, replaced by
+;  63-CLZ of the inverted high 32 flags when any of those is non-zero.
+; If all 64 outputs are zero this evaluates to 31-32 = -1.
+oc_enc_quantize_neon PROC
+       ; r0 = ogg_int16_t        _qdct[64]
+       ; r1 = const ogg_int16_t  _dct[64]
+       ; r2 = const ogg_uint16_t _dequant[64]
+       ; r3 = const void        *_enquant
+       STMFD   r13!,{r4,r5,r14}
+       ; The loop counter goes in the high half of r14
+       ; It starts at -4 (0xFFFC) and counts up to 0, giving four iterations;
+       ;  the low half receives bitfield A as it is shifted along.
+       MOV     r14,#0xFFFCFFFF
+oeq_neon_lp
+       ; Load the next two rows of the data and the quant matrices.
+       VLD1.64         {D16,D17,D18,D19},[r1@128]!
+       VLD1.64         {D20,D21,D22,D23},[r2@128]!
+       ; Add in the signed rounding bias from the quantizers.
+       ; Note that the VHADD relies on the fact that the quantizers are all
+       ;  even (they're in fact multiples of four) in order to round correctly
+       ;  on the entries being negated.
+       ; Q0/Q1 = sign masks of the coefficients (0 or -1 in each lane).
+       VSHR.S16        Q0, Q8, #15
+       VSHR.S16        Q1, Q9, #15
+       ; Q12 = multipliers and Q13 = negated shifts for the first row.
+       VLD1.64         {D24,D25,D26,D27},[r3@128]!
+       VHADD.S16       Q10,Q0, Q10
+       VHADD.S16       Q11,Q1, Q11
+       ; Q14 = multipliers and Q15 = negated shifts for the second row.
+       VLD1.64         {D28,D29,D30,D31},[r3@128]!
+       ; Advance the loop counter; these flags are tested by the BLT at the
+       ;  bottom of the loop and nothing in between modifies them.
+       ADDS    r14,r14,#1<<16
+       VEOR.S16        Q10,Q0, Q10
+       VEOR.S16        Q11,Q1, Q11
+       VADD.S16        Q8, Q8, Q10
+       VADD.S16        Q9, Q9, Q11
+       ; Perform the actual division and save the result.
+       VQDMULH.S16     Q12,Q8, Q12
+       VQDMULH.S16     Q14,Q9, Q14
+       VADD.S16        Q8, Q8, Q8
+       VADD.S16        Q9, Q9, Q9
+       VADD.S16        Q8, Q8, Q12
+       VADD.S16        Q9, Q9, Q14
+       ; The shift counts were negated at table init, so VSHL shifts right.
+       VSHL.S16        Q8, Q13
+       VSHL.S16        Q9, Q15
+       ; Subtracting the sign mask adds one back to the negative entries.
+       VSUB.S16        Q8, Q8, Q0
+       VSUB.S16        Q9, Q9, Q1
+       VST1.64         {D16,D17,D18,D19},[r0@128]!
+       ; Now pull out a bitfield marking the non-zero coefficients.
+       ; Sadly, NEON has no PMOVMSKB; emulating it requires 7 instructions.
+       ; The saturating narrow keeps zero/non-zero status in one byte each.
+       VQMOVN.S16      D16,Q8
+       VQMOVN.S16      D17,Q9
+       ; After the compare and negate, a byte is 1 where the coefficient is
+       ;  zero and 0 elsewhere; the flags are inverted again after the loop.
+       VCEQ.S8         Q8, #0
+       VNEG.S8         Q8, Q8          ; D16=.......3.......2.......1.......0
+                                       ;     .......7.......6.......5.......4
+                                       ; D17=.......B.......A.......9.......8
+                                       ;     .......F.......E.......D.......C
+       VZIP.8          D16,D17         ; D16=.......9.......1.......8.......0
+                                       ;     .......B.......3.......A.......2
+                                       ; D17=.......D.......5.......C.......4
+                                       ;     .......F.......7.......E.......6
+       VSHL.U8         D17,D17,#4      ; D17=...D.......5.......C.......4....
+                                       ;     ...F.......7.......E.......6....
+       VORR            D16,D16,D17     ; D16=...D...9...5...1...C...8...4...0
+                                       ;     ...F...B...7...3...E...A...6...2
+       ; Shift over the bitfields from previous iterations and
+       ;  finish compacting the bitfield from the last iteration.
+       ; r4/r5 here still hold the D16 transferred by the previous pass.
+       ORR     r4, r5, LSL #2          ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
+       ORR     r4, r4, LSR #15         ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
+       PKHTB   r14,r14,r12,ASR #16     ; r14=i|A
+       PKHBT   r12,r4, r12,LSL #16     ; r12=B|C
+       ; Start the transfer of this pass's bitfield for the next pass to use.
+       VMOV            r4, r5, D16
+       BLT     oeq_neon_lp
+       ; Start with the low half while the NEON register transfers.
+       PKHBT   r0, r14,r12             ; r0 =B|A
+       ; Invert so that set bits mark non-zero coefficients, then convert the
+       ;  position of the highest set bit into an index.
+       MVN     r0, r0
+       CLZ     r0, r0
+       RSB     r0, r0, #31
+       ; Stall 8-10 more cycles waiting for the last transfer.
+       ORR     r4, r5, LSL #2          ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
+       ORR     r4, r4, LSR #15         ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
+       PKHBT   r1, r12,r4, LSL #16     ; r1 = D|C
+       ; If any of the upper 32 coefficients is non-zero, its index replaces
+       ;  the one computed from the lower 32.
+       MVNS    r1, r1
+       CLZNE   r1, r1
+       RSBNE   r0, r1, #63
+       LDMFD   r13!,{r4,r5,PC}
+       ENDP
+ ]
+
+       END
index e702c6b..1bde47d 100644 (file)
@@ -51,6 +51,9 @@ typedef struct oc_token_checkpoint    oc_token_checkpoint;
 #   include "x86/x86enc.h"
 #  endif
 # endif
+# if defined(OC_ARM_ASM)
+#  include "arm/armenc.h"
+# endif
 
 # if !defined(oc_enc_accel_init)
 #  define oc_enc_accel_init oc_enc_accel_init_c