6b17b8c23afc80f3f299046f50e03914d47d3974
[theora.git] / lib / arm / armfrag.s
1 ;********************************************************************
2 ;*                                                                  *
3 ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4 ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5 ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7 ;*                                                                  *
8 ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
9 ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 ;*                                                                  *
11 ;********************************************************************
12 ; Original implementation:
13 ;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
14 ; last mod: $Id$
15 ;********************************************************************
16
17         AREA    |.text|, CODE, READONLY
18
19         GET     armopts.s
20
21 ; Vanilla ARM v4 versions
22         EXPORT  oc_frag_copy_list_arm
23         EXPORT  oc_frag_recon_intra_arm
24         EXPORT  oc_frag_recon_inter_arm
25         EXPORT  oc_frag_recon_inter2_arm
26
27 oc_frag_copy_list_arm
           ; Copy a list of 8x8-byte fragments from one frame to another.
           ; For each of the _nfragis indices in _fragis, the byte offset
           ;  _frag_buf_offs[index] is added to both _src_frame and _dst_frame
           ;  and 8 rows of 8 bytes (rows _ystride bytes apart) are copied
           ;  using word loads and stores.
           ; When _nfragis <= 0 the routine returns without reading _fragis or
           ;  _frag_buf_offs.
           ; NOTE(review): the word accesses presumably rely on every fragment
           ;  row being word aligned - confirm against the caller.
28         ; r0 = _dst_frame
29         ; r1 = _src_frame
30         ; r2 = _ystride
31         ; r3 = _fragis
32         ; <> = _nfragis
33         ; <> = _frag_buf_offs
34         LDR     r12,[r13]               ; r12 = _nfragis
35         STMFD   r13!,{r4-r6,r11,r14}    ; 5 words pushed: stack args are now 20 bytes up
36         SUBS    r12, r12, #1            ; r12 = fragments left after this one
37         LDRGE   r4,[r3],#4              ; r4 = _fragis[fragii] (only if the list is non-empty)
38         LDRGE   r14,[r13,#4*6]          ; r14 = _frag_buf_offs
39         BLT     ofcl_arm_end
40         SUB     r2, r2, #4              ; first word of each row already advances by 4
41 ofcl_arm_lp
42         LDR     r11,[r14,r4,LSL #2]     ; r11 = _frag_buf_offs[_fragis[fragii]]
43         SUBS    r12, r12, #1            ; flags stay live until the LDRGE/BGE below
44         ; Stall (on XScale)
45         ADD     r4, r1, r11             ; r4 = _src_frame+frag_buf_off
46         LDR     r6, [r4], #4
47         ADD     r11,r0, r11             ; r11 = _dst_frame+frag_buf_off
48         LDR     r5, [r4], r2
49         STR     r6, [r11],#4
50         LDR     r6, [r4], #4
51         STR     r5, [r11],r2
52         LDR     r5, [r4], r2
53         STR     r6, [r11],#4
54         LDR     r6, [r4], #4
55         STR     r5, [r11],r2
56         LDR     r5, [r4], r2
57         STR     r6, [r11],#4
58         LDR     r6, [r4], #4
59         STR     r5, [r11],r2
60         LDR     r5, [r4], r2
61         STR     r6, [r11],#4
62         LDR     r6, [r4], #4
63         STR     r5, [r11],r2
64         LDR     r5, [r4], r2
65         STR     r6, [r11],#4
66         LDR     r6, [r4], #4
67         STR     r5, [r11],r2
68         LDR     r5, [r4], r2
69         STR     r6, [r11],#4
70         LDR     r6, [r4], #4
71         STR     r5, [r11],r2
72         LDR     r5, [r4], r2
73         STR     r6, [r11],#4
74         LDR     r6, [r4], #4
75         STR     r5, [r11],r2
76         LDR     r5, [r4]                ; last word of row 8: no write-back needed
77         LDRGE   r4,[r3],#4              ; r4 = _fragis[fragii]
78         STR     r6, [r11],#4
79         STR     r5, [r11]
80         BGE     ofcl_arm_lp
81 ofcl_arm_end
82         LDMFD   r13!,{r4-r6,r11,PC}     ; restore registers and return
83 oc_frag_recon_intra_arm
           ; Reconstructs an intra-coded 8x8 block: every output byte is the
           ;  matching _residue value plus 128, clamped to the range 0..255.
           ; _residue is read as 64 consecutive signed halfwords; output rows
           ;  are written _ystride bytes apart starting at _dst.
84         ; r0 =       unsigned char *_dst
85         ; r1 =       int            _ystride
86         ; r2 = const ogg_int16_t    _residue[64]
87         STMFD   r13!,{r4,r5,r14}
88         MOV     r14,#8                  ; r14 = rows remaining
89         MOV     r5, #255                ; r5 = largest pixel value
90         SUB     r1, r1, #7              ; 7 of the 8 stores per row advance _dst by 1
91 ofrintra_lp_arm
92         LDRSH   r3, [r2], #2
93         LDRSH   r4, [r2], #2
94         LDRSH   r12,[r2], #2
95         ADDS    r3, r3, #128            ; r3 = residue+128, flags set on the sum
96         CMPGT   r5, r3                  ; if the sum is positive, compare 255 against it
97         EORLT   r3, r5, r3, ASR #32     ; sum<0: low byte becomes 0; sum>255: 255
98         STRB    r3, [r0], #1
99         ADDS    r4, r4, #128            ; same clamp sequence for each remaining pixel
100         CMPGT   r5, r4
101         EORLT   r4, r5, r4, ASR #32
102         LDRSH   r3, [r2], #2
103         STRB    r4, [r0], #1
104         ADDS    r12,r12,#128
105         CMPGT   r5, r12
106         EORLT   r12,r5, r12,ASR #32
107         LDRSH   r4, [r2], #2
108         STRB    r12,[r0], #1
109         ADDS    r3, r3, #128
110         CMPGT   r5, r3
111         EORLT   r3, r5, r3, ASR #32
112         LDRSH   r12,[r2], #2
113         STRB    r3, [r0], #1
114         ADDS    r4, r4, #128
115         CMPGT   r5, r4
116         EORLT   r4, r5, r4, ASR #32
117         LDRSH   r3, [r2], #2
118         STRB    r4, [r0], #1
119         ADDS    r12,r12,#128
120         CMPGT   r5, r12
121         EORLT   r12,r5, r12,ASR #32
122         LDRSH   r4, [r2], #2
123         STRB    r12,[r0], #1
124         ADDS    r3, r3, #128
125         CMPGT   r5, r3
126         EORLT   r3, r5, r3, ASR #32
127         STRB    r3, [r0], #1
128         ADDS    r4, r4, #128
129         CMPGT   r5, r4
130         EORLT   r4, r5, r4, ASR #32
131         STRB    r4, [r0], r1            ; 8th pixel: step _dst to the start of the next row
132         SUBS    r14,r14,#1
133         BGT     ofrintra_lp_arm
134         LDMFD   r13!,{r4,r5,PC}         ; restore registers and return
135
136 oc_frag_recon_inter_arm
            ; Reconstructs an inter-coded 8x8 block from one predictor: every
            ;  output byte is the src byte plus the matching residue value,
            ;  clamped to the range 0..255.
            ; src and dst rows are both ystride bytes apart; residue is read as
            ;  64 consecutive signed halfwords.
137         ; r0 =       unsigned char *dst
138         ; r1 = const unsigned char *src
139         ; r2 =       int            ystride
140         ; r3 = const ogg_int16_t    residue[64]
141         STMFD   r13!,{r5,r9-r11,r14}
142         MOV     r9, #8                  ; r9 = rows remaining
143         MOV     r5, #255                ; r5 = largest pixel value
144         SUB     r2, r2, #7              ; 7 of the 8 accesses per row advance by 1
145 ofrinter_lp_arm
146         LDRSH   r12,[r3], #2
147         LDRB    r14,[r1], #1
148         LDRSH   r11,[r3], #2
149         LDRB    r10,[r1], #1
150         ADDS    r12,r12,r14             ; r12 = residue+src, flags set on the sum
151         CMPGT   r5, r12                 ; if the sum is positive, compare 255 against it
152         EORLT   r12,r5, r12,ASR #32     ; sum<0: low byte becomes 0; sum>255: 255
153         STRB    r12,[r0], #1
154         ADDS    r11,r11,r10
155         CMPGT   r5, r11
156         LDRSH   r12,[r3], #2            ; loads do not alter the flags used by EORLT below
157         LDRB    r14,[r1], #1
158         EORLT   r11,r5, r11,ASR #32
159         STRB    r11,[r0], #1
160         ADDS    r12,r12,r14
161         CMPGT   r5, r12
162         LDRSH   r11,[r3], #2
163         LDRB    r10,[r1], #1
164         EORLT   r12,r5, r12,ASR #32
165         STRB    r12,[r0], #1
166         ADDS    r11,r11,r10
167         CMPGT   r5, r11
168         LDRSH   r12,[r3], #2
169         LDRB    r14,[r1], #1
170         EORLT   r11,r5, r11,ASR #32
171         STRB    r11,[r0], #1
172         ADDS    r12,r12,r14
173         CMPGT   r5, r12
174         LDRSH   r11,[r3], #2
175         LDRB    r10,[r1], #1
176         EORLT   r12,r5, r12,ASR #32
177         STRB    r12,[r0], #1
178         ADDS    r11,r11,r10
179         CMPGT   r5, r11
180         LDRSH   r12,[r3], #2
181         LDRB    r14,[r1], #1
182         EORLT   r11,r5, r11,ASR #32
183         STRB    r11,[r0], #1
184         ADDS    r12,r12,r14
185         CMPGT   r5, r12
186         LDRSH   r11,[r3], #2
187         LDRB    r10,[r1], r2            ; 8th src byte: step src to the next row
188         EORLT   r12,r5, r12,ASR #32
189         STRB    r12,[r0], #1
190         ADDS    r11,r11,r10
191         CMPGT   r5, r11
192         EORLT   r11,r5, r11,ASR #32
193         STRB    r11,[r0], r2            ; 8th dst byte: step dst to the next row
194         SUBS    r9, r9, #1
195         BGT     ofrinter_lp_arm
196         LDMFD   r13!,{r5,r9-r11,PC}     ; restore registers and return
197
198 oc_frag_recon_inter2_arm
            ; Reconstructs an inter-coded 8x8 block from two predictors: every
            ;  output byte is ((src1 byte + src2 byte)>>1) plus the matching
            ;  residue value, clamped to the range 0..255.
            ; src1, src2 and dst rows are all ystride bytes apart; the residue
            ;  pointer is the first stack argument.
199         ; r0 =       unsigned char *dst
200         ; r1 = const unsigned char *src1
201         ; r2 = const unsigned char *src2
202         ; r3 =       int            ystride
203         LDR     r12,[r13]               ; fetched before the push moves r13
204         ; r12= const ogg_int16_t    residue[64]
205         STMFD   r13!,{r4-r8,r14}
206         MOV     r14,#8                  ; r14 = rows remaining
207         MOV     r8, #255                ; r8 = largest pixel value
208         SUB     r3, r3, #7              ; 7 of the 8 accesses per row advance by 1
209 ofrinter2_lp_arm
210         LDRB    r5, [r1], #1
211         LDRB    r6, [r2], #1
212         LDRSH   r4, [r12],#2
213         LDRB    r7, [r1], #1
214         ADD     r5, r5, r6              ; r5 = src1+src2
215         ADDS    r5, r4, r5, LSR #1      ; r5 = residue+((src1+src2)>>1), flags set
216         CMPGT   r8, r5                  ; if the sum is positive, compare 255 against it
217         LDRB    r6, [r2], #1
218         LDRSH   r4, [r12],#2
219         EORLT   r5, r8, r5, ASR #32     ; sum<0: low byte becomes 0; sum>255: 255
220         STRB    r5, [r0], #1
221         ADD     r7, r7, r6
222         ADDS    r7, r4, r7, LSR #1
223         CMPGT   r8, r7
224         LDRB    r5, [r1], #1
225         LDRB    r6, [r2], #1
226         LDRSH   r4, [r12],#2
227         EORLT   r7, r8, r7, ASR #32
228         STRB    r7, [r0], #1
229         ADD     r5, r5, r6
230         ADDS    r5, r4, r5, LSR #1
231         CMPGT   r8, r5
232         LDRB    r7, [r1], #1
233         LDRB    r6, [r2], #1
234         LDRSH   r4, [r12],#2
235         EORLT   r5, r8, r5, ASR #32
236         STRB    r5, [r0], #1
237         ADD     r7, r7, r6
238         ADDS    r7, r4, r7, LSR #1
239         CMPGT   r8, r7
240         LDRB    r5, [r1], #1
241         LDRB    r6, [r2], #1
242         LDRSH   r4, [r12],#2
243         EORLT   r7, r8, r7, ASR #32
244         STRB    r7, [r0], #1
245         ADD     r5, r5, r6
246         ADDS    r5, r4, r5, LSR #1
247         CMPGT   r8, r5
248         LDRB    r7, [r1], #1
249         LDRB    r6, [r2], #1
250         LDRSH   r4, [r12],#2
251         EORLT   r5, r8, r5, ASR #32
252         STRB    r5, [r0], #1
253         ADD     r7, r7, r6
254         ADDS    r7, r4, r7, LSR #1
255         CMPGT   r8, r7
256         LDRB    r5, [r1], #1
257         LDRB    r6, [r2], #1
258         LDRSH   r4, [r12],#2
259         EORLT   r7, r8, r7, ASR #32
260         STRB    r7, [r0], #1
261         ADD     r5, r5, r6
262         ADDS    r5, r4, r5, LSR #1
263         CMPGT   r8, r5
264         LDRB    r7, [r1], r3            ; 8th src1 byte: step src1 to the next row
265         LDRB    r6, [r2], r3            ; 8th src2 byte: step src2 to the next row
266         LDRSH   r4, [r12],#2
267         EORLT   r5, r8, r5, ASR #32
268         STRB    r5, [r0], #1
269         ADD     r7, r7, r6
270         ADDS    r7, r4, r7, LSR #1
271         CMPGT   r8, r7
272         EORLT   r7, r8, r7, ASR #32
273         STRB    r7, [r0], r3            ; 8th dst byte: step dst to the next row
274         SUBS    r14,r14,#1
275         BGT     ofrinter2_lp_arm
276         LDMFD   r13!,{r4-r8,PC}         ; restore registers and return
277
278  [ OC_ARM_ASM_EDSP
279         EXPORT  oc_frag_copy_list_edsp
280
281 oc_frag_copy_list_edsp
            ; Copy a list of 8x8-byte fragments from one frame to another using
            ;  doubleword (LDRD/STRD) transfers, one per 8-byte row.
            ; For each of the _nfragis indices in _fragis, the byte offset
            ;  _frag_buf_offs[index] is added to both _src_frame and _dst_frame
            ;  and 8 rows, _ystride bytes apart, are copied.
            ; When _nfragis <= 0 the routine returns without reading _fragis or
            ;  _frag_buf_offs.
            ; NOTE(review): presumably every fragment row must satisfy the
            ;  alignment LDRD/STRD require on the target core - confirm.
282         ; r0 = _dst_frame
283         ; r1 = _src_frame
284         ; r2 = _ystride
285         ; r3 = _fragis
286         ; <> = _nfragis
287         ; <> = _frag_buf_offs
288         LDR     r12,[r13]               ; r12 = _nfragis
289         STMFD   r13!,{r4-r11,r14}       ; 9 words pushed: stack args are now 36 bytes up
290         SUBS    r12, r12, #1            ; r12 = fragments left after this one
291         LDRGE   r5, [r3],#4             ; r5 = _fragis[fragii]
292         LDRGE   r14,[r13,#4*10]         ; r14 = _frag_buf_offs
293         BLT     ofcl_edsp_end
294 ofcl_edsp_lp
295         MOV     r4, r1
296         LDR     r5, [r14,r5, LSL #2]    ; r5 = _frag_buf_offs[_fragis[fragii]]
297         SUBS    r12, r12, #1            ; flags stay live until the LDRGE/BGE below
298         ; Stall (on XScale)
299         LDRD    r6, [r4, r5]!           ; r4 = _src_frame+frag_buf_off
300         LDRD    r8, [r4, r2]!
301         ; Stall
302         STRD    r6, [r5, r0]!           ; r5 = _dst_frame+frag_buf_off
303         STRD    r8, [r5, r2]!
304         ; Stall
305         LDRD    r6, [r4, r2]!   ; On Xscale at least, doing 3 consecutive
306         LDRD    r8, [r4, r2]!   ; loads causes a stall, but that's no worse
307         LDRD    r10,[r4, r2]!   ; than us only doing 2, and having to do
308                                 ; another pair of LDRD/STRD later on.
309         ; Stall
310         STRD    r6, [r5, r2]!
311         STRD    r8, [r5, r2]!
312         STRD    r10,[r5, r2]!
313         LDRD    r6, [r4, r2]!           ; rows 6-8 (2+3+3 rows in total)
314         LDRD    r8, [r4, r2]!
315         LDRD    r10,[r4, r2]!
316         STRD    r6, [r5, r2]!
317         STRD    r8, [r5, r2]!
318         STRD    r10,[r5, r2]!
319         LDRGE   r5, [r3],#4             ; r5 = _fragis[fragii]
320         BGE     ofcl_edsp_lp
321 ofcl_edsp_end
322         LDMFD   r13!,{r4-r11,PC}        ; restore registers and return
323  ]
324
325  [ OC_ARM_ASM_MEDIA
326         EXPORT  oc_frag_recon_intra_v6
327         EXPORT  oc_frag_recon_inter_v6
328         EXPORT  oc_frag_recon_inter2_v6
329
330 oc_frag_recon_intra_v6
            ; Reconstructs an intra-coded 8x8 block with the ARMv6 media
            ;  instructions: every output byte is the matching _residue value
            ;  plus 128, clamped to the range 0..255.
            ; One row (8 residues in, 8 bytes out) is handled per iteration;
            ;  output rows are written with STRD, _ystride bytes apart.
            ; In the lane diagrams below each digit pair names the pixel a byte
            ;  belongs to, most significant byte first; "__" is a zero byte.
331         ; r0 =       unsigned char *_dst
332         ; r1 =       int            _ystride
333         ; r2 = const ogg_int16_t    _residue[64]
334         STMFD   r13!,{r4-r6,r14}
335         MOV     r14,#8                  ; r14 = rows remaining
336         MOV     r12,r2                  ; r12 = _residue, freeing r2/r3 as an LDRD pair
337         LDR     r6, =0x00800080         ; 128 in both halfwords
338 ofrintra_v6_lp
339         LDRD    r2, [r12],#8    ; r2 = 11110000 r3 = 33332222
340         LDRD    r4, [r12],#8    ; r4 = 55554444 r5 = 77776666
341         SUBS    r14,r14,#1              ; flags stay live until the BGT below
342         QADD16  r2, r2, r6              ; saturating +128 on each halfword
343         QADD16  r3, r3, r6
344         QADD16  r4, r4, r6
345         QADD16  r5, r5, r6
346         USAT16  r2, #8, r2              ; r2 = __11__00
347         USAT16  r3, #8, r3              ; r3 = __33__22
348         USAT16  r4, #8, r4              ; r4 = __55__44
349         USAT16  r5, #8, r5              ; r5 = __77__66
350         ORR     r2, r2, r2, LSR #8      ; r2 = __111100
351         ORR     r3, r3, r3, LSR #8      ; r3 = __333322
352         ORR     r4, r4, r4, LSR #8      ; r4 = __555544
353         ORR     r5, r5, r5, LSR #8      ; r5 = __777766
354         PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
355         PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
356         STRD    r2, [r0], r1            ; write the row, step _dst by _ystride
357         BGT     ofrintra_v6_lp
358         LDMFD   r13!,{r4-r6,PC}         ; restore registers and return
359
360 oc_frag_recon_inter_v6
            ; Reconstructs an inter-coded 8x8 block from one predictor with the
            ;  ARMv6 media instructions: every output byte is the _src byte plus
            ;  the matching _residue value, clamped to the range 0..255.
            ; One row is handled per iteration; _src and _dst rows are both
            ;  _ystride bytes apart and each output row is written with STRD.
            ; OC_ARM_CAN_UNALIGN_LDRD selects a single LDRD or two LDRs for the
            ;  8 source bytes.
            ; In the lane diagrams below each digit pair names the pixel a byte
            ;  belongs to, most significant byte first; "__" is a zero byte and
            ;  "xx" a not-yet-saturated high byte.
361         ; r0 =       unsigned char *_dst
362         ; r1 = const unsigned char *_src
363         ; r2 =       int            _ystride
364         ; r3 = const ogg_int16_t    _residue[64]
365         STMFD   r13!,{r4-r7,r14}
366         MOV     r14,#8                  ; r14 = rows remaining
367 ofrinter_v6_lp
368         LDRD    r6, [r3], #8            ; r6 = 11110000 r7 = 33332222
369         SUBS    r14,r14,#1              ; flags stay live until the BGT below
370  [ OC_ARM_CAN_UNALIGN_LDRD
371         LDRD    r4, [r1], r2    ; Unaligned ; r4 = 33221100 r5 = 77665544
372  |
373         LDR     r5, [r1, #4]
374         LDR     r4, [r1], r2
375  ]
376         PKHBT   r12,r6, r7, LSL #16     ; r12= 22220000
377         PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
378         UXTB16  r6,r4                   ; r6 = __22__00
379         UXTB16  r4,r4, ROR #8           ; r4 = __33__11
380         QADD16  r12,r12,r6              ; r12= xx22xx00
381         QADD16  r4, r7, r4              ; r4 = xx33xx11
382         LDRD    r6, [r3], #8            ; r6 = 55554444 r7 = 77776666
383         USAT16  r4, #8, r4              ; r4 = __33__11
384         USAT16  r12,#8,r12              ; r12= __22__00
385         ORR     r4, r12,r4, LSL #8      ; r4 = 33221100
386         PKHBT   r12,r6, r7, LSL #16     ; r12= 66664444
387         PKHTB   r7, r7, r6, ASR #16     ; r7 = 77775555
388         UXTB16  r6,r5                   ; r6 = __66__44
389         UXTB16  r5,r5, ROR #8           ; r5 = __77__55
390         QADD16  r12,r12,r6              ; r12= xx66xx44
391         QADD16  r5, r7, r5              ; r5 = xx77xx55
392         USAT16  r12,#8, r12             ; r12= __66__44
393         USAT16  r5, #8, r5              ; r5 = __77__55
394         ORR     r5, r12,r5, LSL #8      ; r5 = 77665544
395         STRD    r4, [r0], r2            ; write the row, step _dst by _ystride
396         BGT     ofrinter_v6_lp
397         LDMFD   r13!,{r4-r7,PC}         ; restore registers and return
398
399 oc_frag_recon_inter2_v6
            ; Reconstructs an inter-coded 8x8 block from two predictors with the
            ;  ARMv6 media instructions: every output byte is the halved sum of
            ;  the _src1 and _src2 bytes (UHADD8) plus the matching _residue
            ;  value, clamped to the range 0..255.
            ; One row is handled per iteration, pixels 4-7 first and then
            ;  pixels 0-3; all three buffers use rows _ystride bytes apart and
            ;  each output row is written with STRD.
            ; In the lane diagrams below each digit pair names the pixel a byte
            ;  belongs to, most significant byte first; "__" is a zero byte and
            ;  "xx" a not-yet-saturated high byte.
400         ; r0 =       unsigned char *_dst
401         ; r1 = const unsigned char *_src1
402         ; r2 = const unsigned char *_src2
403         ; r3 =       int            _ystride
404         LDR     r12,[r13]               ; fetched before the push moves r13
405         ; r12= const ogg_int16_t    _residue[64]
406         STMFD   r13!,{r4-r9,r14}
407         MOV     r14,#8                  ; r14 = rows remaining
408 ofrinter2_v6_lp
409         LDRD    r6, [r12,#8]    ; r6 = 55554444 r7 = 77776666
410         SUBS    r14,r14,#1              ; flags stay live until the BGT below
411         LDR     r4, [r1, #4]    ; Unaligned     ; r4 = src1[1] = 77665544
412         LDR     r5, [r2, #4]    ; Unaligned     ; r5 = src2[1] = 77665544
413         PKHBT   r8, r6, r7, LSL #16     ; r8 = 66664444
414         PKHTB   r9, r7, r6, ASR #16     ; r9 = 77775555
415         UHADD8  r4, r4, r5      ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
416         UXTB16  r5, r4                  ; r5 = __66__44
417         UXTB16  r4, r4, ROR #8          ; r4 = __77__55
418         QADD16  r8, r8, r5              ; r8 = xx66xx44
419         QADD16  r9, r9, r4              ; r9 = xx77xx55
420         LDRD    r6,[r12],#16    ; r6 = 11110000 r7 = 33332222 ; step to the next residue row
421         USAT16  r8, #8, r8              ; r8 = __66__44
422         LDR     r4, [r1], r3    ; Unaligned     ; r4 = src1[0] = 33221100
423         USAT16  r9, #8, r9              ; r9 = __77__55
424         LDR     r5, [r2], r3    ; Unaligned     ; r5 = src2[0] = 33221100
425         ORR     r9, r8, r9, LSL #8      ; r9 = 77665544
426         PKHBT   r8, r6, r7, LSL #16     ; r8 = 22220000
427         UHADD8  r4, r4, r5      ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
428         PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
429         UXTB16  r5, r4                  ; r5 = __22__00
430         UXTB16  r4, r4, ROR #8          ; r4 = __33__11
431         QADD16  r8, r8, r5              ; r8 = xx22xx00
432         QADD16  r7, r7, r4              ; r7 = xx33xx11
433         USAT16  r8, #8, r8              ; r8 = __22__00
434         USAT16  r7, #8, r7              ; r7 = __33__11
435         ORR     r8, r8, r7, LSL #8      ; r8 = 33221100
436         STRD    r8, [r0], r3            ; write r8,r9 as the row, step _dst by _ystride
437         BGT     ofrinter2_v6_lp
438         LDMFD   r13!,{r4-r9,PC}         ; restore registers and return
439  ]
440
441  [ OC_ARM_ASM_NEON
442         EXPORT  oc_frag_copy_list_neon
443         EXPORT  oc_frag_recon_intra_neon
444         EXPORT  oc_frag_recon_inter_neon
445         EXPORT  oc_frag_recon_inter2_neon
446
447 oc_frag_copy_list_neon
            ; Copy a list of 8x8-byte fragments from one frame to another using
            ;  NEON: the 8 rows of a fragment are loaded into D0-D7 and then
            ;  stored, one 64-bit row per VLD1/VST1.
            ; For each of the _nfragis indices in _fragis, the byte offset
            ;  _frag_buf_offs[index] is added to both _src_frame and _dst_frame;
            ;  rows are _ystride bytes apart.
            ; While one fragment is copied, the offset of the next one is
            ;  fetched and PLD is issued on each of its 8 source rows.
            ; When _nfragis < 1 the routine returns without reading _fragis or
            ;  _frag_buf_offs.
            ; The @64 qualifiers assert 8-byte alignment of every row address.
448         ; r0 = _dst_frame
449         ; r1 = _src_frame
450         ; r2 = _ystride
451         ; r3 = _fragis
452         ; <> = _nfragis
453         ; <> = _frag_buf_offs
454         LDR     r12,[r13]               ; r12 = _nfragis
455         STMFD   r13!,{r4-r7,r14}        ; 5 words pushed: stack args are now 20 bytes up
456         CMP     r12, #1
457         LDRGE   r6, [r3]                ; r6 = _fragis[fragii]
458         LDRGE   r14,[r13,#4*6]          ; r14 = _frag_buf_offs
459         BLT     ofcl_neon_end
460         ; Stall (2 on Xscale)
461         LDR     r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
462         ; Stall (on XScale)
463         MOV     r7, r6                  ; Guarantee PLD points somewhere valid.
            ; NOTE(review): r7 is the raw offset here, not _src_frame+offset; it
            ;  is only used by the PLDs when no next fragment exists - confirm
            ;  that this is the intended fallback.
464 ofcl_neon_lp
465         ADD     r4, r1, r6              ; r4 = _src_frame+frag_buf_off
466         VLD1.64 {D0}, [r4@64], r2
467         ADD     r5, r0, r6              ; r5 = _dst_frame+frag_buf_off
468         VLD1.64 {D1}, [r4@64], r2
469         SUBS    r12, r12, #1            ; GT below means another fragment follows
470         VLD1.64 {D2}, [r4@64], r2
471         LDRGT   r6, [r3,#4]!            ; r6 = _fragis[fragii]
472         VLD1.64 {D3}, [r4@64], r2
473         LDRGT   r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
474         VLD1.64 {D4}, [r4@64], r2
475         ADDGT   r7, r1, r6              ; r7 = source address of the next fragment
476         VLD1.64 {D5}, [r4@64], r2
477         PLD     [r7]                    ; next fragment, row 0
478         VLD1.64 {D6}, [r4@64], r2
479         PLD     [r7, r2]                ; row 1
480         VLD1.64 {D7}, [r4@64]
481         PLD     [r7, r2, LSL #1]        ; row 2
482         VST1.64 {D0}, [r5@64], r2
483         ADDGT   r7, r7, r2, LSL #2      ; r7 advances 4 rows
484         VST1.64 {D1}, [r5@64], r2
485         PLD     [r7, -r2]               ; row 3
486         VST1.64 {D2}, [r5@64], r2
487         PLD     [r7]                    ; row 4
488         VST1.64 {D3}, [r5@64], r2
489         PLD     [r7, r2]                ; row 5
490         VST1.64 {D4}, [r5@64], r2
491         PLD     [r7, r2, LSL #1]        ; row 6
492         VST1.64 {D5}, [r5@64], r2
493         ADDGT   r7, r7, r2, LSL #2      ; r7 advances 4 more rows
494         VST1.64 {D6}, [r5@64], r2
495         PLD     [r7, -r2]               ; row 7
496         VST1.64 {D7}, [r5@64]
497         BGT     ofcl_neon_lp
498 ofcl_neon_end
499         LDMFD   r13!,{r4-r7,PC}         ; restore registers and return
500
501 oc_frag_recon_intra_neon
            ; Reconstructs an intra-coded 8x8 block with NEON: every output byte
            ;  is the matching _residue value plus 128, narrowed with unsigned
            ;  saturation to 0..255.
            ; All 64 residues are loaded at once into D16-D31 (Q8-Q15), one row
            ;  per Q register; output rows are stored _ystride bytes apart.
            ; Uses no stack and only r3, Q0 and Q8-Q15 as scratch.
            ; The @64 qualifiers assert 8-byte alignment of every _dst row.
502         ; r0 =       unsigned char *_dst
503         ; r1 =       int            _ystride
504         ; r2 = const ogg_int16_t    _residue[64]
505         MOV     r3, #128
506         VDUP.S16        Q0, r3          ; Q0 = 128 in each of its 8 halfword lanes
507         VLDMIA  r2,  {D16-D31}  ; D16= 3333222211110000 etc     ; 9(8) cycles
508         VQADD.S16       Q8, Q8, Q0      ; saturating residue+128, one row per Q register
509         VQADD.S16       Q9, Q9, Q0
510         VQADD.S16       Q10,Q10,Q0
511         VQADD.S16       Q11,Q11,Q0
512         VQADD.S16       Q12,Q12,Q0
513         VQADD.S16       Q13,Q13,Q0
514         VQADD.S16       Q14,Q14,Q0
515         VQADD.S16       Q15,Q15,Q0
516         VQMOVUN.S16     D16,Q8  ; D16= 7766554433221100         ; 1 cycle
517         VQMOVUN.S16     D17,Q9  ; D17= FFEEDDCCBBAA9988         ; 1 cycle
518         VQMOVUN.S16     D18,Q10 ; D18= NNMMLLKKJJIIHHGG         ; 1 cycle
519         VST1.64 {D16},[r0@64], r1
520         VQMOVUN.S16     D19,Q11 ; D19= VVUUTTSSRRQQPPOO         ; 1 cycle
521         VST1.64 {D17},[r0@64], r1
522         VQMOVUN.S16     D20,Q12 ; D20= ddccbbaaZZYYXXWW         ; 1 cycle
523         VST1.64 {D18},[r0@64], r1
524         VQMOVUN.S16     D21,Q13 ; D21= llkkjjiihhggffee         ; 1 cycle
525         VST1.64 {D19},[r0@64], r1
526         VQMOVUN.S16     D22,Q14 ; D22= ttssrrqqppoonnmm         ; 1 cycle
527         VST1.64 {D20},[r0@64], r1
528         VQMOVUN.S16     D23,Q15 ; D23= !!@@zzyyxxwwvvuu         ; 1 cycle
529         VST1.64 {D21},[r0@64], r1
530         VST1.64 {D22},[r0@64], r1
531         VST1.64 {D23},[r0@64], r1
532         MOV     PC,R14                  ; return
533
534 oc_frag_recon_inter_neon
            ; Reconstructs an inter-coded 8x8 block from one predictor with
            ;  NEON: every output byte is the _src byte plus the matching
            ;  _residue value, narrowed with unsigned saturation to 0..255.
            ; All 64 residues are loaded at once into D16-D31 (Q8-Q15), one row
            ;  per Q register; source rows are widened to 16 bits four at a
            ;  time in Q0-Q3. _src and _dst rows are both _ystride bytes apart.
            ; Uses no stack and only Q0-Q3 and Q8-Q15 as scratch.
            ; The @64 qualifiers assert 8-byte alignment of every _dst row; the
            ;  _src loads carry no alignment qualifier.
535         ; r0 =       unsigned char *_dst
536         ; r1 = const unsigned char *_src
537         ; r2 =       int            _ystride
538         ; r3 = const ogg_int16_t    _residue[64]
539         VLDMIA  r3, {D16-D31}   ; D16= 3333222211110000 etc     ; 9(8) cycles
540         VLD1.64 {D0}, [r1], r2          ; source rows 0-3 go to D0, D2, D4, D6
541         VLD1.64 {D2}, [r1], r2
542         VMOVL.U8        Q0, D0  ; Q0 = __77__66__55__44__33__22__11__00
543         VLD1.64 {D4}, [r1], r2
544         VMOVL.U8        Q1, D2  ; etc
545         VLD1.64 {D6}, [r1], r2
546         VMOVL.U8        Q2, D4
547         VMOVL.U8        Q3, D6
548         VQADD.S16       Q8, Q8, Q0      ; row 0: residue+src
549         VLD1.64 {D0}, [r1], r2          ; source rows 4-7 reuse D0, D2, D4, D6
550         VQADD.S16       Q9, Q9, Q1
551         VLD1.64 {D2}, [r1], r2
552         VQADD.S16       Q10,Q10,Q2
553         VLD1.64 {D4}, [r1], r2
554         VQADD.S16       Q11,Q11,Q3
555         VLD1.64 {D6}, [r1], r2
556         VMOVL.U8        Q0, D0
557         VMOVL.U8        Q1, D2
558         VMOVL.U8        Q2, D4
559         VMOVL.U8        Q3, D6
560         VQADD.S16       Q12,Q12,Q0
561         VQADD.S16       Q13,Q13,Q1
562         VQADD.S16       Q14,Q14,Q2
563         VQADD.S16       Q15,Q15,Q3
564         VQMOVUN.S16     D16,Q8          ; narrow each row to 8 unsigned bytes
565         VQMOVUN.S16     D17,Q9
566         VQMOVUN.S16     D18,Q10
567         VST1.64 {D16},[r0@64], r2
568         VQMOVUN.S16     D19,Q11
569         VST1.64 {D17},[r0@64], r2
570         VQMOVUN.S16     D20,Q12
571         VST1.64 {D18},[r0@64], r2
572         VQMOVUN.S16     D21,Q13
573         VST1.64 {D19},[r0@64], r2
574         VQMOVUN.S16     D22,Q14
575         VST1.64 {D20},[r0@64], r2
576         VQMOVUN.S16     D23,Q15
577         VST1.64 {D21},[r0@64], r2
578         VST1.64 {D22},[r0@64], r2
579         VST1.64 {D23},[r0@64], r2
580         MOV     PC,R14                  ; return
581
582 oc_frag_recon_inter2_neon
            ; Reconstructs an inter-coded 8x8 block from two predictors with
            ;  NEON: every output byte is the halved sum of the _src1 and _src2
            ;  bytes (VHADD.U8) plus the matching _residue value, narrowed with
            ;  unsigned saturation to 0..255.
            ; All 64 residues are loaded at once into D16-D31 (Q8-Q15), one row
            ;  per Q register. Source rows are averaged two at a time: _src1
            ;  rows land in Q0/Q1 and _src2 rows in Q2/Q3. All three buffers use
            ;  rows _ystride bytes apart; the residue pointer is the first
            ;  stack argument.
            ; Uses only r12, Q0-Q3 and Q8-Q15 as scratch and does not move r13.
            ; The @64 qualifiers assert 8-byte alignment of every _dst row; the
            ;  source loads carry no alignment qualifier.
583         ; r0 =       unsigned char *_dst
584         ; r1 = const unsigned char *_src1
585         ; r2 = const unsigned char *_src2
586         ; r3 =       int            _ystride
587         LDR     r12,[r13]
588         ; r12= const ogg_int16_t    _residue[64]
589         VLDMIA  r12,{D16-D31}
590         VLD1.64 {D0}, [r1], r3          ; src1 row 0
591         VLD1.64 {D4}, [r2], r3          ; src2 row 0
592         VLD1.64 {D1}, [r1], r3          ; src1 row 1
593         VLD1.64 {D5}, [r2], r3          ; src2 row 1
594         VHADD.U8        Q2, Q0, Q2      ; Q2 = FFEEDDCCBBAA99887766554433221100
595         VLD1.64 {D2}, [r1], r3          ; src1 row 2
596         VLD1.64 {D6}, [r2], r3          ; src2 row 2
597         VMOVL.U8        Q0, D4          ; Q0 = __77__66__55__44__33__22__11__00
598         VLD1.64 {D3}, [r1], r3          ; src1 row 3
599         VMOVL.U8        Q2, D5          ; etc
600         VLD1.64 {D7}, [r2], r3          ; src2 row 3
601         VHADD.U8        Q3, Q1, Q3      ; Q3 = averaged rows 2 and 3
602         VQADD.S16       Q8, Q8, Q0      ; row 0: residue+average
603         VQADD.S16       Q9, Q9, Q2      ; row 1
604         VLD1.64 {D0}, [r1], r3          ; src1 row 4 (Q0 is free again)
605         VMOVL.U8        Q1, D6
606         VLD1.64 {D4}, [r2], r3          ; src2 row 4 (Q2 is free again)
607         VMOVL.U8        Q3, D7
608         VLD1.64 {D1}, [r1], r3          ; src1 row 5
609         VQADD.S16       Q10,Q10,Q1      ; row 2
610         VLD1.64 {D5}, [r2], r3          ; src2 row 5
611         VQADD.S16       Q11,Q11,Q3      ; row 3
612         VLD1.64 {D2}, [r1], r3          ; src1 row 6
613         VHADD.U8        Q2, Q0, Q2      ; Q2 = averaged rows 4 and 5
614         VLD1.64 {D6}, [r2], r3          ; src2 row 6
615         VLD1.64 {D3}, [r1], r3          ; src1 row 7
616         VMOVL.U8        Q0, D4
617         VLD1.64 {D7}, [r2], r3          ; src2 row 7
618         VMOVL.U8        Q2, D5
619         VHADD.U8        Q3, Q1, Q3      ; Q3 = averaged rows 6 and 7
620         VQADD.S16       Q12,Q12,Q0      ; row 4
621         VQADD.S16       Q13,Q13,Q2      ; row 5
622         VMOVL.U8        Q1, D6
623         VMOVL.U8        Q3, D7
624         VQADD.S16       Q14,Q14,Q1      ; row 6
625         VQADD.S16       Q15,Q15,Q3      ; row 7
626         VQMOVUN.S16     D16,Q8          ; narrow each row to 8 unsigned bytes
627         VQMOVUN.S16     D17,Q9
628         VQMOVUN.S16     D18,Q10
629         VST1.64 {D16},[r0@64], r3
630         VQMOVUN.S16     D19,Q11
631         VST1.64 {D17},[r0@64], r3
632         VQMOVUN.S16     D20,Q12
633         VST1.64 {D18},[r0@64], r3
634         VQMOVUN.S16     D21,Q13
635         VST1.64 {D19},[r0@64], r3
636         VQMOVUN.S16     D22,Q14
637         VST1.64 {D20},[r0@64], r3
638         VQMOVUN.S16     D23,Q15
639         VST1.64 {D21},[r0@64], r3
640         VST1.64 {D22},[r0@64], r3
641         VST1.64 {D23},[r0@64], r3
642         MOV     PC,R14                  ; return
643  ]
644
645         END