Add PROC/ENDP markings to the ARM asm (currently ignored by the GNU toolchain).
[theora.git] / lib / arm / armfrag.s
1 ;********************************************************************
2 ;*                                                                  *
3 ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4 ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5 ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7 ;*                                                                  *
8 ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
9 ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 ;*                                                                  *
11 ;********************************************************************
12 ; Original implementation:
13 ;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
14 ; last mod: $Id$
15 ;********************************************************************
16
17         AREA    |.text|, CODE, READONLY
18
19         GET     armopts.s
20
21 ; Vanilla ARM v4 versions
22         EXPORT  oc_frag_copy_list_arm
23         EXPORT  oc_frag_recon_intra_arm
24         EXPORT  oc_frag_recon_inter_arm
25         EXPORT  oc_frag_recon_inter2_arm
26
27 oc_frag_copy_list_arm PROC              ; Copy each listed 8x8-byte fragment from _src_frame to _dst_frame
28         ; r0 = _dst_frame
29         ; r1 = _src_frame
30         ; r2 = _ystride
31         ; r3 = _fragis
32         ; <> = _nfragis                 (first stack argument)
33         ; <> = _frag_buf_offs           (second stack argument)
34         LDR     r12,[r13]               ; r12 = _nfragis
35         STMFD   r13!,{r4-r6,r11,r14}    ; 5 words pushed: stack args are now at r13+20 and r13+24
36         SUBS    r12, r12, #1
37         LDRGE   r4,[r3],#4              ; r4 = _fragis[fragii] (not read when the list is empty)
38         LDRGE   r14,[r13,#4*6]          ; r14 = _frag_buf_offs
39         BLT     ofcl_arm_end
40         SUB     r2, r2, #4              ; r2 = _ystride-4: the first word of each row post-increments by 4
41 ofcl_arm_lp
42         LDR     r11,[r14,r4,LSL #2]     ; r11 = _frag_buf_offs[_fragis[fragii]]
43         SUBS    r12, r12, #1            ; flags from here drive the LDRGE/BGE at the loop tail
44         ; Stall (on XScale)
45         ADD     r4, r1, r11             ; r4 = _src_frame+frag_buf_off
46         LDR     r6, [r4], #4            ; row 0, first word
47         ADD     r11,r0, r11             ; r11 = _dst_frame+frag_buf_off
48         LDR     r5, [r4], r2            ; row 0, second word; advance to the next row
49         STR     r6, [r11],#4
50         LDR     r6, [r4], #4
51         STR     r5, [r11],r2
52         LDR     r5, [r4], r2
53         STR     r6, [r11],#4
54         LDR     r6, [r4], #4
55         STR     r5, [r11],r2
56         LDR     r5, [r4], r2
57         STR     r6, [r11],#4
58         LDR     r6, [r4], #4
59         STR     r5, [r11],r2
60         LDR     r5, [r4], r2
61         STR     r6, [r11],#4
62         LDR     r6, [r4], #4
63         STR     r5, [r11],r2
64         LDR     r5, [r4], r2
65         STR     r6, [r11],#4
66         LDR     r6, [r4], #4
67         STR     r5, [r11],r2
68         LDR     r5, [r4], r2
69         STR     r6, [r11],#4
70         LDR     r6, [r4], #4
71         STR     r5, [r11],r2
72         LDR     r5, [r4], r2
73         STR     r6, [r11],#4
74         LDR     r6, [r4], #4
75         STR     r5, [r11],r2
76         LDR     r5, [r4]                ; row 7, second word (no further advance needed)
77         LDRGE   r4,[r3],#4              ; r4 = _fragis[fragii] (only if another fragment remains)
78         STR     r6, [r11],#4
79         STR     r5, [r11]
80         BGE     ofcl_arm_lp
81 ofcl_arm_end
82         LDMFD   r13!,{r4-r6,r11,PC}
           ENDP

83 oc_frag_recon_intra_arm PROC            ; dst = clamp(residue + 128, 0, 255) over 8 rows of 8 pixels
84         ; r0 =       unsigned char *_dst
85         ; r1 =       int            _ystride
86         ; r2 = const ogg_int16_t    _residue[64]
87         STMFD   r13!,{r4,r5,r14}
88         MOV     r14,#8                  ; r14 = rows remaining
89         MOV     r5, #255                ; r5 = upper clamp bound
90         SUB     r1, r1, #7              ; r1 = _ystride-7: seven 1-byte post-increments precede the row step
91 ofrintra_lp_arm
92         LDRSH   r3, [r2], #2
93         LDRSH   r4, [r2], #2
94         LDRSH   r12,[r2], #2
95         ADDS    r3, r3, #128            ; bias the residue; flags reflect the sum
96         CMPGT   r5, r3                  ; only if sum > 0: compare 255 against it
97         EORLT   r3, r5, r3, ASR #32     ; sum < 0 -> low byte 0; sum > 255 -> 255
98         STRB    r3, [r0], #1
99         ADDS    r4, r4, #128
100         CMPGT   r5, r4
101         EORLT   r4, r5, r4, ASR #32
102         LDRSH   r3, [r2], #2
103         STRB    r4, [r0], #1
104         ADDS    r12,r12,#128
105         CMPGT   r5, r12
106         EORLT   r12,r5, r12,ASR #32
107         LDRSH   r4, [r2], #2
108         STRB    r12,[r0], #1
109         ADDS    r3, r3, #128
110         CMPGT   r5, r3
111         EORLT   r3, r5, r3, ASR #32
112         LDRSH   r12,[r2], #2
113         STRB    r3, [r0], #1
114         ADDS    r4, r4, #128
115         CMPGT   r5, r4
116         EORLT   r4, r5, r4, ASR #32
117         LDRSH   r3, [r2], #2
118         STRB    r4, [r0], #1
119         ADDS    r12,r12,#128
120         CMPGT   r5, r12
121         EORLT   r12,r5, r12,ASR #32
122         LDRSH   r4, [r2], #2
123         STRB    r12,[r0], #1
124         ADDS    r3, r3, #128
125         CMPGT   r5, r3
126         EORLT   r3, r5, r3, ASR #32
127         STRB    r3, [r0], #1
128         ADDS    r4, r4, #128
129         CMPGT   r5, r4
130         EORLT   r4, r5, r4, ASR #32
131         STRB    r4, [r0], r1            ; last pixel of the row; step _dst to the next row
132         SUBS    r14,r14,#1
133         BGT     ofrintra_lp_arm
134         LDMFD   r13!,{r4,r5,PC}
135         ENDP
136
137 oc_frag_recon_inter_arm PROC            ; dst = clamp(src + residue, 0, 255) over 8 rows of 8 pixels
138         ; r0 =       unsigned char *dst
139         ; r1 = const unsigned char *src
140         ; r2 =       int            ystride
141         ; r3 = const ogg_int16_t    residue[64]
142         STMFD   r13!,{r5,r9-r11,r14}
143         MOV     r9, #8                  ; r9 = rows remaining
144         MOV     r5, #255                ; r5 = upper clamp bound
145         SUB     r2, r2, #7              ; r2 = ystride-7: seven 1-byte post-increments precede each row step
146 ofrinter_lp_arm
147         LDRSH   r12,[r3], #2
148         LDRB    r14,[r1], #1
149         LDRSH   r11,[r3], #2
150         LDRB    r10,[r1], #1
151         ADDS    r12,r12,r14             ; r12 = residue + src; flags reflect the sum
152         CMPGT   r5, r12                 ; only if sum > 0: compare 255 against it
153         EORLT   r12,r5, r12,ASR #32     ; sum < 0 -> low byte 0; sum > 255 -> 255
154         STRB    r12,[r0], #1
155         ADDS    r11,r11,r10
156         CMPGT   r5, r11
157         LDRSH   r12,[r3], #2            ; loads leave the flags intact for the EORLT below
158         LDRB    r14,[r1], #1
159         EORLT   r11,r5, r11,ASR #32
160         STRB    r11,[r0], #1
161         ADDS    r12,r12,r14
162         CMPGT   r5, r12
163         LDRSH   r11,[r3], #2
164         LDRB    r10,[r1], #1
165         EORLT   r12,r5, r12,ASR #32
166         STRB    r12,[r0], #1
167         ADDS    r11,r11,r10
168         CMPGT   r5, r11
169         LDRSH   r12,[r3], #2
170         LDRB    r14,[r1], #1
171         EORLT   r11,r5, r11,ASR #32
172         STRB    r11,[r0], #1
173         ADDS    r12,r12,r14
174         CMPGT   r5, r12
175         LDRSH   r11,[r3], #2
176         LDRB    r10,[r1], #1
177         EORLT   r12,r5, r12,ASR #32
178         STRB    r12,[r0], #1
179         ADDS    r11,r11,r10
180         CMPGT   r5, r11
181         LDRSH   r12,[r3], #2
182         LDRB    r14,[r1], #1
183         EORLT   r11,r5, r11,ASR #32
184         STRB    r11,[r0], #1
185         ADDS    r12,r12,r14
186         CMPGT   r5, r12
187         LDRSH   r11,[r3], #2
188         LDRB    r10,[r1], r2            ; last source pixel of the row; step src to the next row
189         EORLT   r12,r5, r12,ASR #32
190         STRB    r12,[r0], #1
191         ADDS    r11,r11,r10
192         CMPGT   r5, r11
193         EORLT   r11,r5, r11,ASR #32
194         STRB    r11,[r0], r2            ; last pixel of the row; step dst to the next row
195         SUBS    r9, r9, #1
196         BGT     ofrinter_lp_arm
197         LDMFD   r13!,{r5,r9-r11,PC}
198         ENDP
199
200 oc_frag_recon_inter2_arm PROC           ; dst = clamp(((src1 + src2) >> 1) + residue, 0, 255), 8 rows of 8 pixels
201         ; r0 =       unsigned char *dst
202         ; r1 = const unsigned char *src1
203         ; r2 = const unsigned char *src2
204         ; r3 =       int            ystride
205         LDR     r12,[r13]               ; first stack argument, read before the push moves r13
206         ; r12= const ogg_int16_t    residue[64]
207         STMFD   r13!,{r4-r8,r14}
208         MOV     r14,#8                  ; r14 = rows remaining
209         MOV     r8, #255                ; r8 = upper clamp bound
210         SUB     r3, r3, #7              ; r3 = ystride-7: seven 1-byte post-increments precede each row step
211 ofrinter2_lp_arm
212         LDRB    r5, [r1], #1
213         LDRB    r6, [r2], #1
214         LDRSH   r4, [r12],#2
215         LDRB    r7, [r1], #1
216         ADD     r5, r5, r6              ; r5 = src1 + src2
217         ADDS    r5, r4, r5, LSR #1      ; r5 = residue + ((src1 + src2) >> 1); flags reflect the sum
218         CMPGT   r8, r5                  ; only if sum > 0: compare 255 against it
219         LDRB    r6, [r2], #1            ; loads leave the flags intact for the EORLT below
220         LDRSH   r4, [r12],#2
221         EORLT   r5, r8, r5, ASR #32     ; sum < 0 -> low byte 0; sum > 255 -> 255
222         STRB    r5, [r0], #1
223         ADD     r7, r7, r6
224         ADDS    r7, r4, r7, LSR #1
225         CMPGT   r8, r7
226         LDRB    r5, [r1], #1
227         LDRB    r6, [r2], #1
228         LDRSH   r4, [r12],#2
229         EORLT   r7, r8, r7, ASR #32
230         STRB    r7, [r0], #1
231         ADD     r5, r5, r6
232         ADDS    r5, r4, r5, LSR #1
233         CMPGT   r8, r5
234         LDRB    r7, [r1], #1
235         LDRB    r6, [r2], #1
236         LDRSH   r4, [r12],#2
237         EORLT   r5, r8, r5, ASR #32
238         STRB    r5, [r0], #1
239         ADD     r7, r7, r6
240         ADDS    r7, r4, r7, LSR #1
241         CMPGT   r8, r7
242         LDRB    r5, [r1], #1
243         LDRB    r6, [r2], #1
244         LDRSH   r4, [r12],#2
245         EORLT   r7, r8, r7, ASR #32
246         STRB    r7, [r0], #1
247         ADD     r5, r5, r6
248         ADDS    r5, r4, r5, LSR #1
249         CMPGT   r8, r5
250         LDRB    r7, [r1], #1
251         LDRB    r6, [r2], #1
252         LDRSH   r4, [r12],#2
253         EORLT   r5, r8, r5, ASR #32
254         STRB    r5, [r0], #1
255         ADD     r7, r7, r6
256         ADDS    r7, r4, r7, LSR #1
257         CMPGT   r8, r7
258         LDRB    r5, [r1], #1
259         LDRB    r6, [r2], #1
260         LDRSH   r4, [r12],#2
261         EORLT   r7, r8, r7, ASR #32
262         STRB    r7, [r0], #1
263         ADD     r5, r5, r6
264         ADDS    r5, r4, r5, LSR #1
265         CMPGT   r8, r5
266         LDRB    r7, [r1], r3            ; last src1 pixel of the row; step src1 to the next row
267         LDRB    r6, [r2], r3            ; last src2 pixel of the row; step src2 to the next row
268         LDRSH   r4, [r12],#2
269         EORLT   r5, r8, r5, ASR #32
270         STRB    r5, [r0], #1
271         ADD     r7, r7, r6
272         ADDS    r7, r4, r7, LSR #1
273         CMPGT   r8, r7
274         EORLT   r7, r8, r7, ASR #32
275         STRB    r7, [r0], r3            ; last pixel of the row; step dst to the next row
276         SUBS    r14,r14,#1
277         BGT     ofrinter2_lp_arm
278         LDMFD   r13!,{r4-r8,PC}
279         ENDP
280
281  [ OC_ARM_ASM_EDSP
282         EXPORT  oc_frag_copy_list_edsp
283
284 oc_frag_copy_list_edsp PROC             ; Copy each listed 8x8-byte fragment, one doubleword LDRD/STRD per row
285         ; r0 = _dst_frame
286         ; r1 = _src_frame
287         ; r2 = _ystride
288         ; r3 = _fragis
289         ; <> = _nfragis                 (first stack argument)
290         ; <> = _frag_buf_offs           (second stack argument)
291         LDR     r12,[r13]               ; r12 = _nfragis
292         STMFD   r13!,{r4-r11,r14}       ; 9 words pushed: stack args are now at r13+36 and r13+40
293         SUBS    r12, r12, #1
294         LDRGE   r5, [r3],#4             ; r5 = _fragis[fragii]
295         LDRGE   r14,[r13,#4*10]         ; r14 = _frag_buf_offs
296         BLT     ofcl_edsp_end
297 ofcl_edsp_lp
298         MOV     r4, r1
299         LDR     r5, [r14,r5, LSL #2]    ; r5 = _frag_buf_offs[_fragis[fragii]]
300         SUBS    r12, r12, #1            ; flags from here drive the LDRGE/BGE at the loop tail
301         ; Stall (on XScale)
302         LDRD    r6, [r4, r5]!           ; r4 = _src_frame+frag_buf_off
303         LDRD    r8, [r4, r2]!
304         ; Stall
305         STRD    r6, [r5, r0]!           ; r5 = _dst_frame+frag_buf_off
306         STRD    r8, [r5, r2]!
307         ; Stall
308         LDRD    r6, [r4, r2]!   ; On Xscale at least, doing 3 consecutive
309         LDRD    r8, [r4, r2]!   ; loads causes a stall, but that's no worse
310         LDRD    r10,[r4, r2]!   ; than us only doing 2, and having to do
311                                 ; another pair of LDRD/STRD later on.
312         ; Stall
313         STRD    r6, [r5, r2]!
314         STRD    r8, [r5, r2]!
315         STRD    r10,[r5, r2]!
316         LDRD    r6, [r4, r2]!
317         LDRD    r8, [r4, r2]!
318         LDRD    r10,[r4, r2]!
319         STRD    r6, [r5, r2]!
320         STRD    r8, [r5, r2]!
321         STRD    r10,[r5, r2]!           ; eighth and last row of the fragment
322         LDRGE   r5, [r3],#4             ; r5 = _fragis[fragii]
323         BGE     ofcl_edsp_lp
324 ofcl_edsp_end
325         LDMFD   r13!,{r4-r11,PC}
326         ENDP
327  ]
328
329  [ OC_ARM_ASM_MEDIA
330         EXPORT  oc_frag_recon_intra_v6
331         EXPORT  oc_frag_recon_inter_v6
332         EXPORT  oc_frag_recon_inter2_v6
333
334 oc_frag_recon_intra_v6 PROC             ; dst = clamp(residue + 128, 0, 255), one 8-pixel row per iteration
335         ; r0 =       unsigned char *_dst
336         ; r1 =       int            _ystride
337         ; r2 = const ogg_int16_t    _residue[64]
338         STMFD   r13!,{r4-r6,r14}
339         MOV     r14,#8                  ; r14 = rows remaining
340         MOV     r12,r2                  ; r12 = _residue; frees r2/r3 as an LDRD/STRD register pair
341         LDR     r6, =0x00800080         ; r6 = 128 in each halfword
342 ofrintra_v6_lp
343         LDRD    r2, [r12],#8    ; r2 = 11110000 r3 = 33332222
344         LDRD    r4, [r12],#8    ; r4 = 55554444 r5 = 77776666
345         SUBS    r14,r14,#1              ; flags from here drive the BGT at the loop tail
346         QADD16  r2, r2, r6
347         QADD16  r3, r3, r6
348         QADD16  r4, r4, r6
349         QADD16  r5, r5, r6
350         USAT16  r2, #8, r2              ; r2 = __11__00
351         USAT16  r3, #8, r3              ; r3 = __33__22
352         USAT16  r4, #8, r4              ; r4 = __55__44
353         USAT16  r5, #8, r5              ; r5 = __77__66
354         ORR     r2, r2, r2, LSR #8      ; r2 = __111100
355         ORR     r3, r3, r3, LSR #8      ; r3 = __333322
356         ORR     r4, r4, r4, LSR #8      ; r4 = __555544
357         ORR     r5, r5, r5, LSR #8      ; r5 = __777766
358         PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
359         PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
360         STRD    r2, [r0], r1            ; store the 8-pixel row; step _dst to the next row
361         BGT     ofrintra_v6_lp
362         LDMFD   r13!,{r4-r6,PC}
363         ENDP
364
365 oc_frag_recon_inter_v6 PROC             ; dst = clamp(src + residue, 0, 255), one 8-pixel row per iteration
366         ; r0 =       unsigned char *_dst
367         ; r1 = const unsigned char *_src
368         ; r2 =       int            _ystride
369         ; r3 = const ogg_int16_t    _residue[64]
370         STMFD   r13!,{r4-r7,r14}
371         MOV     r14,#8                  ; r14 = rows remaining
372 ofrinter_v6_lp
373         LDRD    r6, [r3], #8            ; r6 = 11110000 r7 = 33332222
374         SUBS    r14,r14,#1              ; flags from here drive the BGT at the loop tail
375  [ OC_ARM_CAN_UNALIGN_LDRD
376         LDRD    r4, [r1], r2    ; Unaligned ; r4 = 33221100 r5 = 77665544
377  |
378         LDR     r5, [r1, #4]            ; r5 = 77665544
379         LDR     r4, [r1], r2            ; r4 = 33221100; step _src to the next row
380  ]
381         PKHBT   r12,r6, r7, LSL #16     ; r12= 22220000
382         PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
383         UXTB16  r6,r4                   ; r6 = __22__00
384         UXTB16  r4,r4, ROR #8           ; r4 = __33__11
385         QADD16  r12,r12,r6              ; r12= xx22xx00
386         QADD16  r4, r7, r4              ; r4 = xx33xx11
387         LDRD    r6, [r3], #8            ; r6 = 55554444 r7 = 77776666
388         USAT16  r4, #8, r4              ; r4 = __33__11
389         USAT16  r12,#8,r12              ; r12= __22__00
390         ORR     r4, r12,r4, LSL #8      ; r4 = 33221100
391         PKHBT   r12,r6, r7, LSL #16     ; r12= 66664444
392         PKHTB   r7, r7, r6, ASR #16     ; r7 = 77775555
393         UXTB16  r6,r5                   ; r6 = __66__44
394         UXTB16  r5,r5, ROR #8           ; r5 = __77__55
395         QADD16  r12,r12,r6              ; r12= xx66xx44
396         QADD16  r5, r7, r5              ; r5 = xx77xx55
397         USAT16  r12,#8, r12             ; r12= __66__44
398         USAT16  r5, #8, r5              ; r5 = __77__55
399         ORR     r5, r12,r5, LSL #8      ; r5 = 77665544
400         STRD    r4, [r0], r2            ; store the 8-pixel row; step _dst to the next row
401         BGT     ofrinter_v6_lp
402         LDMFD   r13!,{r4-r7,PC}
403         ENDP
404
405 oc_frag_recon_inter2_v6 PROC            ; dst = clamp(((src1 + src2) >> 1) + residue, 0, 255), one row per iteration
406         ; r0 =       unsigned char *_dst
407         ; r1 = const unsigned char *_src1
408         ; r2 = const unsigned char *_src2
409         ; r3 =       int            _ystride
410         LDR     r12,[r13]               ; first stack argument, read before the push moves r13
411         ; r12= const ogg_int16_t    _residue[64]
412         STMFD   r13!,{r4-r9,r14}
413         MOV     r14,#8                  ; r14 = rows remaining
414 ofrinter2_v6_lp
415         LDRD    r6, [r12,#8]    ; r6 = 55554444 r7 = 77776666
416         SUBS    r14,r14,#1              ; flags from here drive the BGT at the loop tail
417         LDR     r4, [r1, #4]    ; Unaligned     ; r4 = src1[1] = 77665544
418         LDR     r5, [r2, #4]    ; Unaligned     ; r5 = src2[1] = 77665544
419         PKHBT   r8, r6, r7, LSL #16     ; r8 = 66664444
420         PKHTB   r9, r7, r6, ASR #16     ; r9 = 77775555
421         UHADD8  r4, r4, r5      ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
422         UXTB16  r5, r4                  ; r5 = __66__44
423         UXTB16  r4, r4, ROR #8          ; r4 = __77__55
424         QADD16  r8, r8, r5              ; r8 = xx66xx44
425         QADD16  r9, r9, r4              ; r9 = xx77xx55
426         LDRD    r6,[r12],#16    ; r6 = 11110000 r7 = 33332222 ; advance _residue by one full row
427         USAT16  r8, #8, r8              ; r8 = __66__44
428         LDR     r4, [r1], r3    ; Unaligned     ; r4 = src1[0] = 33221100
429         USAT16  r9, #8, r9              ; r9 = __77__55
430         LDR     r5, [r2], r3    ; Unaligned     ; r5 = src2[0] = 33221100
431         ORR     r9, r8, r9, LSL #8      ; r9 = 77665544
432         PKHBT   r8, r6, r7, LSL #16     ; r8 = 22220000
433         UHADD8  r4, r4, r5      ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
434         PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
435         UXTB16  r5, r4                  ; r5 = __22__00
436         UXTB16  r4, r4, ROR #8          ; r4 = __33__11
437         QADD16  r8, r8, r5              ; r8 = xx22xx00
438         QADD16  r7, r7, r4              ; r7 = xx33xx11
439         USAT16  r8, #8, r8              ; r8 = __22__00
440         USAT16  r7, #8, r7              ; r7 = __33__11
441         ORR     r8, r8, r7, LSL #8      ; r8 = 33221100
442         STRD    r8, [r0], r3            ; store r8:r9 as the 8-pixel row; step _dst to the next row
443         BGT     ofrinter2_v6_lp
444         LDMFD   r13!,{r4-r9,PC}
445         ENDP
446  ]
447
448  [ OC_ARM_ASM_NEON
449         EXPORT  oc_frag_copy_list_neon
450         EXPORT  oc_frag_recon_intra_neon
451         EXPORT  oc_frag_recon_inter_neon
452         EXPORT  oc_frag_recon_inter2_neon
453
454 oc_frag_copy_list_neon PROC             ; Copy each listed 8x8-byte fragment via D0-D7, prefetching the next one
455         ; r0 = _dst_frame
456         ; r1 = _src_frame
457         ; r2 = _ystride
458         ; r3 = _fragis
459         ; <> = _nfragis                 (first stack argument)
460         ; <> = _frag_buf_offs           (second stack argument)
461         LDR     r12,[r13]               ; r12 = _nfragis
462         STMFD   r13!,{r4-r7,r14}        ; 5 words pushed: stack args are now at r13+20 and r13+24
463         CMP     r12, #1
464         LDRGE   r6, [r3]                ; r6 = _fragis[fragii]
465         LDRGE   r14,[r13,#4*6]          ; r14 = _frag_buf_offs
466         BLT     ofcl_neon_end
467         ; Stall (2 on Xscale)
468         LDR     r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
469         ; Stall (on XScale)
470         MOV     r7, r6                  ; Guarantee PLD points somewhere valid. NOTE(review): r7 is the raw offset here, not _src_frame+offset - confirm intent
471 ofcl_neon_lp
472         ADD     r4, r1, r6              ; r4 = _src_frame+frag_buf_off
473         VLD1.64 {D0}, [r4@64], r2
474         ADD     r5, r0, r6              ; r5 = _dst_frame+frag_buf_off
475         VLD1.64 {D1}, [r4@64], r2
476         SUBS    r12, r12, #1            ; flags from here drive every GT-conditional below, incl. the final BGT
477         VLD1.64 {D2}, [r4@64], r2
478         LDRGT   r6, [r3,#4]!            ; r6 = _fragis[fragii]
479         VLD1.64 {D3}, [r4@64], r2
480         LDRGT   r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
481         VLD1.64 {D4}, [r4@64], r2
482         ADDGT   r7, r1, r6              ; r7 = source address of the next fragment (prefetch pointer)
483         VLD1.64 {D5}, [r4@64], r2
484         PLD     [r7]
485         VLD1.64 {D6}, [r4@64], r2
486         PLD     [r7, r2]
487         VLD1.64 {D7}, [r4@64]
488         PLD     [r7, r2, LSL #1]
489         VST1.64 {D0}, [r5@64], r2
490         ADDGT   r7, r7, r2, LSL #2      ; move the prefetch pointer down four rows
491         VST1.64 {D1}, [r5@64], r2
492         PLD     [r7, -r2]
493         VST1.64 {D2}, [r5@64], r2
494         PLD     [r7]
495         VST1.64 {D3}, [r5@64], r2
496         PLD     [r7, r2]
497         VST1.64 {D4}, [r5@64], r2
498         PLD     [r7, r2, LSL #1]
499         VST1.64 {D5}, [r5@64], r2
500         ADDGT   r7, r7, r2, LSL #2
501         VST1.64 {D6}, [r5@64], r2
502         PLD     [r7, -r2]
503         VST1.64 {D7}, [r5@64]
504         BGT     ofcl_neon_lp
505 ofcl_neon_end
506         LDMFD   r13!,{r4-r7,PC}
507         ENDP
508
509 oc_frag_recon_intra_neon PROC           ; dst = saturate_u8(residue + 128) for the whole 8x8 block, no loop
510         ; r0 =       unsigned char *_dst
511         ; r1 =       int            _ystride
512         ; r2 = const ogg_int16_t    _residue[64]
513         MOV     r3, #128
514         VDUP.S16        Q0, r3          ; Q0 = 128 in each of its eight 16-bit lanes
515         VLDMIA  r2,  {D16-D31}  ; D16= 3333222211110000 etc     ; 9(8) cycles
516         VQADD.S16       Q8, Q8, Q0      ; one Q register per row: residue + 128, saturating
517         VQADD.S16       Q9, Q9, Q0
518         VQADD.S16       Q10,Q10,Q0
519         VQADD.S16       Q11,Q11,Q0
520         VQADD.S16       Q12,Q12,Q0
521         VQADD.S16       Q13,Q13,Q0
522         VQADD.S16       Q14,Q14,Q0
523         VQADD.S16       Q15,Q15,Q0
524         VQMOVUN.S16     D16,Q8  ; D16= 7766554433221100         ; 1 cycle
525         VQMOVUN.S16     D17,Q9  ; D17= FFEEDDCCBBAA9988         ; 1 cycle
526         VQMOVUN.S16     D18,Q10 ; D18= NNMMLLKKJJIIHHGG         ; 1 cycle
527         VST1.64 {D16},[r0@64], r1
528         VQMOVUN.S16     D19,Q11 ; D19= VVUUTTSSRRQQPPOO         ; 1 cycle
529         VST1.64 {D17},[r0@64], r1
530         VQMOVUN.S16     D20,Q12 ; D20= ddccbbaaZZYYXXWW         ; 1 cycle
531         VST1.64 {D18},[r0@64], r1
532         VQMOVUN.S16     D21,Q13 ; D21= llkkjjiihhggffee         ; 1 cycle
533         VST1.64 {D19},[r0@64], r1
534         VQMOVUN.S16     D22,Q14 ; D22= ttssrrqqppoonnmm         ; 1 cycle
535         VST1.64 {D20},[r0@64], r1
536         VQMOVUN.S16     D23,Q15 ; D23= !!@@zzyyxxwwvvuu         ; 1 cycle
537         VST1.64 {D21},[r0@64], r1
538         VST1.64 {D22},[r0@64], r1
539         VST1.64 {D23},[r0@64], r1
540         MOV     PC,R14                  ; return; nothing was pushed on the stack
541         ENDP
542
543 oc_frag_recon_inter_neon PROC           ; dst = saturate_u8(src + residue) for the whole 8x8 block, no loop
544         ; r0 =       unsigned char *_dst
545         ; r1 = const unsigned char *_src
546         ; r2 =       int            _ystride
547         ; r3 = const ogg_int16_t    _residue[64]
548         VLDMIA  r3, {D16-D31}   ; D16= 3333222211110000 etc     ; 9(8) cycles
549         VLD1.64 {D0}, [r1], r2          ; source rows are loaded without an alignment qualifier
550         VLD1.64 {D2}, [r1], r2
551         VMOVL.U8        Q0, D0  ; Q0 = __77__66__55__44__33__22__11__00
552         VLD1.64 {D4}, [r1], r2
553         VMOVL.U8        Q1, D2  ; etc
554         VLD1.64 {D6}, [r1], r2
555         VMOVL.U8        Q2, D4
556         VMOVL.U8        Q3, D6
557         VQADD.S16       Q8, Q8, Q0      ; rows 0-3: residue + widened source, saturating
558         VLD1.64 {D0}, [r1], r2          ; begin loading source rows 4-7 into the now-free D0/D2/D4/D6
559         VQADD.S16       Q9, Q9, Q1
560         VLD1.64 {D2}, [r1], r2
561         VQADD.S16       Q10,Q10,Q2
562         VLD1.64 {D4}, [r1], r2
563         VQADD.S16       Q11,Q11,Q3
564         VLD1.64 {D6}, [r1], r2
565         VMOVL.U8        Q0, D0
566         VMOVL.U8        Q1, D2
567         VMOVL.U8        Q2, D4
568         VMOVL.U8        Q3, D6
569         VQADD.S16       Q12,Q12,Q0      ; rows 4-7
570         VQADD.S16       Q13,Q13,Q1
571         VQADD.S16       Q14,Q14,Q2
572         VQADD.S16       Q15,Q15,Q3
573         VQMOVUN.S16     D16,Q8          ; narrow each row to unsigned bytes with saturation
574         VQMOVUN.S16     D17,Q9
575         VQMOVUN.S16     D18,Q10
576         VST1.64 {D16},[r0@64], r2
577         VQMOVUN.S16     D19,Q11
578         VST1.64 {D17},[r0@64], r2
579         VQMOVUN.S16     D20,Q12
580         VST1.64 {D18},[r0@64], r2
581         VQMOVUN.S16     D21,Q13
582         VST1.64 {D19},[r0@64], r2
583         VQMOVUN.S16     D22,Q14
584         VST1.64 {D20},[r0@64], r2
585         VQMOVUN.S16     D23,Q15
586         VST1.64 {D21},[r0@64], r2
587         VST1.64 {D22},[r0@64], r2
588         VST1.64 {D23},[r0@64], r2
589         MOV     PC,R14                  ; return; nothing was pushed on the stack
590         ENDP
591
592 oc_frag_recon_inter2_neon PROC          ; dst = saturate_u8(((src1 + src2) >> 1) + residue), whole 8x8 block
593         ; r0 =       unsigned char *_dst
594         ; r1 = const unsigned char *_src1
595         ; r2 = const unsigned char *_src2
596         ; r3 =       int            _ystride
597         LDR     r12,[r13]               ; first stack argument (nothing is pushed in this routine)
598         ; r12= const ogg_int16_t    _residue[64]
599         VLDMIA  r12,{D16-D31}           ; Q8-Q15 = the eight residue rows
600         VLD1.64 {D0}, [r1], r3          ; Q0 (D0,D1) collects two rows of _src1
601         VLD1.64 {D4}, [r2], r3          ; Q2 (D4,D5) collects the same two rows of _src2
602         VLD1.64 {D1}, [r1], r3
603         VLD1.64 {D5}, [r2], r3
604         VHADD.U8        Q2, Q0, Q2      ; Q2 = FFEEDDCCBBAA99887766554433221100
605         VLD1.64 {D2}, [r1], r3
606         VLD1.64 {D6}, [r2], r3
607         VMOVL.U8        Q0, D4          ; Q0 = __77__66__55__44__33__22__11__00
608         VLD1.64 {D3}, [r1], r3
609         VMOVL.U8        Q2, D5          ; etc
610         VLD1.64 {D7}, [r2], r3
611         VHADD.U8        Q3, Q1, Q3      ; halving add for rows 2-3
612         VQADD.S16       Q8, Q8, Q0
613         VQADD.S16       Q9, Q9, Q2
614         VLD1.64 {D0}, [r1], r3
615         VMOVL.U8        Q1, D6
616         VLD1.64 {D4}, [r2], r3
617         VMOVL.U8        Q3, D7
618         VLD1.64 {D1}, [r1], r3
619         VQADD.S16       Q10,Q10,Q1
620         VLD1.64 {D5}, [r2], r3
621         VQADD.S16       Q11,Q11,Q3
622         VLD1.64 {D2}, [r1], r3
623         VHADD.U8        Q2, Q0, Q2      ; halving add for rows 4-5
624         VLD1.64 {D6}, [r2], r3
625         VLD1.64 {D3}, [r1], r3
626         VMOVL.U8        Q0, D4
627         VLD1.64 {D7}, [r2], r3
628         VMOVL.U8        Q2, D5
629         VHADD.U8        Q3, Q1, Q3      ; halving add for rows 6-7
630         VQADD.S16       Q12,Q12,Q0
631         VQADD.S16       Q13,Q13,Q2
632         VMOVL.U8        Q1, D6
633         VMOVL.U8        Q3, D7
634         VQADD.S16       Q14,Q14,Q1
635         VQADD.S16       Q15,Q15,Q3
636         VQMOVUN.S16     D16,Q8          ; narrow each row to unsigned bytes with saturation
637         VQMOVUN.S16     D17,Q9
638         VQMOVUN.S16     D18,Q10
639         VST1.64 {D16},[r0@64], r3
640         VQMOVUN.S16     D19,Q11
641         VST1.64 {D17},[r0@64], r3
642         VQMOVUN.S16     D20,Q12
643         VST1.64 {D18},[r0@64], r3
644         VQMOVUN.S16     D21,Q13
645         VST1.64 {D19},[r0@64], r3
646         VQMOVUN.S16     D22,Q14
647         VST1.64 {D20},[r0@64], r3
648         VQMOVUN.S16     D23,Q15
649         VST1.64 {D21},[r0@64], r3
650         VST1.64 {D22},[r0@64], r3
651         VST1.64 {D23},[r0@64], r3
652         MOV     PC,R14                  ; return
653         ENDP
654  ]
655
656         END