Fix celt_pitch_xcorr_c signature.
opus.git: celt/x86/pitch_sse.h
/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
   Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
/**
   @file pitch_sse.h
   @brief Pitch analysis
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef PITCH_SSE_H
#define PITCH_SSE_H

#if defined(HAVE_CONFIG_H)
#include "config.h"
#endif

#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
void xcorr_kernel_sse4_1(
                    const opus_int16 *x,
                    const opus_int16 *y,
                    opus_val32       sum[4],
                    int              len);

extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
                    const opus_int16 *x,
                    const opus_int16 *y,
                    opus_val32       sum[4],
                    int              len);

#define OVERRIDE_XCORR_KERNEL
#define xcorr_kernel(x, y, sum, len, arch) \
    ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
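/* RTCD dispatch: xcorr_kernel() becomes an indirect call through the
 * XCORR_KERNEL_IMPL function-pointer table, indexed by the run-time
 * detected arch value masked with OPUS_ARCHMASK. */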

opus_val32 celt_inner_prod_sse4_1(
    const opus_int16 *x,
    const opus_int16 *y,
    int               N);
#endif

#if defined(OPUS_X86_MAY_HAVE_SSE2)
opus_val32 celt_inner_prod_sse2(
    const opus_int16 *x,
    const opus_int16 *y,
    int               N);
#endif

extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
                    const opus_int16 *x,
                    const opus_int16 *y,
                    int               N);

#define OVERRIDE_CELT_INNER_PROD
#define celt_inner_prod(x, y, N, arch) \
    ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
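/* celt_inner_prod() uses the same run-time dispatch: the SSE2/SSE4.1 variants
 * declared above are reached through the CELT_INNER_PROD_IMPL table. */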
#else

#include <xmmintrin.h>
#include "arch.h"

#define OVERRIDE_XCORR_KERNEL
static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();
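   /* Main loop: four x samples per pass.  Each broadcast x[j+n] multiplies the
    * y window starting at j+n; the three shifted windows are built by shuffling
    * the two loads yj (y[j..j+3]) and y3 (y[j+3..j+6]) instead of issuing four
    * unaligned loads.  Two accumulators keep the add chains independent. */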

   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);
      __m128 yj = _mm_loadu_ps(y+j);
      __m128 y3 = _mm_loadu_ps(y+j+3);

      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
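   /* Tail: up to three remaining samples, one broadcast multiply-add each. */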
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}

#define xcorr_kernel(_x, _y, _z, len, arch) \
    ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len))
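/* The arch argument is unused on this path: the float SSE kernels here are
 * selected at compile time, not through the RTCD tables. */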

#define OVERRIDE_DUAL_INNER_PROD
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
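   /* Computes xy1 = <x, y01> and xy2 = <x, y02> in a single pass so each x
    * vector is loaded only once. */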
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
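   /* Scalar tail for N not a multiple of 4; in the float build MAC16_16() is
    * a plain multiply-add. */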
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}

#define OVERRIDE_CELT_INNER_PROD
static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
      int N)
{
   int i;
   float xy;
   __m128 sum;
   sum = _mm_setzero_ps();
   /* FIXME: We should probably go 8-way and use 2 sums. */
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 yi = _mm_loadu_ps(y+i);
      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
   }
   /* Horizontal sum */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&xy, sum);
   for (;i<N;i++)
   {
      xy = MAC16_16(xy, x[i], y[i]);
   }
   return xy;
}

#  define celt_inner_prod(_x, _y, len, arch) \
    ((void)(arch),celt_inner_prod_sse(_x, _y, len))

#define OVERRIDE_COMB_FILTER_CONST
static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
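   /* Vectorised constant-gain comb filter, four outputs per pass:
    *   y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1]+x[i-T-1]) + g12*(x[i-T+2]+x[i-T-2])
    * (see the scalar tail at the bottom for the reference formula). */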
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   x0v = _mm_loadu_ps(&x[-T-2]);
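   /* x0v carries x[i-T-2..i-T+1] over from the previous iteration, so only one
    * new load (x4v) is needed per pass; the +1/+2/+3 delayed windows are
    * rebuilt with shuffles. */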
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
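   /* Scalar tail for the last N%4 samples, only compiled for custom modes
    * where N need not be a multiple of 4. */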
#ifdef CUSTOM_MODES
   for (;i<N;i++)
   {
      y[i] = x[i]
               + MULT16_32_Q15(g10,x[i-T])
               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
}

#endif
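
/* Illustrative sketch (not part of the original header): roughly how these
 * kernels are driven by the pitch cross-correlation loop in celt/pitch.c.
 * The function name and exact loop structure below are approximations for
 * illustration only, so the block is kept under #if 0. */
#if 0
static void pitch_xcorr_sketch(const opus_val16 *_x, const opus_val16 *_y,
      opus_val32 *xcorr, int len, int max_pitch, int arch)
{
   int i;
   /* Four lags per call: sum[k] accumulates the sum over j of _x[j]*_y[i+k+j]. */
   for (i=0;i<max_pitch-3;i+=4)
   {
      opus_val32 sum[4] = {0, 0, 0, 0};
      xcorr_kernel(_x, _y+i, sum, len, arch);
      xcorr[i]   = sum[0];
      xcorr[i+1] = sum[1];
      xcorr[i+2] = sum[2];
      xcorr[i+3] = sum[3];
   }
   /* Remaining lags one at a time. */
   for (;i<max_pitch;i++)
      xcorr[i] = celt_inner_prod(_x, _y+i, len, arch);
}
#endif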
#endif