?? imdct.c
字號:
two_m = (1 << m);
else
two_m = 1;
two_m_plus_one = (1 << (m+1));
for(i = 0; i < 128; i += two_m_plus_one) {
for(k = 0; k < two_m; k++) {
p = k + i;
q = p + two_m;
tmp_a_r = buf[p].real;
tmp_a_i = buf[p].imag;
tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
buf[p].real = tmp_a_r + tmp_b_r;
buf[p].imag = tmp_a_i + tmp_b_i;
buf[q].real = tmp_a_r - tmp_b_r;
buf[q].imag = tmp_a_i - tmp_b_i;
}
}
}
*/
/* 1. iteration */
// Note w[0][0]={1,0}
{
__m128 xmm0,xmm1,xmm2;
xorps (xmm1, xmm1);
xorps (xmm2, xmm2);
for (unsigned char *esi=(unsigned char*)buf;(complex_t*)esi<buf+128;esi+=16){
movlps (esi, xmm0); //buf[p]
movlps (8+esi, xmm1); //buf[q]
movhps (esi, xmm0); //buf[p]
movhps (8+esi, xmm2); //buf[q]
addps (xmm1, xmm0);
subps (xmm2, xmm0);
movaps (xmm0, esi);
}
}
/* 2. iteration */
// Note w[1]={{1,0}, {0,-1}}
{
__m128 xmm7,xmm0,xmm1,xmm2;
movaps (ps111_1, xmm7); // 1,1,1,-1
for (unsigned char *esi=(unsigned char*)buf;(complex_t*)esi<buf+128;esi+=32){
movaps (16+esi, xmm2); //r2,i2,r3,i3
xmm2=_mm_shuffle_ps(xmm2,xmm2,0xB4); //r2,i2,i3,r3
mulps (xmm7, xmm2); //r2,i2,i3,-r3
movaps (esi, xmm0); //r0,i0,r1,i1
movaps (esi, xmm1); //r0,i0,r1,i1
addps (xmm2, xmm0);
subps (xmm2, xmm1);
movaps (xmm0, esi);
movaps (xmm1, 16+esi);
}
};
/* 3. iteration */
/*
Note sseW2+0={1,1,sqrt(2),sqrt(2))
Note sseW2+16={0,0,sqrt(2),-sqrt(2))
Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
*/
{
__m128 xmm6,xmm7,xmm5,xmm2,xmm3,xmm4,xmm0,xmm1;
movaps (48+(uint8_t*)sseW2, xmm6);
movaps (16+(uint8_t*)sseW2, xmm7);
xorps (xmm5, xmm5);
xorps (xmm2, xmm2);
for (unsigned char *esi=(unsigned char*)buf;(complex_t*)esi<buf+128;esi+=64){
movaps (32+esi, xmm2); //r4,i4,r5,i5
movaps (48+esi, xmm3); //r6,i6,r7,i7
movaps ((uint8_t*)sseW2, xmm4); //r4,i4,r5,i5
movaps (32+(uint8_t*)sseW2, xmm5); //r6,i6,r7,i7
mulps (xmm2, xmm4);
mulps (xmm3, xmm5);
xmm2=_mm_shuffle_ps(xmm2,xmm2,0xB1); //i4,r4,i5,r5
xmm3=_mm_shuffle_ps(xmm3,xmm3,0xB1); //i6,r6,i7,r7
mulps (xmm6, xmm3);
mulps (xmm7, xmm2);
movaps (esi, xmm0); //r0,i0,r1,i1
movaps (16+esi, xmm1); //r2,i2,r3,i3
addps (xmm4, xmm2);
addps (xmm5, xmm3);
movaps (xmm2, xmm4);
movaps (xmm3, xmm5);
addps (xmm0, xmm2);
addps (xmm1, xmm3);
subps (xmm4, xmm0);
subps (xmm5, xmm1);
movaps (xmm2, esi);
movaps (xmm3, 16+esi);
movaps (xmm0, 32+esi);
movaps (xmm1, 48+esi);
}
}
/* 4-7. iterations */
for (m=3; m < 7; m++) {
two_m = (1 << m);
two_m_plus_one = two_m<<1;
{
__m128 xmm1,xmm2,xmm0;
for (unsigned char *esi=(unsigned char*)buf;(complex_t*)esi<buf+128;esi+=two_m_plus_one<<3){
int edi=0; // k
for (unsigned char *edx=esi+ (two_m<<3);edi< (two_m<<3);edi+=16){
movaps (edx+ edi, xmm1);
movaps ((uint8_t*)sseW[m]+ edi* 2, xmm2);
mulps (xmm1, xmm2);
xmm1=_mm_shuffle_ps(xmm1,xmm1,0xB1);
mulps (16+(uint8_t*)sseW[m]+ edi* 2, xmm1);
movaps (esi+ edi, xmm0);
addps (xmm2, xmm1);
movaps( xmm1, xmm2);
addps (xmm0, xmm1);
subps (xmm2, xmm0);
movaps (xmm1, esi+ edi);
movaps (xmm0, edx+ edi);
}
}
}
}
/* Post IFFT complex multiply plus IFFT complex conjugate*/
{
__m128 xmm0,xmm1;
for (int esi=-1024;esi!=0;esi+=16){
movaps ((uint8_t*)(buf+128)+ esi, xmm0);
movaps ((uint8_t*)(buf+128)+ esi, xmm1);
xmm0=_mm_shuffle_ps(xmm0,xmm0,0xB1);
mulps (1024+(uint8_t*)sseSinCos1c+esi, xmm1);
mulps (1024+(uint8_t*)sseSinCos1d+esi, xmm0);
addps (xmm1, xmm0);
movaps (xmm0, (uint8_t*)(buf+128)+ esi);
}
}
data_ptr = data;
delay_ptr = delay;
//window_ptr = imdct_window;
/* Window and convert to real valued signal */
{
__m128 xmm0,xmm1,xmm2;
movss (bias, xmm2); // bias
xmm2=_mm_shuffle_ps(xmm2,xmm2,0x00); // bias, bias, ...
for (int esi=0,edi=0;esi<512;esi+=16,edi-=16){
movlps ((uint8_t*)(buf+64)+ esi, xmm0); // ? ? A ?
movlps (8+(uint8_t*)(buf+64)+ esi, xmm1); // ? ? C ?
movhps (-16+(uint8_t*)(buf+64)+ edi, xmm1); // ? D C ?
movhps (-8+(uint8_t*)(buf+64)+ edi, xmm0); // ? B A ?
xmm0=_mm_shuffle_ps(xmm0,xmm1,0x99);// shufps $0x99, xmm1, xmm0 // D C B A
mulps ((uint8_t*)sseWindow+esi, xmm0);
addps ((uint8_t*)delay_ptr+ esi, xmm0);
addps (xmm2, xmm0);
movaps (xmm0, (uint8_t*)data_ptr+ esi);
}
}
data_ptr+=128;
delay_ptr+=128;
// window_ptr+=128;
{
__m128 xmm0,xmm1,xmm2;
movss (bias, xmm2); // bias
xmm2=_mm_shuffle_ps(xmm2,xmm2,0x00); // bias, bias, ...
for (int edi=1024,esi=0;esi<512;esi+=16,edi-=16){
movlps ((uint8_t*)buf+ esi, xmm0); // ? ? ? A
movlps (8+(uint8_t*)buf+ esi, xmm1); // ? ? ? C
movhps (-16+(uint8_t*)buf+ edi, xmm1); // D ? ? C
movhps (-8+(uint8_t*)buf+ edi, xmm0); // B ? ? A
xmm0=_mm_shuffle_ps(xmm0,xmm1,0xCC); // D C B A
mulps (512+(uint8_t*)sseWindow+esi, xmm0);
addps ((uint8_t*)delay_ptr+ esi, xmm0);
addps (xmm2, xmm0);
movaps (xmm0, (uint8_t*)data_ptr+ esi);
}
}
data_ptr+=128;
// window_ptr+=128;
/* The trailing edge of the window goes into the delay line */
delay_ptr = delay;
{
__m128 xmm0,xmm1;
for (int edi=0,esi=0;esi<512;esi+=16,edi-=16){
movlps ((uint8_t*)(buf+64)+ esi, xmm0); // ? ? ? A
movlps (8+(uint8_t*)(buf+64)+ esi, xmm1); // ? ? ? C
movhps (-16+(uint8_t*)(buf+64)+ edi, xmm1); // D ? ? C
movhps (-8+(uint8_t*)(buf+64)+ edi, xmm0); // B ? ? A
xmm0=_mm_shuffle_ps(xmm0,xmm1,0xcc);//shufps $0xCC, xmm1, xmm0 // D C B A
mulps (1024+(uint8_t*)sseWindow+esi, xmm0);
movaps (xmm0, (uint8_t*)delay_ptr+ esi);
}
}
delay_ptr+=128;
// window_ptr-=128;
{
__m128 xmm0,xmm1;
for (int edi=1024,esi=0;esi<512;esi+=16,edi-=16){
movlps ((uint8_t*)buf+ esi, xmm0); // ? ? A ?
movlps (8+(uint8_t*)buf+ esi, xmm1); // ? ? C ?
movhps (-16+(uint8_t*)buf+ edi, xmm1); // ? D C ?
movhps (-8+(uint8_t*)buf+ edi, xmm0); // ? B A ?
xmm0=_mm_shuffle_ps(xmm0,xmm1,0x99);// shufps $0x99, xmm1, xmm0 // D C B A
mulps (1536+(uint8_t*)sseWindow+esi, xmm0);
movaps (xmm0, (uint8_t*)delay_ptr+ esi);
}
}
}
static void a52_imdct_256_C(sample_t * data, sample_t * delay, sample_t bias)
{
int i, k;
sample_t t_r, t_i, a_r, a_i, b_r, b_i, c_r, c_i, d_r, d_i, w_1, w_2;
const sample_t * window = a52_imdct_window;
complex_t buf1[64], buf2[64];
/* Pre IFFT complex multiply plus IFFT cmplx conjugate */
for (i = 0; i < 64; i++) {
k = fftorder[i];
t_r = pre2[i].real;
t_i = pre2[i].imag;
buf1[i].real = t_i * data[254-k] + t_r * data[k];
buf1[i].imag = t_r * data[254-k] - t_i * data[k];
buf2[i].real = t_i * data[255-k] + t_r * data[k+1];
buf2[i].imag = t_r * data[255-k] - t_i * data[k+1];
}
ifft64 (buf1);
ifft64 (buf2);
/* Post IFFT complex multiply */
/* Window and convert to real valued signal */
for (i = 0; i < 32; i++) {
/* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
t_r = post2[i].real;
t_i = post2[i].imag;
a_r = t_r * buf1[i].real + t_i * buf1[i].imag;
a_i = t_i * buf1[i].real - t_r * buf1[i].imag;
b_r = t_i * buf1[63-i].real + t_r * buf1[63-i].imag;
b_i = t_r * buf1[63-i].real - t_i * buf1[63-i].imag;
c_r = t_r * buf2[i].real + t_i * buf2[i].imag;
c_i = t_i * buf2[i].real - t_r * buf2[i].imag;
d_r = t_i * buf2[63-i].real + t_r * buf2[63-i].imag;
d_i = t_r * buf2[63-i].real - t_i * buf2[63-i].imag;
w_1 = window[2*i];
w_2 = window[255-2*i];
data[2*i] = delay[2*i] * w_2 - a_r * w_1 + bias;
data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
delay[2*i] = c_i;
w_1 = window[128+2*i];
w_2 = window[127-2*i];
data[128+2*i] = delay[127-2*i] * w_2 + a_i * w_1 + bias;
data[127-2*i] = delay[127-2*i] * w_1 - a_i * w_2 + bias;
delay[127-2*i] = c_r;
w_1 = window[2*i+1];
w_2 = window[254-2*i];
data[2*i+1] = delay[2*i+1] * w_2 - b_i * w_1 + bias;
data[254-2*i] = delay[2*i+1] * w_1 + b_i * w_2 + bias;
delay[2*i+1] = d_r;
w_1 = window[129+2*i];
w_2 = window[126-2*i];
data[129+2*i] = delay[126-2*i] * w_2 + b_r * w_1 + bias;
data[126-2*i] = delay[126-2*i] * w_1 - b_r * w_2 + bias;
delay[126-2*i] = d_i;
}
}
static double besselI0 (double x)
{
double bessel = 1;
int i = 100;
do
bessel = bessel * x / (i * i) + 1;
while (--i);
return bessel;
}
void a52_imdct_init (uint32_t mm_accel)
{
int i, k;
double sum;
sample_t local_imdct_window[256];
/* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
sum = 0;
for (i = 0; i < 256; i++) {
sum += besselI0 (i * (256 - i) * (5 * M_PI / 256) * (5 * M_PI / 256));
local_imdct_window[i] = sum;
}
sum++;
for (i = 0; i < 256; i++)
a52_imdct_window[i] = (sample_t) (sqrt (local_imdct_window[i] / sum));
for (i = 0; i < 3; i++)
roots16[i] = cos ((M_PI / 8) * (i + 1));
for (i = 0; i < 7; i++)
roots32[i] = cos ((M_PI / 16) * (i + 1));
for (i = 0; i < 15; i++)
roots64[i] = cos ((M_PI / 32) * (i + 1));
for (i = 0; i < 31; i++)
roots128[i] = cos ((M_PI / 64) * (i + 1));
for (i = 0; i < 64; i++) {
k = fftorder[i] / 2 + 64;
pre1[i].real = cos ((M_PI / 256) * (k - 0.25));
pre1[i].imag = sin ((M_PI / 256) * (k - 0.25));
}
for (i = 64; i < 128; i++) {
k = fftorder[i] / 2 + 64;
pre1[i].real = -cos ((M_PI / 256) * (k - 0.25));
pre1[i].imag = -sin ((M_PI / 256) * (k - 0.25));
}
for (i = 0; i < 64; i++) {
post1[i].real = cos ((M_PI / 256) * (i + 0.5));
post1[i].imag = sin ((M_PI / 256) * (i + 0.5));
}
for (i = 0; i < 64; i++) {
k = fftorder[i] / 4;
pre2[i].real = cos ((M_PI / 128) * (k - 0.25));
pre2[i].imag = sin ((M_PI / 128) * (k - 0.25));
}
for (i = 0; i < 32; i++) {
post2[i].real = cos ((M_PI / 128) * (i + 0.5));
post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
}
{
int i, j, k;
/* Twiddle factors to turn IFFT into IMDCT */
for (i = 0; i < 128; i++) {
xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
}
for (i = 0; i < 128; i++) {
sseSinCos1c[2*i+0]= xcos1[i];
sseSinCos1c[2*i+1]= -xcos1[i];
sseSinCos1d[2*i+0]= xsin1[i];
sseSinCos1d[2*i+1]= xsin1[i];
}
/* More twiddle factors to turn IFFT into IMDCT */
for (i = 0; i < 64; i++) {
xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
}
for (i = 0; i < 7; i++) {
j = 1 << i;
for (k = 0; k < j; k++) {
w[i][k].real = cos (-M_PI * k / j);
w[i][k].imag = sin (-M_PI * k / j);
}
}
for (i = 1; i < 7; i++) {
j = 1 << i;
for (k = 0; k < j; k+=2) {
sseW[i][4*k + 0] = w[i][k+0].real;
sseW[i][4*k + 1] = w[i][k+0].real;
sseW[i][4*k + 2] = w[i][k+1].real;
sseW[i][4*k + 3] = w[i][k+1].real;
sseW[i][4*k + 4] = -w[i][k+0].imag;
sseW[i][4*k + 5] = w[i][k+0].imag;
sseW[i][4*k + 6] = -w[i][k+1].imag;
sseW[i][4*k + 7] = w[i][k+1].imag;
//we multiply more or less uninitialized numbers so we need to use exactly 0.0
if(k==0)
{
// sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
}
if(2*k == j)
{
sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
// sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
}
}
}
for(i=0; i<128; i++)
{
sseWindow[2*i+0]= -imdct_window[2*i+0];
sseWindow[2*i+1]= imdct_window[2*i+1];
}
for(i=0; i<64; i++)
{
sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
}
}
{
//fprintf (stderr, "No accelerated IMDCT transform found\n");
ifft128 = ifft128_c;
ifft64 = ifft64_c;
}
#ifndef __GNUC__
if(mm_accel & FF_CPU_SSE)
a52_imdct_512 = imdct_do_512_sse;
else
#endif
a52_imdct_512 = a52_imdct_512_C;
a52_imdct_256 = a52_imdct_256_C;
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -