; Syntax: NASM, 32-bit x86 with MMX.  One statement per line.

section .bss

mem1    resd 1
mem2    resd 1
mem3    resd 1
mem5    resd 1

; Named scratch slots addressed relative to ebp.
%define INCC ebp-4
%define SIZ2 ebp-8
%define M2   ebp-12
%define J    ebp-16
%define N    ebp-20
%define M1   ebp-24
%define SIZ8 ebp-28

section .text

;-----------------------------------------------------------------------
; fft1back_mmx_finish
;
; Construct timf2_shi from back transforms in fft1_backbuffer.
; Calculate timf2_pwr_int.
;
; In:    no register arguments.  Reads the globals
;        fft1_interleave_points, timf2_pa, timf2_shi, fft1_backbuffer,
;        timf2_input_block, rx_channels, timf2_pwr_int, fft1_size,
;        timf2_mask and fft1_inverted_mmxwin.
; Out:   16-bit data written at timf2_shi, 32-bit powers written at
;        timf2_pwr_int.
; Clobb: mm0-mm3 and flags.  eax, ebx, ecx, edx, esi, edi and ebp are
;        pushed on entry and popped at finish_x.  emms is executed
;        before returning.
;
; Paths (selected by rx_channels and fft1_interleave_points):
;   rx_channels != 1                 two-channel code (two_* labels)
;   rx_channels == 1                 one-channel code (finish_one)
;   interleave_points == 0           plain copy          (*_0_loop)
;   2*interleave_points == fft1_size saturating add into the existing
;                                    data, then copy of the following
;                                    block               (*_2a/_2b)
;   otherwise                        multiply by the table pointed to
;                                    by fft1_inverted_mmxwin (genwin_*)
;
; NOTE: the genwin_* paths store two dwords at [ebp-4] and [ebp-8] with
; ebp = [fft1_inverted_mmxwin], i.e. just below that table.  The caller
; must have reserved that memory.
;-----------------------------------------------------------------------
fft1back_mmx_finish:
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi
        push    ebp
        mov     ebx,[fft1_interleave_points]
        mov     edi,[timf2_pa]
        add     edi,edi
        add     edi,[timf2_shi]         ; edi = timf2_shi + 2*timf2_pa (bytes)
        mov     esi,[fft1_backbuffer]
        mov     eax,[timf2_input_block]
        add     eax,eax
        sub     eax,8                   ; eax = 2*timf2_input_block - 8
        cmp     dword [rx_channels],1
        jz      near finish_one
        shr     eax,2                   ; data is addressed as 4*eax below
        mov     ebp,[timf2_pa]
        shr     ebp,1
        add     ebp,[timf2_pwr_int]     ; ebp = timf2_pwr_int + timf2_pa/2
        or      ebx,ebx
        jnz     not_two_0
; ********************************************
; No window used
; Each pass copies 16 bytes and stores one power dword.  The qword store
; also writes a scratch dword 4 bytes lower, which the next (lower) pass
; overwrites with its own result.
two_0_loop:
        movq    mm1,[4*eax+esi-8]
        movq    [4*eax+edi-8],mm1
        pmaddwd mm1,mm1                 ; two dwords, each a sum of two squares
        psrld   mm1,1                   ; halve each before adding them
        movq    mm2,mm1
        psllq   mm1,32
        paddd   mm1,mm2                 ; high dword = sum of both halves
        movq    [eax+ebp-6],mm1
        movq    mm1,[4*eax+esi]
        movq    [4*eax+edi],mm1
        sub     eax,4
        jns     two_0_loop
; We have destroyed one data point in timf2_pwr_int. Put it back!
; Recompute it with scalar code from the four words at edi-16..edi-10.
        movsx   eax,word [-16+edi]
        imul    eax,eax
        movsx   ebx,word [-14+edi]
        imul    ebx,ebx
        add     eax,ebx
        shr     eax,1
        movsx   ecx,word [-12+edi]
        imul    ecx,ecx
        movsx   ebx,word [-10+edi]
        imul    ebx,ebx
        add     ecx,ebx
        shr     ecx,1
        add     eax,ecx
        mov     [-4+ebp],eax
        jmp     near finish_x
; **********************************************
; sin squared window
not_two_0:
        add     ebx,ebx
        cmp     ebx,[fft1_size]         ; 2*interleave_points == fft1_size ?
        jnz     near not_two_2
; Same as two_0_loop, but the new data is added (signed saturation) to
; what is already stored at the destination.
two_2a_loop:
        movq    mm1,[4*eax+esi-8]
        paddsw  mm1,[4*eax+edi-8]
        movq    [4*eax+edi-8],mm1
        pmaddwd mm1,mm1
        ; Logical shift (was psrad): when both saturated words are -32768
        ; pmaddwd yields 0x80000000, which must halve to 0x40000000 and
        ; not to a negative value.  This also matches psrld in
        ; two_0_loop and the scalar shr in the fix-up below.
        psrld   mm1,1
        movq    mm2,mm1
        psllq   mm1,32
        paddd   mm1,mm2
        movq    [eax+ebp-6],mm1
        movq    mm1,[4*eax+esi]
        paddsw  mm1,[4*eax+edi]
        movq    [4*eax+edi],mm1
        sub     eax,4
        jns     two_2a_loop
; We have destroyed one data point in timf2_pwr_int. Put it back!
        movsx   eax,word [-16+edi]
        imul    eax,eax
        movsx   ebx,word [-14+edi]
        imul    ebx,ebx
        add     eax,ebx
        shr     eax,1
        movsx   ecx,word [-12+edi]
        imul    ecx,ecx
        movsx   ebx,word [-10+edi]
        imul    ebx,ebx
        add     ecx,ebx
        shr     ecx,1
        add     eax,ecx
        mov     [-4+ebp],eax
; Copy the block that follows in fft1_backbuffer to the next position
; in timf2_shi; the position index is wrapped with timf2_mask.
        mov     eax,[timf2_input_block]
        mov     edi,[timf2_pa]
        add     edi,eax
        and     edi,[timf2_mask]
        add     edi,edi
        add     edi,[timf2_shi]
        mov     esi,[fft1_backbuffer]
        add     eax,eax
        add     esi,eax                 ; esi = fft1_backbuffer + 2*input_block
        sub     eax,8
two_2b_loop:
        movq    mm1,[eax+esi]
        movq    [eax+edi],mm1
        sub     eax,8
        jns     two_2b_loop
        jmp     near finish_x
; *****************************************
; window N=1 or N>2
not_two_2:
        mov     ebp,[fft1_inverted_mmxwin]
        mov     eax,[fft1_size]
        sal     eax,2
        mov     [ebp-4],eax             ; scratch: 4*fft1_size (loop1 end)
        mov     eax,[fft1_interleave_points]
        sal     eax,2
        mov     [ebp-8],eax             ; scratch: 4*interleave (loop2 end)
        mov     eax,[fft1_interleave_points]    ; reloaded, same value
        sal     eax,2
        mov     edx,[timf2_mask]
        mov     ecx,[timf2_pa]
        mov     edi,[timf2_shi]
        shr     ecx,1
        mov     ebx,[timf2_pwr_int]
        lea     esi,[esi+2*eax]         ; skip 8*interleave_points bytes
        sub     ecx,4                   ; pre-decrement, loop adds 4 first
        shr     edx,1
; eax walks upwards through the table until it equals 4*fft1_size.
; Product scaling: pmulhw keeps bits 16..31, doubled twice with
; saturation; pmullw/psrlw 14 supplies bits 14..15 of the low word.
genwin_two_loop1:
        movq    mm0,[ebp+eax]
        movq    mm1,mm0
        movq    mm2,mm0
        movq    mm3,mm0
        pmulhw  mm0,[esi]
        add     ecx,4
        paddsw  mm0,mm0                 ; shift by additions so we saturate properly!!
        pmulhw  mm1,[esi+8]
        paddsw  mm0,mm0
        and     ecx,edx                 ; wrap the output index
        paddsw  mm1,mm1
        pmullw  mm2,[esi]
        paddsw  mm1,mm1
        add     eax,8
        psrlw   mm2,14
        pmullw  mm3,[esi+8]
        add     esi,16
        psrlw   mm3,14
        paddsw  mm0,mm2
        paddsw  mm1,mm3
        movq    [edi+4*ecx],mm0
        movq    [edi+4*ecx+8],mm1
        pmaddwd mm0,mm0
        psrld   mm0,1                   ; logical (was psrad), see two_2a_loop
        movq    mm2,mm0
        psrlq   mm0,32
        paddd   mm0,mm2                 ; low dword = sum of both halves
        movq    [ebx+ecx],mm0
        cmp     eax,[ebp-4]
        jnz     genwin_two_loop1
        align 8
; eax now walks downwards; continue while eax - 4*interleave >= 0.
genwin_two_loop2:
        movq    mm0,[ebp+eax]
        movq    mm1,mm0
        movq    mm2,mm0
        movq    mm3,mm0
        pmulhw  mm0,[esi]
        add     ecx,4
        paddsw  mm0,mm0                 ; shift by additions so we saturate!!
        pmulhw  mm1,[esi+8]
        paddsw  mm0,mm0
        and     ecx,edx
        paddsw  mm1,mm1
        pmullw  mm2,[esi]
        paddsw  mm1,mm1
        sub     eax,8
        psrlw   mm2,14
        pmullw  mm3,[esi+8]
        add     esi,16
        psrlw   mm3,14
        paddsw  mm0,mm2
        paddsw  mm1,mm3
        movq    [edi+4*ecx],mm0
        movq    [edi+4*ecx+8],mm1
        pmaddwd mm0,mm0
        psrld   mm0,1                   ; logical (was psrad), see two_2a_loop
        movq    mm2,mm0
        psrlq   mm0,32
        paddd   mm0,mm2
        movq    [ebx+ecx],mm0
        cmp     eax,[ebp-8]
        jns     genwin_two_loop2
; Common exit: leave MMX state and restore every saved register.
finish_x:
        emms
        pop     ebp
        pop     edi
        pop     esi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        ret

; ---------------- rx_channels == 1 ----------------
finish_one:
        shr     eax,1                   ; data is addressed as 2*eax below
        mov     ebp,[timf2_pa]
        add     ebp,[timf2_pwr_int]     ; ebp = timf2_pwr_int + timf2_pa
        or      ebx,ebx
        jnz     not_one_0
; No window used
; Each pass copies 8 bytes; the power from the low dword of pmaddwd is
; moved to the high dword and lands at [eax+ebp].  The zeroed low dword
; lands 4 bytes lower and is overwritten by the next pass.
one_0_loop:
        movq    mm1,[2*eax+esi]
        movq    [2*eax+edi],mm1
        pmaddwd mm1,mm1
        psllq   mm1,32
        movq    [eax+ebp-4],mm1
        sub     eax,4
        jns     one_0_loop
; We have destroyed one data point in timf2_pwr_int. Put it back!
        movsx   eax,word [-8+edi]
        imul    eax,eax
        movsx   ebx,word [-6+edi]
        imul    ebx,ebx
        add     eax,ebx
        mov     [-4+ebp],eax
        jmp     short finish_x
not_one_0:
        add     ebx,ebx
        cmp     ebx,[fft1_size]         ; 2*interleave_points == fft1_size ?
        jnz     not_one_2
; Sine squared window
one_2a_loop:
        movq    mm1,[2*eax+esi]
        paddsw  mm1,[2*eax+edi]
        movq    [2*eax+edi],mm1
        pmaddwd mm1,mm1
        psllq   mm1,32
        movq    [eax+ebp-4],mm1
        sub     eax,4
        jns     one_2a_loop
; We have destroyed one data point in timf2_pwr_int. Put it back!
        movsx   eax,word [-8+edi]
        imul    eax,eax
        movsx   ebx,word [-6+edi]
        imul    ebx,ebx
        add     eax,ebx
        mov     [-4+ebp],eax
; Copy the following block, as in the two-channel case.
        mov     eax,[timf2_input_block]
        mov     edi,[timf2_pa]
        add     edi,eax
        and     edi,[timf2_mask]
        add     edi,edi
        add     edi,[timf2_shi]
        mov     esi,[fft1_backbuffer]
        add     eax,eax
        add     esi,eax
        sub     eax,8
one_2b_loop:
        movq    mm1,[eax+esi]
        movq    [eax+edi],mm1
        sub     eax,8
        jns     one_2b_loop
        jmp     near finish_x
; Window N==1 or N>2
not_one_2:
        mov     ebp,[fft1_inverted_mmxwin]
        mov     eax,[fft1_size]
        sal     eax,2
        mov     [ebp-4],eax             ; scratch: 4*fft1_size (loop1 end)
        mov     eax,[fft1_interleave_points]
        sal     eax,2
        mov     [ebp-8],eax             ; scratch: 4*interleave (loop2 end)
        mov     eax,[fft1_interleave_points]    ; reloaded, same value
        sal     eax,2
        mov     ebx,[timf2_pwr_int]
        mov     edx,[timf2_mask]
        mov     ecx,[timf2_pa]
        mov     edi,[timf2_shi]
        add     esi,eax                 ; skip 4*interleave_points bytes
        sub     ecx,4                   ; pre-decrement, loop adds 4 first
genwin_one_loop1:
        movq    mm0,[ebp+eax]
        movq    mm2,mm0
        pmulhw  mm0,[esi]
        add     ecx,4
        pmullw  mm2,[esi]
        and     ecx,edx                 ; wrap the output index
        paddsw  mm0,mm0
        add     eax,8
        paddsw  mm0,mm0
        psrlw   mm2,14
        add     esi,8
        paddsw  mm0,mm2
        movq    [edi+2*ecx],mm0
        pmaddwd mm0,mm0
        movq    [ebx+ecx],mm0
        cmp     eax,[ebp-4]
        jnz     genwin_one_loop1
genwin_one_loop2:
        movq    mm0,[ebp+eax]
        movq    mm2,mm0
        pmulhw  mm0,[esi]
        add     ecx,4
        pmullw  mm2,[esi]
        and     ecx,edx
        paddsw  mm0,mm0
        sub     eax,8
        paddsw  mm0,mm0
        psrlw   mm2,14
        add     esi,8
        paddsw  mm0,mm2
        movq    [edi+2*ecx],mm0
        pmaddwd mm0,mm0
        movq    [ebx+ecx],mm0
        cmp     eax,[ebp-8]
        jns     genwin_one_loop2
        jmp     near finish_x

;-----------------------------------------------------------------------
; mmx_fft1back_one
;
; Only the beginning of this routine is visible here: the register
; save, the setup, and the first loop.  That loop reads 16-bit indices
; from fft1_back_scramble, combines four qwords from fft1_split_shi per
; pass and writes 32 bytes per pass to fft1_backbuffer.
;
; The 16 bytes below fft1_split_shi are used as scratch: three dwords
; 0x0000FFFF (word masks) and, at [edi-4], the loop end pointer
; fft1_back_scramble + 2*fft1_size.
;-----------------------------------------------------------------------
mmx_fft1back_one:
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi
        push    ebp
        mov     ebp,[fft1_backbuffer]
        mov     eax,[fft1_size]
        mov     esi,[fft1_back_scramble]
        mov     edi,[fft1_split_shi]
        mov     ebx,0ffffH
        add     eax,eax
        add     eax,esi                 ; eax = end of the index table
        mov     [edi-16],ebx            ;[edi-14]= 1 0 1 0
        mov     [edi-12],ebx
        mov     [edi-8],ebx
        mov     [edi-4],eax
; Preload the first four indices: eax/ebx from the low/high word of
; [esi], ecx/edx from the low/high word of [esi+4].
        mov     eax,[esi]
        mov     ebx,eax
        and     eax,0ffffH
        shr     ebx,16
        mov     ecx,[esi+4]
        mov     edx,ecx
        and     ecx,0ffffH
        shr     edx,16
; The two first trivial loops are combined and
; run together with the unscrambling
fftback_one_1:
        add     esi,8
;  t1=fft1_split_shi[ia  ]+fft1_split_shi[ib  ];
;  t2=fft1_split_shi[ia+1]+fft1_split_shi[ib+1];
;  r1=fft1_split_shi[ia+2]+fft1_split_shi[ib+2];
;  r2=fft1_split_shi[ia+3]+fft1_split_shi[ib+3];
;  t5=fft1_split_shi[ia  ]-fft1_split_shi[ib  ];
;  t7=fft1_split_shi[ia+1]-fft1_split_shi[ib+1];
;  r5=fft1_split_shi[ia+2]-fft1_split_shi[ib+2];
;  r7=fft1_split_shi[ia+3]-fft1_split_shi[ib+3];
        movq    mm0,[edi+eax*8]
        movq    mm1,[edi+ecx*8]
        movq    mm2,mm0
        paddsw  mm0,[edi+ebx*8]         ;mm0=r2   r1   t2   t1
        movq    mm3,mm1
        psubsw  mm2,[edi+ebx*8]         ;mm2=r7   r5   t7   t5
        movq    mm4,mm0                 ;mm4=r2   r1   t2   t1
;  t3=fft1_split_shi[ic  ]+fft1_split_shi[id  ];
;  t4=fft1_split_shi[ic+1]+fft1_split_shi[id+1];
;  r3=fft1_split_shi[ic+2]+fft1_split_shi[id+2];
;  r4=fft1_split_shi[ic+3]+fft1_split_shi[id+3];
;  t10=fft1_split_shi[ic  ]-fft1_split_shi[id  ];
;  t6= fft1_split_shi[ic+1]-fft1_split_shi[id+1];
;  r10=fft1_split_shi[ic+2]-fft1_split_shi[id+2];
;  r6= fft1_split_shi[ic+3]-fft1_split_shi[id+3];
        paddsw  mm1,[edi+edx*8]         ;mm1=r4   r3   t4   t3
        psubsw  mm3,[edi+edx*8]         ;mm3=r6   r10  t6   t10
        mov     eax,[esi]               ; fetch the indices for the next pass
        mov     ecx,[esi+4]
;  fft1_tmp[k  ]=t1+t3;
;  fft1_tmp[k+1]=t2+t4;
;  fft1_tmp[k+2]=r1+r3;
;  fft1_tmp[k+3]=r2+r4;
;  fft1_tmp[k+8]=t1-t3;
;  fft1_tmp[k+9]=t2-t4;
;  fft1_tmp[k+10]=r1-r3;
;  fft1_tmp[k+11]=r2-r4;
                                        ;mm0=r2   r1   t2   t1
        paddsw  mm0,mm1                 ;mm0=r2+r4 r1+r3 t2+t4 t1+t3
;  t11=t5-t6
;  t9=t7+t10
;  r11=r5-r6
;  r9=r7+r10
;  t12=t5+t6
;  t8=t7-t10
;  r12=r5+r6
;  r8=r7-r10
                                        ;mm2=r7   r5   t7   t5
                                        ;mm3=r6   r10  t6   t10
        movq    mm5,mm3                 ;mm5=r6   r10  t6   t10
        psllq   mm3,16                  ;mm3=r10  t6   t10  0
                                        ;mm4=r2   r1   t2   t1
        psubsw  mm4,mm1                 ;mm4=r2-r4 r1-r3 t2-t4 t1-t3
        psrlq   mm5,16                  ;mm5= 0   r6   r10  t6
                                        ;[edi-14]= 1 0 1 0
        pand    mm3,[edi-14]            ;mm3=r10  0    t10  0
        pand    mm5,[edi-12]            ;mm5= 0   r6   0    t6
        mov     ebx,eax
        movq    mm6,mm2                 ;mm6=r7   r5   t7   t5
        psubsw  mm3,mm5                 ;mm3=r10 -r6   t10 -t6
        mov     edx,ecx
        movq    [ebp],mm0
;  fft1_tmp[k+4]=t12;
;  fft1_tmp[k+5]=t8;
;  fft1_tmp[k+6]=r12;
;  fft1_tmp[k+7]=r8;
;  fft1_tmp[k+12]=t11;
;  fft1_tmp[k+13]=t9;
;  fft1_tmp[k+14]=r11;
;  fft1_tmp[k+15]=r9;
        psubsw  mm2,mm3                 ;mm2=r8   r12  t8   t12
        paddsw  mm6,mm3                 ;mm6=r9   r11  t9   t11
        and     ecx,0ffffH
        and     eax,0ffffH
        shr     ebx,16
        movq    [ebp+8],mm2
        movq    [ebp+24],mm6
        shr     edx,16
        movq    [ebp+16],mm4
        add     ebp,32
        cmp     esi,[edi-4]
        ; Addresses are unsigned: use jb (was jl) so the loop also
        ; terminates correctly if the table straddles 0x80000000.
        jb      near fftback_one_1
;
; Loops are run pairwise to save memory access.
; This also makes fft fit well to the mmx architecture.
; Use reserved memory (8*dwords) before fft1_mmxcosin as scratch area
; The first locations of this array are used in every iteration and these
; variables will be in the cache.
;m1=4;
;m2=16;
;inc=fft1_size/16;
;for(n=2; n