SECTION .text align 16 global libecc_shiftwindow_xorassign:function (libecc_shiftwindow_xorassign_end - libecc_shiftwindow_xorassign) ; libecc_shiftwindow_xorassign(uint32_t* bitset1, uint32_t const* bitset2, int32_t lsb, int32_t msb, int32_t shift) ; ; Example (with digits of only 8 bits): ; bitset1 : [00001111][11111111][10000000] ; bitset2 : [00000011][11111111][00000000] ; shifted : [00000001][11111111][10000000] (line up least significant bit with bitset1) ; result : [00001110][00000000][00000000] (XOR with bitset1, written to bitset1) ; Inputs %define Mbitset1 [esp+n4+36] ; Pointer to the first digit of bitset1. %define Mbitset2 [esp+n4+40] ; Pointer to the first digit of bitset2. %define Mlsb [esp+n4+44] ; The index of the least significant bit of the window. %define Mmsb [esp+n4+48] ; The index of the most significant bit of the window. %define Mshift [esp+n4+52] ; The distance that the window needs to be shifted to the right. ; ; Register assignment: %define src esi ; Current source address. %define dst edi ; Current destination address. %define count edx ; Input digits counter. %define bitshift ecx ; Bit shift. %define digit ebx ; Current input digit. %define store eax ; Temporary storage for (previous) input digit. %define mask ebp ; Mask for first and last input digit. ; ; The right most digit that needs to be read is (lsb / 32) ; The left most digit that needs to be read is (msb / 32) ; The total number of digits that need to be read is ((msb / 32) - (lsb / 32)) + 1 ; The right bit shift is ((lsb % 32) - ((lsb - shift) % 32)) ; if that is negative, then we need a left shift of (((lsb - shift) % 32) - (lsb % 32)) ; ; Local variables. %define Mxdb [esp] ; 4 * ((lsb - shift) / 32), the offset in bytes to the first byte ; of the least significant digit of bitset1. %assign n 1 ; Number of local variables. %assign n4 4*n libecc_shiftwindow_xorassign: pushad sub esp, n4 ; abcdD mov eax, Mlsb ; 1 mov ebx, Mlsb ; 1 mov edi, Mlsb ; 1 sub eax, Mshift ; 2 sub ebx, Mshift ; 2 and edi, 31 ; 2 and eax, 0xffffffe0 ; 3 and ebx, 31 ; 3 mov ecx, edi ; 1 3 ; ecx = (lsb % 32) sar eax, 3 ; 4 mov edx, Mlsb ; 1 ; edx = lsb sub edi, ebx ; 4 4 ; edi = ((lsb % 32) - ((lsb - shift) % 32)) mov Mxdb, eax ; 5 js shiftwindow_left jmp shiftwindow_right ; Shift to the left ; ; The first digit to read is (bitset2) + 4 * (lsb / 32) ; The first digit to write is (bitset1) + 4 * ((lsb - shift) / 32) ; The mask for the first digit is (0xffffffff << (lsb % 32)) ; The mask for the last digit is ~(0xfffffffe << (msb % 32)) shiftwindow_left: ; Calculate bit shift, but keep it in edi. neg edi ; edi = (((lsb - shift) % 32) - (lsb % 32)) ; Calculate first mask (put into eax). mov eax, 0xffffffff shl eax, cl ; eax = (0xffffffff << (lsb % 32)) ; Calculate *src*, address of first digit. and edx, 0xffffffe0 ; edx invalidated mov src, Mbitset2 sar edx, 3 add src, edx ; esi = src = (bitset2) + 4 * (lsb / 32) ; Calculate *digit*, first (masked) digit. mov digit, [src] ; ebx = digit and digit, eax ; Calculate *mask*. mov ecx, Mmsb ; ecx = msb mov eax, 0xfffffffe ; eax invalidated mov mask, 0xffffffff shl eax, cl ; shl automagically does the % 32 xor mask, eax ; ebp = mask = ~(0xfffffffe << (msb % 32)) ; Calculate *count*, the total number of input digits minus one. sar ecx, 5 ; ecx = msb / 32 mov eax, Mlsb mov count, ecx sar eax, 5 ; eax = (lsb / 32) sub count, eax ; edx = count = ((msb / 32) - (lsb / 32)) ; Put the bit shift in *bitshift*. mov bitshift, edi ; ecx = bitshift ; Calculate *dst*, address of first digit to write to. mov dst, Mbitset1 add dst, Mxdb ; edi = dst = bitset1 + 4 * ((lsb - shift) / 32) ; Is the first digit also the last digit? test count, count mov store, count ; Fastest way to assure that store contains zero in case we jump to .lastdigit. jz .lastdigit ; Process the first digit. mov store, digit ; eax = store = digit shl digit, cl xor [dst], digit .nextdigit: ; Retrieve the next digit. add src, 4 add dst, 4 mov digit, [src] ; Is this the last digit? dec count jz .lastdigit ; Process the next digit. shld digit, store, cl mov store, [src] xor [dst], digit jmp .nextdigit .lastdigit: ; Process the last digit. and digit, mask shld digit, store, cl xor [dst], digit ; There is possibly one more output digit. mov eax, [src] ; store invalidated test ecx, ecx ; Set zero flag if bitshift is zero, shld does not affect the flags then. shld edx, eax, cl ; Use the fact that edx = count = 0 jz .exit ; It is possible that [dst+4] is not accessible in this case. xor [dst+4], edx .exit: add esp, n4 popad ret ; Shift to the right ; ; The first digit to read is (bitset2) + 4 * (lsb / 32) + 4 * ((msb / 32) - (lsb / 32)) ; The first digit to write is (bitset1) + 4 * ((lsb - shift) / 32) + 4 * ((msb / 32) - (lsb / 32)) ; The mask for the first digit is ~(0xfffffffe << (msb % 32)) ; The mask for the last digit is (0xffffffff << (lsb % 32)) ALIGN 32 shiftwindow_right: ; ecx = (lsb % 32) ; edx = lsb ; edi = ((lsb % 32) - ((lsb - shift) % 32)) ; Calculate *mask*. mov mask, 0xffffffff xor eax, eax ; eax = 0 shl mask, cl ; ebp = mask = (0xffffffff << (lsb % 32)) ; Calculate first mask (put into eax). mov ebx, 0xfffffffe mov ecx, Mmsb ; ecx = msb dec eax ; eax = 0xffffffff shl ebx, cl ; shl automagically does the % 32 xor eax, ebx ; eax = ~(0xfffffffe << (msb % 32)) ; Calculate *src*, address of first digit. and ecx, 0xffffffe0 ; ecx = 32 * (msb / 32) and edx, 0xffffffe0 ; edx = 32 * (lsb / 32) mov src, Mbitset2 sub ecx, edx ; ecx = 32 * ((msb / 32) - (lsb / 32)) mov ebx, edx ; ebx = 32 * (lsb / 32) sar ecx, 3 ; ecx = 4 * ((msb / 32) - (lsb / 32)) sar ebx, 3 ; ebx = 4 * (lsb / 32) add src, ecx add src, ebx ; esi = src = (bitset2) + 4 * (lsb / 32) + 4 * ((msb / 32) - (lsb / 32)) ; Calculate *digit*, first (masked) digit. mov digit, [src] ; ebx = digit mov edx, ecx ; edx = 4 * ((msb / 32) - (lsb / 32)) and digit, eax ; Put the bit shift in *bitshift*. mov bitshift, edi ; ecx = bitshift ; Calculate *dst*, address of first digit to write to. mov dst, Mbitset1 add dst, Mxdb add dst, edx ; edi = dst = (bitset1) + 4 * ((lsb - shift) / 32) + 4 * ((msb / 32) - (lsb / 32)) ; Calculate *count*, the total number of input digits minus one. shr count, 2 ; edx = count = ((msb / 32) - (lsb / 32)) ; Is the first digit also the last digit? mov store, count ; Fastest way to assure that store contains zero in case we jump to .lastdigit. jz .lastdigit ; Process the first digit. mov store, digit ; eax = store = digit sub src, 4 ; Decrementing src already here gives better performance. shr digit, cl jz .seconddigit ; If the result is zero then we might not be able to access [dst] ; If cl is zero then the shr did not affect the flags and ZF is still unset, ; so we will not jump. In that case however, [dst] will always be accessible. xor [dst], digit ; Retrieve the next digit. sub dst, 4 mov digit, [src] ; Is this the last digit? dec count jz .lastdigit .nextdigit: ; Process the next digit. shrd digit, store, cl mov store, [src] xor [dst], digit ; Retrieve the next digit. sub src, 4 .seconddigit: sub dst, 4 dec count ; Already decrement count here. mov digit, [src] jnz .nextdigit .lastdigit: ; Process the last digit. and digit, mask shrd digit, store, cl xor [dst], digit ; There is not another possibly output digit because ; the right most non-zero digits have been aligned. add esp, n4 popad ret libecc_shiftwindow_xorassign_end: SECTION .text align 16 global libecc_shift_xorassign:function (libecc_shift_xorassign_end - libecc_shift_xorassign) ; libecc_shift_xorassign(uint32_t* bitset1, uint32_t const* bitset2, int32_t lsb, int32_t msb, int32_t shift) ; %define Mbitset1 [esp+36] ; Pointer to the first digit of bitset1. %define Mbitset2 [esp+40] ; Pointer to the first digit of bitset2. %define Mlsb [esp+44] ; The index of the least significant bit of bitset2. %define Mmsb [esp+48] ; The index of the most significant bit of bitset2. %define Mshift [esp+52] ; The distance that bitset2 needs to be shifted to the right. libecc_shift_xorassign: pushad ; abcdDS mov eax, Mlsb ; 1 mov ecx, Mshift ; 1 mov edx, Mmsb ; 1 add eax, Mshift ; 2 mov ebx, Mlsb ; 1 and ecx, 31 ; 2 add edx, Mshift ; 2 sar eax, 5 ; 3 add ebx, ecx ; 23 sar edx, 5 ; 3 mov edi, Mbitset1 ; 1 and ebx, 0xffffffe0 ; 3 sub edx, eax ; 4 4 mov esi, Mbitset2 ; 1 sar ebx, 3 ; 4 sal eax, 2 ; 5 add esi, ebx ; 5 2 add edi, eax ; 6 2 mov ebx, [esi + 4 * edx] .loop mov eax, [esi + 4 * edx - 4] shld ebx, eax, cl xor [edi + 4 * edx], ebx mov ebx, eax dec edx jns .loop popad ret libecc_shift_xorassign_end: