// $Header: /home/harrison/c/tcgmsg/ipcv4.0/RCS/memcpy.s,v 1.1 91/12/06 17:26:46 harrison Exp Locker: harrison $
//
// RJH
//
// Fast memory copy, i860 assembly (tuned on the FX2800, see figures below).
//
// C entry is same as standard library routine memcpy
//
//      char *Memcpy (s1, s2, n)
//      char *s1, *s2;
//      int n;
//
// Memcpy() copies n characters from memory area s2 to s1.  It
// returns s1.
//
// Standard library routine achieves about 3.8 Mbyte/s.
// This does 38.2 Mbyte/s for 8 byte aligned input and output
//           21.8 Mbyte/s for 4 byte aligned input and output
//            6.3 Mbyte/s for unaligned data
// The theoretical peak on the FX2800 is 80/2=40Mb/s for data
// in the shared cache.
//
// FORTRAN entry is
//
//      subroutine memcpy(a, b, n)
//
// Register usage (as established by the code below):
//      r16      s1, destination address; never written here, so it
//               still holds s1 (the return value) on exit
//      r17      s2, source address; advanced as data is read
//      r18      n, bytes still to copy (the FORTRAN entry first loads
//               it from the address passed in r18)
//      r19      working copy of the destination address; advanced as
//               data is written
//      r20      constant -1, the increment operand of every bla
//      r21      block counter for the unrolled loops
//      r22-r25  scratch / data for the integer and byte loops
//      f8,f10,f12,f14  data for the 8 byte loop (fld.d/fst.d targets)
//      r1       return address (bri r1)
// No stack is used and no register is saved or restored.
//
// Structure: the widest loop permitted by the combined alignment of
// both addresses runs first.  Each loop leaves its remainder in r18
// and falls through to the next narrower loop, ending with a single
// byte tidy-up loop.
//
// Loop idiom: the counter is set to (count - 1) with adds, whose
// condition code is used by bc to skip the loop when count < 1.  A
// first bla then primes the loop condition; the instruction following
// any bla sits in its delay slot and is executed as well.
//
        .text
        .globl  _Memcpy                 // C name
        .globl  _memcpy_                // Fortran name
        .align  16
//
// FORTRAN entry ... r18 is passed by reference ... load it in
// and fall through to the C entry
//
_memcpy_:       ld.l    0(r18), r18     // r18 = n
//
// C entry
//
_Memcpy:        mov     r16, r19        // r19 = working dest; r16 stays s1 for return
        adds    -1, r0, r20             // store -1 in r20 (bla increment)
//
// dispatch on the alignment shared by source and destination
//
        or      r19, r17, r22           // or addresses together
        and     7, r22, r0              // test low 3 bits, result discarded
        bc      aligned8                // skip to 8 byte aligned code
        and     3, r22, r0              // test low 2 bits, result discarded
        bc      aligned4                // skip to 4 byte aligned code
        br      aligned1                // skip to 1 byte aligned code
        nop                             // delay slot of br
//
// code for eight byte alignment ... four way unrolled doubles (32 bytes)
// 38.2 Mbyte/s = full speed if input is cacheable
//
aligned8:       shr     5, r18, r21     // r21 = r18/32
        shl     5, r21, r22             // r22 = bytes handled by this loop
        subs    r18, r22, r18           // r18 = remainder
        adds    -1, r21, r21            // bla does 0,...,r21-1
        bc      aligned4                // skip if r21 < 1
        adds    -8, r19, r19            // prepare for autoinc
        bla     r20, r21, loop8a        // prime the bla loop condition
        adds    -8, r17, r17            // prepare for autoinc (delay slot)
loop8a:         fld.d   8(r17)++, f8    // get 8 bytes
        fld.d   8(r17)++, f10           // get 8 bytes
        fld.d   8(r17)++, f12           // get 8 bytes
        fld.d   8(r17)++, f14           // get 8 bytes
        fst.d   f8, 8(r19)++            // store 8 bytes
        fst.d   f10, 8(r19)++           // store 8 bytes
        fst.d   f12, 8(r19)++           // store 8 bytes
        bla     r20, r21, loop8a        // decrement and branch
        fst.d   f14, 8(r19)++           // store 8 bytes (delay slot)
//
        adds    8, r19, r19             // undo autoinc offset on dest
        adds    8, r17, r17             // undo autoinc offsets and fall thru
//
// code for 4 byte aligned ... 4 way unrolled integer copy (16 bytes)
// 21.8 Mbytes/s = about half speed if input is cacheable
//
aligned4:       shr     4, r18, r21     // r21 = r18/16
        shl     4, r21, r22             // r22 = bytes handled by this loop
        subs    r18, r22, r18           // r18 = remainder
        adds    -1, r21, r21            // bla does 0,...,r21-1
        bc      aligned1                // skip if r21 < 1
        bla     r20, r21, loop4a        // prime the bla loop condition
        nop                             // delay slot
loop4a:         ld.l    0(r17), r22     // get 4 bytes
        ld.l    4(r17), r23             // get 4 bytes
        ld.l    8(r17), r24             // get 4 bytes
        ld.l    12(r17), r25            // get 4 bytes
        adds    16, r17, r17            // increment address
        st.l    r22, 0(r19)             // store 4 bytes
        st.l    r23, 4(r19)             // store 4 bytes
        st.l    r24, 8(r19)             // store 4 bytes
        st.l    r25, 12(r19)            // store 4 bytes
        bla     r20, r21, loop4a        // decrement and branch
        adds    16, r19, r19            // increment address in delay slot
//
// 2 byte aligned ... slower than single bytes ... deleted
//
// code for general alignment ... 4 way unrolled byte copy
// 6.3 Mbytes/s if input is cacheable
//
aligned1:       shr     2, r18, r21     // r21 = r18/4
        shl     2, r21, r22             // r22 = bytes handled by this loop
        subs    r18, r22, r18           // r18 = remainder
        adds    -1, r21, r21            // bla does 0,...,r21-1
        bc      done1a                  // skip if r21 < 1
        bla     r20, r21, loop1a        // prime the bla loop condition
        nop                             // delay slot
loop1a:         ld.b    0(r17), r22     // get byte
        ld.b    1(r17), r23             // get byte
        ld.b    2(r17), r24             // get byte
        ld.b    3(r17), r25             // get byte
        adds    4, r17, r17             // increment address
        st.b    r22, 0(r19)             // store byte
        st.b    r23, 1(r19)             // store byte
        st.b    r24, 2(r19)             // store byte
        st.b    r25, 3(r19)             // store byte
        bla     r20, r21, loop1a        // decrement and branch
        adds    4, r19, r19             // increment address in delay slot
//
// tidy up loop for single byte copy ... r18 is the byte count here
//
done1a:         adds    -1, r18, r18    // bla does 0,...,r18-1
        bc      done                    // skip if r18<1
        bla     r20, r18, loop1b        // prime the bla loop condition
        nop                             // delay slot
loop1b:         ld.b    0(r17), r22     // get byte
        adds    1, r17, r17             // increment address
        st.b    r22, 0(r19)             // store byte
        bla     r20, r18, loop1b        // decrement and branch
        adds    1, r19, r19             // increment address in delay slot
//
// return to caller; r16 still holds s1
//
done:           bri     r1
        nop                             // delay slot of bri