diff options
Diffstat (limited to 'private/crt32/string/mips/strcpym.s')
-rw-r--r-- | private/crt32/string/mips/strcpym.s | 148 |
1 files changed, 148 insertions, 0 deletions
diff --git a/private/crt32/string/mips/strcpym.s b/private/crt32/string/mips/strcpym.s new file mode 100644 index 000000000..4f5bc416a --- /dev/null +++ b/private/crt32/string/mips/strcpym.s @@ -0,0 +1,148 @@ +/* ------------------------------------------------------------------ */ +/* | Copyright Unpublished, MIPS Computer Systems, Inc. All Rights | */ +/* | Reserved. This software contains proprietary and confidential | */ +/* | information of MIPS and its suppliers. Use, disclosure or | */ +/* | reproduction is prohibited without the prior express written | */ +/* | consent of MIPS. | */ +/* ------------------------------------------------------------------ */ +/* strcpy.s 1.2 */ + +/* This function is an assembly-code replacement for the libc function + * strcpy. It uses the MIPS special instructions "lwl", "lwr", "swl", + * and "swr", which handle unaligned words. + + * The standard C version of this function is a 5-instruction loop, + * working one byte at a time: + + * Copy string s2 to s1. s1 must be large enough. + * return s1 + * char *strcpy(s1, s2) + * register char *s1, *s2; + * { + * register char *os1; + * os1 = s1; + * while (*s1++ = *s2++); + * return(os1); + * } + + * A better C version is 4 cycles/byte. Loop is unrolled once. + * char * + * strcpy(s1, s2) + * register char *s1, *s2; + * { + * register char *os1 = s1; + * while (1) { + * register unsigned c; + * c = s2[0]; + * s2 += 2; + * s1[0] = c; + * if (c == 0) break; + * c = s2[1-2]; + * s1 += 2; + * s1[1-2] = c; + * if (c == 0) break; + * } + * return(os1); + * } + + * This function starts with an unrolled loop, which uses 5 + * instructions per byte (including the store bytes at the end) for + * the first few bytes. + + * After filling a word, the first word or portion of a word is saved + * using a "swl" instruction. If the start of destination string is at + * a word boundary, this leaves the result valid in the cache. 
Because + * this replaces up to 4 store byte instructions, we are still near 3 + * instructions per byte, but there is only one write. + + * The inner loop moves 4 bytes in 16 cycles, an average of 4 cycles + * per byte. This is 1 cycle faster than the standard C code, the + * same speed as the unrolled version, and it also leaves the result + * valid in the cache. + + * Finally, when a zero byte is found, the end of the string is stored + * using store byte instructions. This adds one instruction per byte + * for as much as three bytes, but eliminates the up to four cycles of + * overhead we counted before. + + * The end result is that this function is never slower than the C + * function, is faster by up to 30% in instruction count, uses up to + * 75% fewer writes, and leaves most of the result valid in the cache. + + * There is one caveat to consider: this function is written in + * assembler code, and as such, cannot be merged using the U-code + * loader. */ + +/* Craig Hansen - 3-September-86 */ + +#include <kxmips.h> + +/* It turns out better to think of lwl/lwr and swl/swr as + smaller-vs-bigger address rather than left-vs-right. + Such a representation makes the code endian-independent. 
*/ + +#define LWS lwr /* load word, smaller-address part */ +#define LWB lwl /* load word, bigger-address part */ +#define SWS swr /* store word, smaller-address part */ +#define SWB swl /* store word, bigger-address part */ + +.text + +LEAF_ENTRY(strcpy) /* strcpy: copy the zero-terminated string at a1 to a0 and return the original a0 in v0 */ +.set noreorder /* branch and jump delay slots below are filled by hand; the instruction after each branch or jump still executes */ + // a0/ destination + // a1/ source + move v0, a0 # a copy of destination address is returned + // start up first word + // adjust pointers so that a0 points to next word + // t7 = a1 adjusted by same amount minus one + // t0,t1,t2,t3 are filled with 4 consecutive bytes + // t4 is filled with the same 4 bytes in a single word + lb t0, 0(a1) # t0 = source byte 0 + ori t5, a0, 3 # get an early start: t5 = a0 | 3, the last byte address in the word holding a0 + beq t0, 0, $doch0 # empty string: only the terminator is stored + sub t6, t5, a0 # number of char in 1st word of dest - 1 + lb t1, 1(a1) # t1 = source byte 1 + add t7, a1, t6 # offset starting point for source string + beq t1, 0, $doch1 # terminator is byte 1: store t1 and t0 bytewise + nop # branch delay slot, nothing useful to do + lb t2, 2(a1) # t2 = source byte 2 + nop # presumably spaces the load from the test of t2 (load delay) - confirm for target CPU + beq t2, 0, $doch2 # terminator is byte 2: store t2, t1 and t0 bytewise + LWS t4, 0(a1) # safe: always in same word as 0(a1) + lb t3, 3(a1) # t3 = source byte 3 + LWB t4, 3(a1) # fill out word + beq t3, 0, $doch3 # terminator is byte 3: $doch3 completes the 4-byte store + SWS t4, 0(a0) # store entire or part word + addi a0, t5, 1-4 # adjust destination ptr: a0 = t5 - 3; the loop adds 4 before it stores + + // inner loop +1: lb t0, 1(t7) # t0 = byte 0 of the next 4-byte group + addi t7, 4 # advance source cursor; the offsets below subtract 4 to compensate + beq t0, 0, $doch0 # group starts with the terminator + addi a0, 4 # delay slot: advance destination to the word being filled, on both branch outcomes + lb t1, 1+1-4(t7) # t1 = byte 1 of the group + nop # presumably spaces the load from the test of t1 (load delay) - confirm for target CPU + beq t1, 0, $doch1 # terminator is byte 1 + nop # branch delay slot, nothing useful to do + lb t2, 2+1-4(t7) # t2 = byte 2 of the group + nop # presumably spaces the load from the test of t2 (load delay) - confirm for target CPU + beq t2, 0, $doch2 # terminator is byte 2 + LWS t4, 0+1-4(t7) # delay slot: begin gathering the 4 source bytes into t4 + lb t3, 3+1-4(t7) # t3 = byte 3 of the group + LWB t4, 3+1-4(t7) # complete the word in t4 + bne t3, 0, 1b # no terminator in this group: keep looping + sw t4, 0(a0) # delay slot: store the whole group, also on loop exit where byte 3 is the terminator + j ra # terminator already stored by the sw above: return + nop # jump delay slot, nothing left to do + + // store four bytes using swl/swr +$doch3: j ra # reached only from the first-word code, after its SWS has run + SWB t4, 3(a0) # delay slot: bigger-address part of the 4-byte store begun by that SWS + // store up to three bytes, a byte at a time. +$doch2: sb t2, 2(a0) # falls through to store bytes 1 and 0 +$doch1: sb t1, 1(a0) # falls through to store byte 0 +$doch0: j ra # return to caller + sb t0, 0(a0) # delay slot: byte 0 is stored while returning + +.end strcpy |