summaryrefslogtreecommitdiffstats
path: root/private/crt32/string/mips/strcpym.s
diff options
context:
space:
mode:
Diffstat (limited to 'private/crt32/string/mips/strcpym.s')
-rw-r--r--private/crt32/string/mips/strcpym.s148
1 files changed, 148 insertions, 0 deletions
diff --git a/private/crt32/string/mips/strcpym.s b/private/crt32/string/mips/strcpym.s
new file mode 100644
index 000000000..4f5bc416a
--- /dev/null
+++ b/private/crt32/string/mips/strcpym.s
@@ -0,0 +1,148 @@
+/* ------------------------------------------------------------------ */
+/* | Copyright Unpublished, MIPS Computer Systems, Inc. All Rights | */
+/* | Reserved. This software contains proprietary and confidential | */
+/* | information of MIPS and its suppliers. Use, disclosure or | */
+/* | reproduction is prohibited without the prior express written | */
+/* | consent of MIPS. | */
+/* ------------------------------------------------------------------ */
+/* strcpy.s 1.2 */
+
+/* This function is an assembly-code replacement for the libc function
+ * strcpy. It uses the MIPS special instructions "lwl", "lwr", "swl",
+ * and "swr", which handle unaligned words.
+
+ * The standard C version of this function is a 5-instruction loop,
+ * working one byte at a time:
+
+ * Copy string s2 to s1. s1 must be large enough.
+ * return s1
+ * char *strcpy(s1, s2)
+ * register char *s1, *s2;
+ * {
+ * register char *os1;
+ * os1 = s1;
+ * while (*s1++ = *s2++);
+ * return(os1);
+ * }
+
+ * A better C version is 4 cycles/byte. Loop is unrolled once.
+ * char *
+ * strcpy(s1, s2)
+ * register char *s1, *s2;
+ * {
+ * register char *os1 = s1;
+ * while (1) {
+ * register unsigned c;
+ * c = s2[0];
+ * s2 += 2;
+ * s1[0] = c;
+ * if (c == 0) break;
+ * c = s2[1-2];
+ * s1 += 2;
+ * s1[1-2] = c;
+ * if (c == 0) break;
+ * }
+ * return(os1);
+ * }
+
+ * This function starts with an unrolled loop, which uses 5
+ * instructions per byte (including the store bytes at the end) for
+ * the first few bytes.
+
+ * After filling a word, the first word or portion of a word is saved
+ * using a "swl" instruction (the SWS macro in the code below, which
+ * this file maps to "swr"). If the start of destination string is at
+ * a word boundary, this leaves the result valid in the cache. Because
+ * this replaces up to 4 store byte instructions, we are still near 3
+ * instructions per byte, but there is only one write.
+
+ * The inner loop moves 4 bytes in 16 cycles, an average of 4 cycles
+ * per byte. This is 1 cycle faster than the standard C code, the
+ * same speed as the unrolled version, and it also leaves the result
+ * valid in the cache.
+
+ * Finally, when a zero byte is found, the end of the string is stored
+ * using store byte instructions. This adds one instruction per byte
+ * for as much as three bytes, but eliminates the up to four cycles of
+ * overhead we counted before.
+
+ * The end result is that this function is never slower than the C
+ * function, is faster by up to 30% in instruction count, uses up to
+ * 75% fewer writes, and leaves most of the result valid in the cache.
+
+ * There is one caveat to consider: this function is written in
+ * assembler code, and as such, cannot be merged using the U-code
+ * loader. */
+
+/* Craig Hansen - 3-September-86 */
+
+#include <kxmips.h>
+
+/* It turns out better to think of lwl/lwr and swl/swr as
+ smaller-vs-bigger address rather than left-vs-right.
+ Such a representation makes the code endian-independent.
+
+ LWS/SWS are the load/store applied to the smaller (first) byte
+ address of a possibly unaligned word; LWB/SWB are applied to the
+ bigger (last) byte address, i.e. the first address plus 3.
+ The definitions below bind "smaller" to lwr/swr and "bigger" to
+ lwl/swl.
+ NOTE(review): that is the pairing used on little-endian MIPS; a
+ big-endian build would presumably swap the four definitions, but
+ no such variant is visible in this file -- confirm. */
+
+#define LWS lwr
+#define LWB lwl
+#define SWS swr
+#define SWB swl
+
+.text
+
+LEAF_ENTRY(strcpy)
+.set noreorder
+ // char *strcpy(char *s1, const char *s2)
+ //
+ // Copies the NUL-terminated string at a1 to a0, terminator included,
+ // and returns the original destination pointer in v0.
+ //
+ // a0/ destination
+ // a1/ source
+ //
+ // Assembled with .set noreorder: the instruction written after each
+ // branch or jump sits in its delay slot and executes whether or not
+ // the branch is taken.  Several stores and loads below rely on that.
+ //
+ // All address arithmetic uses subu/addu/addiu.  They compute the same
+ // values as sub/add/addi, but the latter raise an integer overflow
+ // exception when a result crosses 0x7fffffff, which can happen for a
+ // valid string that lies just below that address.
+ move v0, a0 # a copy of destination address is returned
+ // start up first word
+ // adjust pointers so that a0 points to next word
+ // t7 = a1 adjusted by same amount minus one
+ // t0,t1,t2,t3 are filled with 4 consecutive bytes
+ // t4 is filled with the same 4 bytes in a single word
+ lb t0, 0(a1)
+ ori t5, a0, 3 # get an early start: t5 = last byte address of first dest word
+ beq t0, 0, $doch0
+ subu t6, t5, a0 # (delay slot) number of char in 1st word of dest - 1
+ lb t1, 1(a1)
+ addu t7, a1, t6 # offset starting point for source string
+ beq t1, 0, $doch1
+ nop
+ lb t2, 2(a1)
+ nop
+ beq t2, 0, $doch2
+ LWS t4, 0(a1) # (delay slot) safe: always in same word as 0(a1)
+ lb t3, 3(a1)
+ LWB t4, 3(a1) # fill out word
+ beq t3, 0, $doch3
+ SWS t4, 0(a0) # (delay slot) store entire or part word
+ // t5 - 3 is the word-aligned address of the first destination word;
+ // the loop adds 4 to a0 before every use.
+ addiu a0, t5, 1-4 # adjust destination ptr
+
+ // inner loop
+ // On entry 1(t7) is the first source byte that has not been stored
+ // yet.  t7 is advanced by 4 right after the first load, so the
+ // offsets written n+1-4 below address byte n of the current group.
+1: lb t0, 1(t7)
+ addiu t7, t7, 4
+ beq t0, 0, $doch0
+ addiu a0, a0, 4 # (delay slot) a0 = aligned dest word for this group
+ lb t1, 1+1-4(t7)
+ nop
+ beq t1, 0, $doch1
+ nop
+ lb t2, 2+1-4(t7)
+ nop
+ beq t2, 0, $doch2
+ LWS t4, 0+1-4(t7) # (delay slot) start assembling the 4 source bytes
+ lb t3, 3+1-4(t7)
+ LWB t4, 3+1-4(t7) # complete the word; byte 3 was already read above
+ bne t3, 0, 1b
+ sw t4, 0(a0) # (delay slot) aligned store, done on exit as well
+ // fourth byte was the terminator and has just been stored by the sw
+ j ra
+ nop
+
+ // store four bytes using swl/swr
+ // Reached only from the first-word code, where the SWS in the branch
+ // delay slot has already written the lower-addressed part.
+$doch3: j ra
+ SWB t4, 3(a0) # (delay slot)
+ // store up to three bytes, a byte at a time.
+ // Entry points fall through, so the terminator found in tN is stored
+ // together with every byte before it in the current group.
+$doch2: sb t2, 2(a0)
+$doch1: sb t1, 1(a0)
+$doch0: j ra
+ sb t0, 0(a0) # (delay slot)
+
+.end strcpy