/* ------------------------------------------------------------------ */
/* | Copyright Unpublished, MIPS Computer Systems, Inc.  All Rights | */
/* | Reserved.  This software contains proprietary and confidential | */
/* | information of MIPS and its suppliers.  Use, disclosure or     | */
/* | reproduction is prohibited without the prior express written   | */
/* | consent of MIPS.                                               | */
/* ------------------------------------------------------------------ */
/*  strcpy.s 1.2 */

/* This function is an assembly-code replacement for the libc function
 * strcpy.  It uses the MIPS special instructions "lwl", "lwr", "swl",
 * and "swr", which handle unaligned words.

 * The standard C version of this function is a 5-instruction loop,
 * working one byte at a time:

 * Copy string s2 to s1.  s1 must be large enough.
 * return s1
 *	char *strcpy(s1, s2)
 *	register char *s1, *s2;
 *	{
 *		register char *os1;
 *		os1 = s1;
 *		while (*s1++ = *s2++);
 *		return(os1);
 *	}

 * A better C version is 4 cycles/byte. Loop is unrolled once.
 * char *
 * strcpy(s1, s2)
 * register char *s1, *s2;
 * {
 * 	register char *os1 = s1;
 * 	while (1) {
 * 		register unsigned c;
 * 		c = s2[0];
 * 		s2 += 2;
 * 		s1[0] = c;
 * 		if (c == 0) break;
 * 		c = s2[1-2];
 * 		s1 += 2;
 * 		s1[1-2] = c;
 * 		if (c == 0) break;
 * 	}
 * 	return(os1);
 * }

 * This function starts with an unrolled loop, which uses 5
 * instructions per byte (including the store bytes at the end) for
 * the first few bytes.

 * After filling a word, the first word or portion of a word is saved
 * using a "swl" instruction. If the start of destination string is at
 * a word boundary, this leaves the result valid in the cache. Because
 * this replaces up to 4 store byte instructions, we are still near 3
 * instructions per byte, but there is only one write.
   
 * The inner loop moves 4 bytes in 16 cycles, an average of 4 cycles
 * per byte.  This is 1 cycle faster than the standard C code, the
 * same speed as the unrolled version, and it also leaves the result
 * valid in the cache.
   
 * Finally, when a zero byte is found, the end of the string is stored
 * using store byte instructions.  This adds one instruction per byte
 * for as many as three bytes, but eliminates the up to four cycles of
 * overhead we counted before.
   
 * The end result is that this function is never slower than the C
 * function, is faster by up to 30% in instruction count, uses up to
 * 75% fewer writes, and leaves most of the result valid in the cache.
   
 * There is one caveat to consider: this function is written in
 * assembler code, and as such, cannot be merged using the U-code
 * loader. */

/* Craig Hansen - 3-September-86 */

#include <kxmips.h>

/* It turns out better to think of lwl/lwr and swl/swr as
   smaller-vs-bigger address rather than left-vs-right.
   Such a representation makes the code endian-independent:
   only the four definitions below encode the byte order.

   LWS/SWS ("smaller") are applied to the lowest byte address of a
   possibly unaligned word, LWB/SWB ("bigger") to its highest byte
   address (lowest + 3).  An LWS/LWB pair loads a full unaligned
   word; an SWS/SWB pair stores one.

   The mapping chosen here (lwr/swr at the smaller address) is the
   little-endian one.  A big-endian build would need the opposite
   pairing (lwl/swl at the smaller address). */

/* load / store the part of the word that starts at the smaller address */
#define LWS lwr
/* load / store the part of the word that ends at the bigger address */
#define LWB lwl
#define SWS swr
#define SWB swl

.text

/*
 * char *strcpy(char *s1, const char *s2)
 *
 * Copy the zero-terminated string at a1 to a0, terminator included,
 * and return the original destination address.
 *
 * In:    a0 = destination (s1), a1 = source (s2)
 * Out:   v0 = original a0
 * Uses:  t0-t7 as scratch; a0 is advanced; a1 is not modified.
 *        No stack is used and nothing is called.
 *
 * Method:
 *   1. First word.  The first four source bytes are tested one at a
 *      time.  A zero among the first three ends the copy with byte
 *      stores ($doch0/$doch1/$doch2).  A zero in the fourth byte ends
 *      it with an SWS/SWB pair, i.e. one unaligned word store.
 *      Otherwise SWS alone stores the bytes that fit between a0 and the
 *      end of the destination word containing a0; a0 is then rounded
 *      down to that word and t7 is set one byte before the next
 *      unstored source byte.
 *   2. Inner loop.  Each pass tests the next four source bytes, gathers
 *      them with LWS/LWB (the source may be unaligned) and writes them
 *      with an aligned sw.  A zero in bytes 0-2 leaves through
 *      $doch0-$doch2; a zero in byte 3 is written by the sw itself.
 *
 * The code is hand-scheduled under ".set noreorder": the instruction
 * that follows every branch or jump is its delay slot and executes
 * whether or not the branch is taken.  Do not reorder instructions.
 *
 * All address arithmetic uses the non-trapping addu/addiu/subu forms.
 * The trapping add/addi/sub forms raise an integer overflow exception
 * on signed overflow, and t7 is moved up to three bytes beyond the
 * last source byte actually read, so a string ending just below
 * 0x80000000 could otherwise fault although every byte of it is valid.
 */
LEAF_ENTRY(strcpy)
.set noreorder
	// a0/ destination
	// a1/ source
	move	v0, a0		# a copy of destination address is returned
	// start up first word
	// adjust pointers so that a0 points to next word
	// t7 = a1 adjusted by same amount minus one
	// t0,t1,t2,t3 are filled with 4 consecutive bytes
	// t4 is filled with the same 4 bytes in a single word
	lb	t0, 0(a1)
	ori	t5, a0, 3	# t5 = last byte address of first dest word
	beq	t0, 0, $doch0
	subu	t6, t5, a0	# (delay slot) bytes in 1st word of dest - 1
	lb	t1, 1(a1)
	addu	t7, a1, t6	# source address matching t5 in the destination
	beq	t1, 0, $doch1
	nop
	lb	t2, 2(a1)
	nop
	beq	t2, 0, $doch2
	LWS	t4, 0(a1)	# (delay slot) safe: same word as 0(a1)
	lb	t3, 3(a1)
	LWB	t4, 3(a1)	# fill out word
	beq	t3, 0, $doch3
	SWS	t4, 0(a0)	# (delay slot) store entire or part word
	addiu	a0, t5, 1-4	# a0 = start of first dest word (aligned)

	// inner loop
	// on entry: 1(t7) is the next source byte to copy,
	//           4(a0) is the aligned destination word that receives it
	// offsets written as k+1-4 compensate for t7 having been bumped by 4
1:	lb	t0, 1(t7)
	addiu	t7, t7, 4
	beq	t0, 0, $doch0
	addiu	a0, a0, 4	# (delay slot) a0 = dest word for this group
	lb	t1, 1+1-4(t7)
	nop
	beq	t1, 0, $doch1
	nop
	lb	t2, 2+1-4(t7)
	nop
	beq	t2, 0, $doch2
	LWS	t4, 0+1-4(t7)	# (delay slot) begin gathering source word
	lb	t3, 3+1-4(t7)
	LWB	t4, 3+1-4(t7)	# complete the (possibly unaligned) word
	bne	t3, 0, 1b
	sw	t4, 0(a0)	# (delay slot) aligned store of all four bytes
	// fourth byte was zero: the sw above already wrote the terminator
	j	ra
	nop

	// store four bytes using swl/swr
	// reached only from the first-word code, where SWS t4, 0(a0) has
	// already run in the branch delay slot; SWB stores the remainder
$doch3:	j	ra
	SWB	t4, 3(a0)
	// store up to three bytes, a byte at a time.
	// the labels fall through: $doch2 stores bytes 2,1,0; $doch1 stores
	// bytes 1,0; $doch0 stores byte 0.  The last byte stored in each
	// case is the zero terminator found by the caller of the label.
$doch2:	sb	t2, 2(a0)
$doch1:	sb	t1, 1(a0)
$doch0:	j	ra
	sb	t0, 0(a0)	# (delay slot)

.end strcpy