private/crt32/string/mips/memsetm.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105

/* --------------------------------------------------- */
/* | Copyright (c) 1986 MIPS Computer Systems, Inc.  | */
/* | All Rights Reserved.                            | */
/* --------------------------------------------------- */
/* $Revision: 1.3 $ */

/*
 * char * 
 * memset(s, c, n)
 * 	register char * s;
 * 	register c, n;
 * {
 * 	register char * p = s;
 * 
 * 	while (--n >= 0)
 * 		*s++ = c;
 * 
 * 	return (p);
 * }
 */

/*
 * Copyright 1986 by MIPS Computer Systems, Inc.
 */

#include <kxmips.h>

/* NBPW: number of bytes in one word, i.e. the amount written by one sw. */
#define	NBPW	4

/*
 * memset(dst, c, bcount)
 * set bcount bytes of memory starting at dst to the byte value c
 *
 * MINSET is the smallest byte count for which memset takes the word-store
 * path; shorter requests go directly to the byte-at-a-time loop.
 *
 * Calculating MINSET, assuming 10% cache-miss on non-loop code:
 * Overhead =~ 18 instructions => 28 (30) cycles
 * Byte set =~ 12 (24) cycles/word for 08M44 (08V11)
 * Word set =~ 3 (5) cycles/word for 08M44 (08V11)
 * If I-cache-miss nears 0, MINSET ==> 4 bytes; otherwise, times are:
 * breakeven (MEM) = 28 / (12 - 3) =~ 3 words
 * breakeven (VME) = 30 / (24 - 5)  =~ 1.5 words
 * Since the overhead is pessimistic (worst-case alignment), and many calls
 * will be for well-aligned data, and since Word-set at least leaves
 * the set in the cache, we shade these values (6-12) down to 8 bytes
 */
#define	MINSET	8

/* It turns out better to think of lwl/lwr and swl/swr as
   smaller-vs-bigger address rather than left-vs-right.
   Such a representation makes the code endian-independent.

   Naming: the S suffix is the instruction applied at the smaller address,
   the B suffix the one applied at the bigger address.

   NOTE(review): the mapping below is fixed (smaller address = lwr/swr),
   which is the little-endian arrangement; no big-endian alternative is
   selected anywhere in the part of the file reviewed.  Confirm that this
   file is only built for little-endian targets. */

#define LWS lwr
#define LWB lwl
#define SWS swr
#define SWB swl

/*
 * memset(dst, c, bcount)
 *
 * In:    a0 = dst, a1 = c, a2 = bcount
 * Out:   v0 = original dst
 * Uses:  v1, t0, a3 as scratch; a0, a1 and a2 are modified
 *
 * bcount is compared with signed branches (blt/ble), so a count that is
 * negative when viewed as a signed word writes nothing.
 *
 * Requests shorter than MINSET bytes go straight to the byte loop.
 * Longer requests replicate the fill byte across a word, store the
 * unaligned head with one partial-word store, then fill in 8-byte
 * chunks, at most one further word, and finally the trailing bytes.
 *
 * The fill value is reduced to its low 8 bits before it is replicated.
 * Without the mask, any bits above bit 7 (for example from a
 * sign-extended char such as 0xfffffffe, or a value such as 0x102) are
 * or-ed into the neighbouring bytes of every word store, so the word
 * path would write different data than the byte path, whose sb stores
 * only the low byte.  Masking first also lets values such as 0x100 take
 * the zero-fill shortcut.
 *
 * The BDSLOT tags are kept from the original source; they appear on the
 * instruction that precedes a branch (presumably the one the assembler
 * is expected to place in the branch delay slot).  The mask is therefore
 * inserted ahead of the tagged subu rather than between it and the
 * branch.
 */
LEAF_ENTRY(memset)
	move	v0,a0			# return first argument; BDSLOT
	blt	a2,MINSET,byteset	# short set: byte loop only
	and	a1,0xff			# keep only the fill byte
	subu	v1,zero,a0		# number of bytes til aligned; BDSLOT
	beq	a1,$0,1f		# make memset(s, 0, n) faster
	sll	t0,a1,8
	or	a1,t0			# fill byte in the low two bytes
	sll	t0,a1,16
	or	a1,t0			# fill byte in all four bytes
1:	and	v1,NBPW-1		# v1 = (-dst) mod word size
	subu	a2,v1			# adjust count; BDSLOT
	beq	v1,zero,blkset		# already aligned
	SWS	a1,0(a0)		# partial store up to the word boundary
	addu	a0,v1			# dst is now word aligned

/*
 * set 8 byte, aligned block (no point in unrolling further,
 * since maximum write rate in M/500 is two cycles/word write)
 */
blkset:
	and	t0,a2,NBPW+NBPW-1	# count after by-8-byte loop done
	subu	a3,a2,t0		# total in 8 byte chunks; BDSLOT
	beq	a2,t0,wordset		# less than 8 bytes to set
	addu	a3,a0			# dst endpoint
1:	addu	a0,NBPW+NBPW		# advance first, store behind the pointer
	sw	a1,-NBPW-NBPW(a0)
	sw	a1,-NBPW(a0)
	bne	a0,a3,1b
	move	a2,t0			# set end-of loop count

/*
 * do a word (if required) this is not a loop since loop above
 * guarantees that at most one word must be written here.
 */
wordset:
	and	t0,a2,NBPW		# count after by-word non-loop done
	subu	a2,t0			# adjust count; BDSLOT
	beq	t0,zero,byteset		# less than word to set
	sw	a1,0(a0)
	addu	a0,NBPW

/*
 * set the remaining bytes one at a time; this is also the whole job
 * for requests shorter than MINSET.
 */
byteset:
	addu	a3,a2,a0		# dst endpoint; BDSLOT
	ble	a2,zero,setdone		# nothing left to set
1:	addu	a0,1
	sb	a1,-1(a0)		# stores the low byte of a1 only
	bne	a0,a3,1b
setdone:
	j	ra
.end memset