1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
#++
# Copyright 1991, 1994, Digital Equipment Corporation
#
# ots_zero(char *dstptr, long dstlen)
#
# Zero dstlen bytes of memory at *dstptr
#
# Special conventions: No stack space, r16-r17 and r27-r28 ONLY,
# no linkage pointer required.
# (Warning: The auto-loader potentially takes some regs across
# the call if this is being used in a shared lib. environment.)
#
# This is a GEM support routine for zeroing a region of memory. It is
# basically identical to BSD's bzero, though it has limited register
# conventions to allow it to work better with compiled code. (Note that
# this is just a stripped down version of ots_fill.)
#
# This is optimized for extremely high performance both for small and
# large blocks. In order to reduce overhead for small cases, they are
# retired as quickly as possible; more case analysis is reserved
# for cases which will do more.
#
# This version of OTS_ZERO provides longword granularity for Alpha.
#
# 012 30 Aug 1994 WBN Longword granularity version based on
# OTS_ZERO_ALPHA.M64 edit 011.
#--
#include "ots_defs.hs"
# r16 = dst
# r17 = len
# destroys r16-r17, r27-r28
#
# Overview of the paths below (LW = 4-byte longword, QW = 8-byte quadword):
#   len = 0               -> return without touching memory
#   len-4 negative        -> read-modify-write of one LW, or of two LWs
#                            when the bytes straddle a LW boundary (double)
#   len = 4, aligned      -> one plain LW store (simple)
#   otherwise             -> partial or whole first LW/QW, then whole QWs
#                            (join, unroll, short), then a masked final
#                            LW or QW (last, lastq)
# Memory is only ever written in units of whole LWs that contain at least
# one byte of the region; bytes outside the region inside those LWs are
# reloaded and written back unchanged.
# r31 supplies the zero value for every store of a fully cleared unit.
.globl _OtsZero
.ent _OtsZero
_OtsZero:
# The code names r28 explicitly and depends on the exact instruction order
# written here, hence noat and noreorder (restored at the end of the routine).
.set noat
.set noreorder
.frame sp,0,r26
.prologue 0
# Entry: split dst into a LW-aligned pointer (r16) and a byte offset (r27),
# and form both len-4 (r28) and offset+len-4 (r17) for the tests that follow.
beq r17, done # No memory refs if len=0
subq r17, 4, r28 # Length-4
and r16, 3, r27 # Dst alignment (0-3)
andnot r16, 3, r16 # LW aligned dst pointer
addq r27, r28, r17 # Alignment + length - 4
bge r28, geq4 # Lengths >= 4 may not need load
# Reached when len-4 is negative (len of 1 to 3 for a positive length).
# r17 > 0 means offset+len exceeds 4, so the bytes spill into the next LW.
ldl r28, (r16) # Load first LW of dst
bgt r17, double # Skip if it crosses to next LW
# Single-LW case: r17 becomes the end offset (offset+len) inside the LW.
# Keep the bytes below the start and the bytes at or above the end, merge
# them, and store. Only the low LW of the merged value is written by stl.
addq r17, 4, r17 # Find endpoint within LW
mskql r28, r27, r27 # Clear from startpoint thru 7
mskqh r28, r17, r28 # Clear from 0 to endpoint
or r28, r27, r27 # Combine dest parts
stl r27, (r16)
ret r31, (r26)
# Two-LW case: the first LW keeps only the bytes below the start offset,
# the second LW loses its first r17 bytes (r17 = offset+len-4 here).
double: mskql r28, r27, r28 # Clear from startpoint in first LW
ldl r27, 4(r16) # Load second LW of dst
stl r28, (r16)
mskqh r27, r17, r27 # Clear up to endpoint in second LW
stl r27, 4(r16)
ret r31, (r26)
# Come here if length to be zeroed is >= 4.
# r16-> dst aligned to LW
# r17 = alignment + length - 4
# r27 = dst alignment within LW
# r28 = length-4
#.align quad
# r28 is recomputed as bit 2 of the pointer: zero when the first LW is the
# low half of its QW, nonzero when it is the high half.
# r17 = 0 can only happen with offset 0 and len 4 (one aligned LW).
geq4: and r16, 4, r28 # Which LW in QW to store first?
beq r17, simple # Go handle single aligned LW
bne r28, longs # Go use QW stores
# First LW is the low half of a QW, so r16 is QW aligned here.
# r17 becomes offset+len-8: negative when the region ends inside this QW,
# otherwise the number of bytes that remain beyond this QW.
quad: subq r17, 4, r17 # Does dest end in first QW?
blt r17, shortq # Ends within first QW
# With offset 0 the whole QW is cleared and r27 (which is 0) is the value
# stored; otherwise the bytes below the start offset are preserved.
beq r27, wh_qw # Store a whole QW
ldq r28, (r16) # Load first QW of dest
mskql r28, r27, r27 # Clear from startpoint
wh_qw: stq r27, (r16) # Store first QW of dest
br r31, join # Go clear rest of string
simple: stl r31, (r16) # Single aligned LW
ret r31, (r26)
# Region starts and ends inside one QW. mskqh uses only the low 3 bits of
# r17, which equal the end offset offset+len (5 to 7 on this path).
shortq: ldq r28, (r16) # Load QW of dest
mskql r28, r27, r27 # Clear from startpoint thru 7
mskqh r28, r17, r28 # Clear from 0 up to endpoint
or r28, r27, r27 # Merge
stq r27, (r16) # Store
ret r31, (r26)
# First LW is the high half of a QW: handle just that LW so that every
# following store can be a QW. r16 stays at QW+4 on this path; the stq_u
# and ldq_u used from join onward ignore the low 3 address bits, so
# 8(r16) still addresses the next QW.
longs: beq r27, wh_lw # Store a whole LW
ldl r28, (r16) # Load first LW of dest
mskql r28, r27, r27 # Clear from startpoint
wh_lw: stl r27, (r16) # Store first LW of dest
# Common tail. On arrival r17 = bytes still to clear, starting at the QW
# addressed by 8(r16). After the subtract, r17 >= 0 means at least 32
# bytes (4 QWs) remain. r27 = 8 * (whole QWs left over after the groups
# of four); subtracting further multiples of 32 in unroll does not change
# those bits, so r27 is still valid when unroll branches back to short.
join: subq r17, 32, r17 # At least 4 more quadwords?
and r17, 24, r27 # How many after multiple of 4?
bge r17, unroll # Taken branch for long strings
# Zero to three whole QWs remain (r27 = 0, 8, 16 or 24), followed by
# r17 & 7 trailing bytes. r16 advances by 8 per stored QW so that 8(r16)
# always addresses the next QW to handle.
short: and r17, 7, r17 # How many odd bytes?
beq r27, last # Skip if no more whole QWs
stq_u r31, 8(r16) # Clear one...
subq r27, 16, r27 # Map 8/16/24 to -8/0/8
addq r16, 8, r16 # Update dest pointer
blt r27, last # Skip if no more whole QWs
#stall
stq_u r31, 8(r16) # Clear two...
addq r16, 8, r16 # Update dest pointer
# NOTE(review): the nop has no architectural effect; presumably it is
# padding for instruction scheduling - confirm before removing.
nop
beq r27, last # Skip if no more whole QWs
stq_u r31, 8(r16) # Clear three...
addq r16, 8, r16 # Update dest pointer
# Trailing 1 to 7 bytes: load the QW that holds them, clear its first r17
# bytes, and write back only the low LW when r17 <= 4 so that the high LW
# of that QW is never written. r16 is rounded down to a QW boundary first
# because stl and stq need an aligned address.
last: beq r17, done # Finished if no odd bytes
ldq_u r27, 8(r16) # Load last QW of dst
subq r17, 4, r28 # More than a LW left?
andnot r16, 7, r16 # Clean pointer for STL
mskqh r27, r17, r27 # Clear up to endpoint
bgt r28, lastq # Go store a QW
stl r27, 8(r16) # LW store for last piece
done: ret r31, (r26)
lastq: stq r27, 8(r16) # QW store for last piece
ret r31, (r26)
# Main loop: clear 32 bytes per pass at 8(r16) through 32(r16), then
# advance r16 by 32. The loop repeats while the remaining count, already
# biased by -32 at join, stays nonnegative.
unroll: stq_u r31, 8(r16) # Store 4 QWs per iteration
stq_u r31, 16(r16)
stq_u r31, 24(r16)
subq r17, 32, r17 # Decrement remaining count
stq_u r31, 32(r16)
addq r16, 32, r16 # Update dest pointer
bge r17, unroll # Repeat until done
br r31, short # Then handle leftovers
.set at
.set reorder
.end _OtsZero
|