summaryrefslogtreecommitdiffstats
path: root/private/crt32/misc/alpha/sfill.s
blob: 35814b32185c3855ed28e3f170f5ad16790a3785 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
 #++
 #   Copyright 1991, 1994, Digital Equipment Corporation
 # 
 #      ots_fill(char *dstptr, long dstlen, unsigned char fill)
 # 
 #      Fill dstlen bytes of memory at *dstptr with "fill"
 # 
 #       Special conventions: No stack space, r16-r19 and r27-r28 ONLY,
 #      no linkage pointer required.
 #      (Warning: The auto-loader potentially takes some regs across
 #      the call if this is being used in a shared lib. environment.)
 # 
 #   This is a GEM support routine for filling memory with a specified value,
 #   basically identical to the System V routine memset, with the last two
 #   parameters reversed.  This is optimized for extremely high performance
 #   both for small blocks (string padding) and large blocks (memory fill).
 #   In order to reduce overhead for small cases, they are retired as quickly
 #   as possible; more case analysis is reserved for cases which will do
 #   more.
 #
 #   This version of OTS_FILL provides longword granularity for Alpha.
 # 
 #   011          30 Aug 1994   WBN     Longword granularity version based on
 #                              OTS_FILL_ALPHA.M64 edit 010.
 #--

#include        "ots_defs.hs"

        # _OtsFill - store the fill byte into len consecutive bytes at dst.
        #
        # r16 = dst (any byte alignment)
        # r17 = len in bytes; len = 0 returns without touching memory.
        #       Branches on len - 4 are signed, so len is presumably
        #       non-negative - confirm against callers.
        # r18 = fill byte.  It is replicated with shifts and ORs only, so
        #       bits above the low byte are not masked off here
        #       (presumably the caller passes a clean byte - confirm).
        # destroys r16-r19, r27-r28
        # No result register is written; every exit is ret r31, (r26).
        #
        # Memory is accessed with ldl/stl on LW-aligned addresses, ldq/stq
        # on QW-aligned addresses, and ldq_u/stq_u.  Partial first and last
        # pieces are merged into the existing contents by load, mask, store.
        #
        # Case map:
        #   len < 4, inside one LW     straight-line code after the entry
        #   len < 4, spanning two LWs  double
        #   len = 4, LW-aligned dst    simple
        #   dst in lower LW of a QW    quad (shortq when the fill ends
        #                              inside that QW), then join
        #   dst in upper LW of a QW    longs, then join
        #   remaining whole QWs        unroll (4 per pass), then short
        #   trailing 1-7 bytes         last / lastq

        .globl  _OtsFill
        .ent    _OtsFill
_OtsFill:
        # r28 is used explicitly as a scratch register below, and the
        # instruction order is significant (for example, in longs the
        # mskqh must see r18 before it is widened to 8 bytes).
        .set noat
        .set noreorder
        # Frame size 0, return address in r26.
        .frame  sp,0,r26
        .prologue       0

        sll     r18, 8, r19             # r19 = fill << 8 (start replicating the byte)
        beq     r17, done               # No memory refs if len=0
        subq    r17, 4, r28             # r28 = len - 4
        or      r19, r18, r18           # r18 = fill in bytes 0-1
        sll     r18, 16, r19            # r19 = that pair moved up to bytes 2-3
        and     r16, 3, r27             # r27 = dst offset within its LW (0-3)
        or      r19, r18, r18           # r18 = fill in bytes 0-3
        andnot  r16, 3, r16             # r16 = dst rounded down to a LW boundary
        addq    r27, r28, r17           # r17 = offset + len - 4
        bge     r28, geq4               # Lengths >= 4 may not need load
        ldl     r28, (r16)              # len < 4: load LW holding the first byte
        bgt     r17, double             # offset + len > 4: fill crosses to next LW
        addq    r17, 4, r17             # r17 = offset + len = endpoint within LW
        xor     r28, r18, r28           # r28 = dest ^ fill
        mskql   r28, r27, r27           # r27 = (dest ^ fill), bytes below startpoint only
        mskqh   r28, r17, r28           # r28 = (dest ^ fill), bytes from endpoint up only
        xor     r27, r18, r27           # r27 = dest below startpoint, fill in rest of LW
        xor     r28, r27, r27           # Fill cancels above endpoint: dest, fill, dest
        stl     r27, (r16)              # Store the merged LW
        ret     r31, (r26)

 # Length < 4 and the fill straddles two LWs.
 # r16-> first LW, r17 = endpoint offset within second LW (offset+len-4),
 # r18 = fill in bytes 0-3, r27 = start offset in first LW,
 # r28 = contents of first LW.

double: mskqh   r18, r27, r19           # r19 = fill with bytes below startpoint cleared
        mskql   r28, r27, r28           # Keep only dest bytes below startpoint in first LW
        ldl     r27, 4(r16)             # Load second LW of dst (start offset no longer needed)
        mskql   r18, r17, r18           # Keep only fill bytes below the endpoint
        or      r28, r19, r28           # Insert fill in first LW
        stl     r28, (r16)              # Store first LW
        mskqh   r27, r17, r27           # Clear up to endpoint in second LW
        or      r27, r18, r27           # Insert fill in second LW
        stl     r27, 4(r16)             # Store second LW
        ret     r31, (r26)

 # Come here when the length to be filled is >= 4.
 # r16-> dst aligned to LW
 # r17 = alignment + length - 4
 # r18 = fill in bytes 0-3
 # r27 = dst alignment within LW
 # r28 = length-4

        #.align quad

geq4:   and     r16, 4, r28             # r28 = 4 when dst lies in the upper LW of its QW
        beq     r17, simple             # offset = 0 and len = 4: single aligned LW
        sll     r18, 32, r19            # r19 = LW fill moved to bytes 4-7
        bne     r28, longs              # Upper LW: store that LW first, then QWs
quad:   subq    r17, 4, r17             # r17 = offset + len - 8: does dest end in first QW?
        or      r18, r19, r18           # Fill in bytes 0-7
        blt     r17, shortq             # Ends within first QW
        mskqh   r18, r27, r28           # r28 = fill with bytes below startpoint cleared
        beq     r27, wh_qw              # offset = 0: store a whole QW, no load needed
        ldq     r19, (r16)              # Load first QW of dest
        mskql   r19, r27, r19           # Keep only dest bytes below startpoint
        or      r19, r28, r28           # Combine first QW with fill
wh_qw:  stq     r28, (r16)              # Store first QW of dest
        br      r31, join               # Go fill rest of string

simple: stl     r18, (r16)              # Single aligned LW
        ret     r31, (r26)

 # Fill starts in the lower LW and ends inside the same QW.
 # r17 = offset + len - 8 (negative); the mask instruction uses only its
 # low three bits, which equal the endpoint offset within the QW.

shortq: ldq     r28, (r16)              # Load QW of dest
        xor     r28, r18, r28           # r28 = dest ^ fill
        mskql   r28, r27, r27           # r27 = (dest ^ fill), bytes below startpoint only
        mskqh   r28, r17, r28           # r28 = (dest ^ fill), bytes from endpoint up only
        xor     r27, r18, r27           # r27 = dest below startpoint, fill elsewhere
        xor     r28, r27, r27           # Fill cancels above endpoint: dest, fill, dest
        stq     r27, (r16)              # Store
        ret     r31, (r26)

 # Dst lies in the upper LW of its QW: store that LW alone, then continue
 # with whole QWs.  r18 still holds the fill in bytes 0-3 only when the
 # first mskqh executes.

longs:  mskqh   r18, r27, r28           # r28 = LW fill with bytes below startpoint cleared
        or      r18, r19, r18           # Fill in bytes 0-7
        beq     r27, wh_lw              # offset = 0: store a whole LW, no load needed
        ldl     r19, (r16)              # Load first LW of dest
        mskql   r19, r27, r19           # Keep only dest bytes below startpoint
        or      r19, r28, r28           # Combine first LW with fill
wh_lw:  stl     r28, (r16)              # Store first LW of dest

 # Common tail.  On arrival:
 # r16-> the piece just stored; the next QW to fill is addressed as
 #       8(r16) (stq_u/ldq_u drop the low three address bits, which covers
 #       the longs path where r16 has bit 2 set)
 # r17 = bytes still to fill beyond the piece just stored
 # r18 = fill in bytes 0-7

join:   subq    r17, 32, r17            # r17 = remaining - 32: at least 4 more quadwords?
        and     r17, 24, r27            # r27 = 8 * (whole QWs left over after groups of 4)
        bge     r17, unroll             # Taken branch for long strings
short:  and     r17, 7, r17             # r17 = trailing odd bytes (0-7)
        beq     r27, last               # Skip if no more whole QWs
        stq_u   r18, 8(r16)             # Fill one...
        subq    r27, 16, r27            # Map 8/16/24 to -8/0/8
        addq    r16, 8, r16             # Update dest pointer
        blt     r27, last               # Only one whole QW was left
        #stall
        stq_u   r18, 8(r16)             # Fill two...
        addq    r16, 8, r16             # Update dest pointer
        nop
        beq     r27, last               # Only two whole QWs were left
        stq_u   r18, 8(r16)             # Fill three...
        addq    r16, 8, r16             # Update dest pointer
last:   beq     r17, done               # Finished if no odd bytes
        ldq_u   r27, 8(r16)             # Load last QW of dest
        subq    r17, 4, r28             # More than a LW left?
        andnot  r16, 7, r16             # QW-align r16 (bit 2 may be set via longs) for stl/stq
        mskql   r18, r17, r18           # Keep only the low r17 fill bytes
        #stall
        mskqh   r27, r17, r27           # Clear up to endpoint in last QW
        #stall
        or      r27, r18, r27           # Combine fill with last QW
        bgt     r28, lastq              # 5-7 odd bytes: go store a QW
        stl     r27, 8(r16)             # 1-4 odd bytes: store the low LW only
done:   ret     r31, (r26)

lastq:  stq     r27, 8(r16)             # QW store for last piece
        ret     r31, (r26)

 # Unrolled loop: entered with r17 = remaining - 32 >= 0.  Subtracting 32
 # leaves bits 3-4 of r17 unchanged, so the r27 value computed at join is
 # still correct when control returns to short.

unroll: stq_u   r18, 8(r16)             # Store 4 QWs per iteration
        stq_u   r18, 16(r16)
        stq_u   r18, 24(r16)
        subq    r17, 32, r17            # Decrement remaining count
        stq_u   r18, 32(r16)
        addq    r16, 32, r16            # Update dest pointer
        bge     r17, unroll             # Repeat while 32 or more bytes remain
        br      r31, short              # Then handle leftovers

        .set at
        .set reorder
        .end    _OtsFill