summaryrefslogblamecommitdiffstats
path: root/private/crt32/misc/alpha/szero.s
blob: c97c31df58cff937d70b5433d5f64f521b355ae0 (plain) (tree)








































































































































                                                                             
 #++
 #   Copyright 1991, 1994, Digital Equipment Corporation
 # 
 #      ots_zero(char *dstptr, long dstlen)
 # 
 #      Zero dstlen bytes of memory at *dstptr
 # 
 #      Special conventions: No stack space, r16-r17 and r27-r28 ONLY,
 #      no linkage pointer required.
 #       (Warning: The auto-loader potentially takes some regs across
 #       the call if this is being used in a shared lib. environment.)
 # 
 #   This is a GEM support routine for zeroing a region of memory.  It is
 #   basically identical to BSD's bzero, though it has limited register
 #   conventions to allow it to work better with compiled code.  (Note that
 #   this is just a stripped down version of ots_fill.)
 # 
 #   This is optimized for extremely high performance both for small and
 #   large blocks.  In order to reduce overhead for small cases, they are
 #   retired as quickly as possible; more case analysis is reserved
 #   for cases which will do more.
 #
 #   This version of OTS_ZERO provides longword granularity for Alpha.
 # 
 #   012          30 Aug 1994   WBN     Longword granularity version based on
 #                              OTS_ZERO_ALPHA.M64 edit 011.
 #--

#include        "ots_defs.hs"

        # _OtsZero: zero r17 bytes of memory starting at address r16.
        #
        # r16 = dst (any byte alignment)
        # r17 = len (byte count; len = 0 returns without touching memory)
        # r26 = return address (every exit is ret r31, (r26))
        # destroys r16-r17, r27-r28
        #
        # Path summary:
        #   len < 4  - read-modify-write of one LW, or of two LWs (double)
        #              when the range crosses a LW boundary.
        #   len >= 4 - geq4 stores the leading LW or QW (merged with the old
        #              contents if dst is not LW aligned), join/unroll/short
        #              clear the whole QWs, and last merges the final 1-7
        #              bytes into the trailing QW.
        # mskql/mskqh take their byte offset from the low 3 bits of the
        # second operand, so the counts below are used without masking.
        # NOTE(review): len is tested with signed branches; a negative len is
        # not rejected and falls into the len < 4 path - confirm callers never
        # pass one.

        .globl  _OtsZero
        .ent    _OtsZero
_OtsZero:
        .set noat                       # r28 is used directly as a scratch reg
        .set noreorder                  # Keep the instruction order as written
        .frame  sp,0,r26                # No stack frame; return address in r26
        .prologue       0
        beq     r17, done               # No memory refs if len=0
        subq    r17, 4, r28             # Length-4
        and     r16, 3, r27             # Dst alignment (0-3)
        andnot  r16, 3, r16             # LW aligned dst pointer
        addq    r27, r28, r17           # Alignment + length - 4
        bge     r28, geq4               # Lengths >= 4 may not need load

 # Length is 1-3 here, so the old contents must be merged back in.
 # r17 > 0 means the range runs past the end of the first LW.

        ldl     r28, (r16)              # Load first LW of dst
        bgt     r17, double             # Skip if it crosses to next LW
        addq    r17, 4, r17             # Find endpoint within LW
        mskql   r28, r27, r27           # Clear from startpoint thru 7
        mskqh   r28, r17, r28           # Clear from 0 to endpoint
        or      r28, r27, r27           # Combine dest parts
        stl     r27, (r16)              # Write back the merged LW
        ret     r31, (r26)

 # Range of 2-3 bytes straddling two LWs.
 # r17 = number of bytes that fall in the second LW.

double: mskql   r28, r27, r28           # Clear from startpoint in first LW
        ldl     r27, 4(r16)             # Load second LW of dst
        stl     r28, (r16)              # Store first LW (head bytes kept)
        mskqh   r27, r17, r27           # Clear up to endpoint in second LW
        stl     r27, 4(r16)             # Store second LW (tail bytes kept)
        ret     r31, (r26)

 # Come here if length to be zeroed is >= 4.
 # r16-> dst aligned to LW
 # r17 = alignment + length - 4
 # r27 = dst alignment within LW
 # r28 = length-4

        #.align quad

geq4:   and     r16, 4, r28             # Which LW in QW to store first?
        beq     r17, simple             # Go handle single aligned LW
        bne     r28, longs              # Upper LW of its QW: start with a LW

 # dst lies in the lower LW of a QW, so r16 is QW aligned and r27 is also
 # the start offset within that QW.

quad:   subq    r17, 4, r17             # Does dest end in first QW?
        blt     r17, shortq             # Ends within first QW
        beq     r27, wh_qw              # Store a whole QW
        ldq     r28, (r16)              # Load first QW of dest
        mskql   r28, r27, r27           # Clear from startpoint
wh_qw:  stq     r27, (r16)              # Store first QW of dest
                                        # (r27 is 0 when arriving via the beq,
                                        #  so it serves as the zero source)
        br      r31, join               # Go clear rest of string

 # Alignment 0 and length 4: exactly one aligned LW.

simple: stl     r31, (r16)              # Single aligned LW
        ret     r31, (r26)

 # Range starts and ends inside one QW and touches both of its LWs.
 # r17 is negative here; only its low 3 bits (the end offset) are used.

shortq: ldq     r28, (r16)              # Load QW of dest
        mskql   r28, r27, r27           # Clear from startpoint thru 7
        mskqh   r28, r17, r28           # Clear from 0 up to endpoint
        or      r28, r27, r27           # Merge
        stq     r27, (r16)              # Store
        ret     r31, (r26)

 # dst lies in the upper LW of a QW: store that LW, then continue by QWs.

longs:  beq     r27, wh_lw              # Store a whole LW
        ldl     r28, (r16)              # Load first LW of dest
        mskql   r28, r27, r27           # Clear from startpoint
wh_lw:  stl     r27, (r16)              # Store first LW of dest
                                        # (r27 is 0 when arriving via the beq)

 # At join:
 # r16-> the LW or QW just stored; the next QW boundary is reached with
 #       8(r16) through stq_u/ldq_u, which ignore the low 3 address bits
 # r17 = bytes still to clear beyond the piece just stored

join:   subq    r17, 32, r17            # At least 4 more quadwords?
        and     r17, 24, r27            # How many after multiple of 4?
        bge     r17, unroll             # Taken branch for long strings

 # Fewer than 4 whole QWs remain.
 # r27 = 8 * (whole QWs left, 0-3); low 3 bits of r17 = trailing bytes

short:  and     r17, 7, r17             # How many odd bytes?
        beq     r27, last               # Skip if no more whole QWs
        stq_u   r31, 8(r16)             # Clear one...
        subq    r27, 16, r27            # Map 8/16/24 to -8/0/8
        addq    r16, 8, r16             # Update dest pointer
        blt     r27, last               # Skip if no more whole QWs
        #stall
        stq_u   r31, 8(r16)             # Clear two...
        addq    r16, 8, r16             # Update dest pointer
        nop
        beq     r27, last               # Skip if no more whole QWs
        stq_u   r31, 8(r16)             # Clear three...
        addq    r16, 8, r16             # Update dest pointer

 # Trailing piece: r17 = 0-7 bytes left at the start of the next QW.
 # Only a LW is written when 4 or fewer bytes remain, so the second LW of
 # that QW is not written unless the range reaches into it.

last:   beq     r17, done               # Finished if no odd bytes
        ldq_u   r27, 8(r16)             # Load last QW of dst
        subq    r17, 4, r28             # More than a LW left?
        andnot  r16, 7, r16             # Clean pointer for STL
        mskqh   r27, r17, r27           # Clear up to endpoint
        bgt     r28, lastq              # Go store a QW
        stl     r27, 8(r16)             # LW store for last piece
done:   ret     r31, (r26)

lastq:  stq     r27, 8(r16)             # QW store for last piece
        ret     r31, (r26)

 # Main loop: 32 bytes per pass while at least 32 remain.
 # r27 (computed at join) stays valid because subtracting 32 from r17 does
 # not change its bits 3-4.

unroll: stq_u   r31, 8(r16)             # Store 4 QWs per iteration
        stq_u   r31, 16(r16)
        stq_u   r31, 24(r16)
        subq    r17, 32, r17            # Decrement remaining count
        stq_u   r31, 32(r16)
        addq    r16, 32, r16            # Update dest pointer
        bge     r17, unroll             # Repeat until done
        br      r31, short              # Then handle leftovers


        .set at
        .set reorder
        .end    _OtsZero