Diffstat (limited to 'private/crt32/misc/alpha/smovem.s')
-rw-r--r--  private/crt32/misc/alpha/smovem.s  667
1 file changed, 667 insertions(+), 0 deletions(-)
diff --git a/private/crt32/misc/alpha/smovem.s b/private/crt32/misc/alpha/smovem.s
new file mode 100644
index 000000000..11f859f20
--- /dev/null
+++ b/private/crt32/misc/alpha/smovem.s
@@ -0,0 +1,667 @@
+ #++
+ # Copyright 1991, 1994, Digital Equipment Corporation
+ #
+ # ots_movem(char *dstptr INOUT, long dstlen INOUT,
+ # char *srcptr, long srclen)
+ #
+	# Move min(dstlen, srclen) characters from *srcptr to *dstptr; the
+	# regions may overlap.
+ #
+ # Special conventions: No stack space, r16-r21 and r27-r28 ONLY,
+ # no linkage pointer required, r16 is INOUT and points to the address
+ # following the move, r17 is INOUT and has the remaining destination
+ # length following the move.
+	# (Warning: The auto-loader can clobber some regs across the call
+	# if this is being used in a shared-library environment.)
+ #
+ # This is a GEM support routine for moving (possibly overlapping) memory
+ # from one address to another. This is optimized for extremely high
+ # performance both for small blocks and large moves. In order to reduce
+	# overhead for small cases, they are retired as quickly as possible;
+	# more case analysis is reserved for cases which will do more work. Note
+	# that while overlapping moves are supported (unlike Sys V memcpy
+	# routines), they are not quite as fast.
+ #
+ # Warning - This code is basically "expanded microcode". Since it is
+ # executed so frequently in many contexts, it has been extensively "hand-
+ # optimized"...
+ #
+	# Note that this routine and ots_move share the same basic code, so
+	# maintenance should be done in both places. This routine is primarily
+	# provided for lower overhead (for
+ # short strings).
+ # [Except for the first few instructions, the recipe for creating OTS_MOVEM
+ # from OTS_MOVE is to change uses of R19->R21 and then R17->R19.]
+ #
+ # This version of OTS_MOVEM provides longword granularity.
+ #
+ # 015 1 Sep 1994 WBN Longword granularity version, based on
+ # OTS_MOVEM_ALPHA.M64 version 014 and
+ # OTS_MOVE_ALPHA_WNT.M64 version 015.
+ #--
+
+#include "ots_defs.hs"
+
+ # r16 = dst --> r16 = end
+	# r17 = dst_len --> r17 = remaining
+ # r18 = src
+ # r19 = src_len
+ # destroys r18-r21, r27-r28
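+
+	# A rough C model of the interface (an illustrative sketch; the struct
+	# and names below are ours, not part of this source):
+	#
+	#   #include <string.h>
+	#   typedef struct { char *end; long remaining; } OtsMoveResult;
+	#   /* Move min(dstlen, srclen) bytes; the regions may overlap. */
+	#   static OtsMoveResult ots_movem_model(char *dst, long dstlen,
+	#                                        const char *src, long srclen)
+	#   {
+	#       long n = dstlen < srclen ? dstlen : srclen;
+	#       memmove(dst, src, n);              /* overlap-safe copy     */
+	#       OtsMoveResult r = { dst + n, dstlen - n };
+	#       return r;                          /* r16 and r17 on return */
+	#   }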
+
+ .globl _OtsMoveMinimum
+ .ent _OtsMoveMinimum
+_OtsMoveMinimum:
+ .set noat
+ .set noreorder
+ .frame sp,0,r26
+ .prologue 0
+ subq r17, r19, r20 # Which length is larger?
+ cmovlt r20, r17, r19 # Min to r19
+ andnot r16, 3, r21 # LW-aligned dst pointer
+ subq r19, 4, r20 # Get length-4
+ beq r19, done # No memory accesses if length=0
+ ldq_u r28, (r18) # Load first QW of source
+ addq r19, r18, r27 # Point to end of source
+ subq r17, r19, r17 # Set remaining length for return
+ bge r20, geq4 # Go handle lengths >= 4
+ ldq_u r27, -1(r27) # Load last QW of source
+ and r16, 3, r16 # Get dst alignment within LW
+ ldl r19, (r21) # Load first LW of destination
+ addq r20, r16, r20 # Get alignment+length-4
+ extql r28, r18, r28 # Extract first bytes of source
+ bgt r20, double # Go handle LW crossing
+ extqh r27, r18, r27 # Extract last bytes of source
+ addq r20, 4, r20 # Get ending alignment in LW
+ or r27, r28, r28 # Combine halves of source
+ insql r28, r16, r28 # Position low part of source
+ mskql r19, r16, r18 # Keep low bytes of destination
+ mskql r28, r20, r28 # Trim off high bytes of source
+ mskqh r19, r20, r19 # Keep high bytes of destination
+ or r18, r28, r28 # Combine source with low dest
+ or r19, r28, r28 # Combine with high dest
+ stl r28, (r21) # Store to destination
+ addq r21, r20, r16 # Point to end of dest for return
+ ret r31, (r26)
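+
+	# The path above never stores anything narrower than a longword: the
+	# insql/mskql/mskqh sequence builds the merged LW in registers and one
+	# stl writes it back. A little-endian C sketch of this read-merge-write
+	# idea (illustrative only; the helper name is ours):
+	#
+	#   #include <stdint.h>
+	#   /* Write 'len' bytes of 'src' into the aligned longword at 'lw',
+	#      starting at byte offset 'off' (off + len <= 4). */
+	#   static void partial_stl(uint32_t *lw, unsigned off, unsigned len,
+	#                           uint32_t src)
+	#   {
+	#       uint32_t m = (len < 4 ? (1u << (8*len)) - 1u : ~0u) << (8*off);
+	#       *lw = (*lw & ~m) | ((src << (8*off)) & m);
+	#   }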
+
+double: extqh r27, r18, r27 # Extract last bytes of source
+ ldl r18, 4(r21) # Load second LW of destination
+ mskql r19, r16, r19 # Keep low bytes of 1st dest LW
+ or r27, r28, r28 # Combine parts of source
+ insql r28, r16, r27 # Position start of source
+ addq r16, 4, r16 # Compute virtual start in LW
+ insqh r28, r16, r28 # Position end of source
+ addq r21, 4, r21 # Prepare to compute end address
+ mskqh r18, r20, r18 # Keep high bytes of 2nd dest LW
+ mskql r28, r20, r28 # Trim end of source to length
+ or r27, r19, r19 # Combine low source with 1st LW
+ stl r19, -4(r21)
+ or r28, r18, r18 # Combine high source with 2nd LW
+ stl r18, (r21)
+ addq r21, r20, r16 # Point to end of dest for return
+done: ret r31, (r26)
+
+ # Come here to move >= 4 bytes.
+ #
+ # r16-> dst
+ # r17 = remaining length for return
+ # r18-> src
+ # r19 = length
+ # r20 = len-4
+ # r21-> LW-aligned dst
+ # r27 = src+len
+ # r28 = first src QW
+
+geq4: subq r20, 4, r19 # At least 8 bytes to move?
+ subq r16, r27, r27 # Check if dst >= src+len
+ blt r19, lss8 # Move 4..7 bytes
+ subq r18, r16, r19 # Check if src >= dst
+ bge r27, ok1 # Forward OK if whole src precedes dst
+ blt r19, reverse # Go backwards if src < dst < src+len
+ok1: and r16, 7, r16
+ addq r16, r20, r27 # Alignment + length - 4
+ bne r16, part # Part of first QW to be skipped
+ subq r20, 4, r20 # At least 8 bytes to be stored?
+ beq r27, simple # Only low LW to be stored
+ and r18, 7, r27 # Is src address now aligned?
+ blt r20, shortq # Dst ends in first QW
+ subq r20, 32, r19 # At least 4 quadwords left to move?
+ beq r27, align # Go handle matching alignment
+
+ # Src alignment differs from dst alignment.
+ # r16 = dst alignment
+ # r17 = remaining length for return
+ # r18 = src-8 after 1st move
+ # r19
+ # r20 = initial length-8
+ # r21 = initial dst
+ # r27 = dst QW if dst wasn't aligned
+ # r28 = source QW
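+	#
+	# The extql/extqh pairs below build each unaligned source quadword from
+	# two aligned loads: shift the low QW right by the byte offset and the
+	# high QW left by its complement, then OR. A little-endian C sketch
+	# (illustrative; off = src byte offset within a quadword, 1..7):
+	#
+	#   #include <stdint.h>
+	#   static uint64_t fetch_unaligned_qw(const uint64_t *p, unsigned off)
+	#   {
+	#       return (p[0] >> (8*off)) | (p[1] << (8*(8 - off)));
+	#   }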
+
+misal: or r16, r21, r21 # Put alignment back with dst ptr ***
+ ldq_u r19, 8(r18) # Load same or next source QW
+ extql r28, r18, r28 # Get first part of source to store
+ addq r20, r16, r20 # Adjust length for partial move
+ mskql r27, r21, r27 # Trim destination for merge
+ extqh r19, r18, r16 # Get second part of source
+ subq r20, 24, r20 # At least 4 more quadwords?
+ or r28, r16, r28 # Combine pieces of source
+ mskqh r28, r21, r28 # Trim low junk off source
+ andnot r21, 7, r21 # Adjust dst for partial move
+ bge r20, unrol2 # Taken branch for long strings
+ addq r20, 16, r16 # Add back: how many whole QW's?
+ nop
+short2: and r20, 7, r20 # How many odd bytes?
+ blt r16, last # Skip if no more whole QW's
+ or r28, r27, r28 # Combine pieces
+ stq r28, (r21)
+ extql r19, r18, r27 # Get last part of prior src QW
+ ldq_u r19, 16(r18) # Load another src QW
+ addq r21, 8, r21 # Update dst
+ subq r16, 8, r16 # More whole QW's?
+ addq r18, 8, r18 # Update src
+ blt r16, lastx # Skip if no more whole QWs
+ extqh r19, r18, r28 # Get first part of this src QW
+ addq r18, 8, r18 # Update src again
+ or r28, r27, r28 # Combine pieces
+ stq r28, (r21)
+ extql r19, r18, r27 # Get last part of this src QW
+ ldq_u r19, 8(r18) # Load another src QW
+ addq r21, 8, r21 # Update dst
+lastx: extqh r19, r18, r28 # Get first part of this src QW
+last: addq r18, r20, r16 # Point to end-8 of src
+ beq r20, done_u # Skip if no odd bytes
+ or r28, r27, r28 # Combine parts of last whole QW
+ ldq_u r27, 7(r16) # Load final (maybe same) src QW
+ subq r20, 4, r16 # More than 4 bytes left?
+ stq r28, (r21) # Store last whole QW
+ extql r19, r18, r19 # Get last part of this src QW
+ extqh r27, r18, r27 # Get what we need from final src QW
+joinx: ldq r28, 8(r21) # Load last QW of destination
+ or r19, r27, r27 # Combine pieces of source
+ mskql r27, r20, r27 # Trim to length
+ mskqh r28, r20, r28 # Make room in destination
+ bgt r16, done_u # Go store a whole QW
+ addq r20, 8, r20 # Increment length for return
+ or r28, r27, r28 # Insert src into dst
+ stl r28, 8(r21) # Final LW
+ addq r21, r20, r16 # Point to end of dst for return
+ ret r31, (r26)
+
+ # Come here to move 4 thru 7 bytes.
+ #
+lss8: addq r18, r19, r27 # Recover src+len-8
+ and r16, 3, r16 # Dst alignment within LW
+ ldq_u r27, 7(r27) # Load last part of source
+ extql r28, r18, r28 # Extract first part of source
+ beq r16, lw # Handle LW-aligned dst
+ extqh r27, r18, r27 # Extract last part of source
+ ldl r18, (r21) # Load first LW of dst
+ addq r16, r20, r20 # align+len-4 of dst
+ or r28, r27, r28 # Complete source
+ mskql r28, r19, r28 # Trim source to length
+ mskql r18, r16, r18 # Make room in dst
+ insql r28, r16, r27 # Position src like dst
+ addq r16, r19, r19 # Align+len-8 of dst
+ or r27, r18, r18 # Merge
+ stl r18, (r21) # Store first LW of dst
+ extql r27, 4, r27 # Position next LW of src
+ blt r19, zz # Skip if not a whole LW
+ stl r27, 4(r21) # Store the whole LW
+ addq r21, 4, r21 # Adjust pointer
+ subq r20, 4, r20 # Adjust ending alignment
+ beq r19, donezz # Exit if done
+ insqh r28, r16, r27 # Position remainder of src
+zz: ldl r28, 4(r21) # Load last dst LW
+ mskqh r28, r20, r28 # Make room in dst
+ or r28, r27, r27 # Merge
+ stl r27, 4(r21) # Final store
+donezz: addq r21, r20, r16 # End address -4
+ addq r16, 4, r16
+ ret r31, (r26)
+
+lw: extqh r27, r18, r27 # Extract last part of source
+ addq r21, 4, r16 # Adjust for return value
+ beq r20, lwdone # Skip if exactly 4 bytes
+ ldl r19, 4(r21) # Load next dst LW
+ or r27, r28, r28 # Complete source
+ stl r28, (r21) # Store first LW
+ extql r28, 4, r28 # Position rest of source
+ mskqh r19, r20, r27 # Make room in dst
+ mskql r28, r20, r28 # Trim src
+ or r27, r28, r28 # Merge
+ stl r28, 4(r21)
+ addq r16, r20, r16 # Update return value
+ ret r31, (r26)
+
+lwdone: or r27, r28, r28 # Merge
+ stl r28, (r21)
+ ret r31, (r26)
+
+ # Move 4 bytes to an aligned LW.
+ #
+simple: ldq_u r27, 3(r18) # Load last QW of source
+ extql r28, r18, r28 # Position first QW
+ addq r21, 4, r16 # Point to end of dst for return
+ extqh r27, r18, r27 # Position last QW
+ or r28, r27, r28 # Merge
+ stl r28, (r21) # Store
+ ret r31, (r26)
+
+
+ # Dst is not aligned. Check whether first write is to a LW or a QW,
+ # and whether that finishes the move. Then see if src alignment
+ # matches, and read/rewrite the first dst quadword.
+ #
+ # r16 = dst alignment in QW
+ # r17 = remaining length for return
+ # r18-> src
+ # r19
+ # r20 = len-4
+ # r21-> LW-aligned dst
+ # r27 = QW_alignment + length - 4
+ # r28 = first src QW
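+	#
+	# The three-way split below, as a C sketch (names ours; off = dst & 7,
+	# len = byte count):
+	#
+	#   static int first_qw_case(unsigned off, long len)
+	#   {
+	#       if (off + len < 8) return 0;  /* "shortu": one merged stq_u   */
+	#       if ((off & 4) == 0) return 1; /* "quad": merge whole first QW */
+	#       return 2;                     /* store high LW, then QW-align */
+	#   }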
+
+ #.align quad
+
+part: subq r27, 4, r19 # Does dst end in first QW?
+ ldq_u r27, (r21) # Load first dst QW
+ blt r19, shortu # Go handle short store
+ and r16, 4, r19 # Does it start in high LW?
+ subq r18, r16, r18 # Adjust src for this partial move
+ beq r19, quad # Whole QW to be touched
+ extql r28, r18, r19 # Position first part of source
+ ldq_u r28, 7(r18) # Get next (or same) src QW
+ mskql r27, r16, r27 # Trim destination for merge
+ addq r20, r16, r20 # Len + alignment...
+ extqh r28, r18, r28 # Position second part of source
+ subq r20, 4, r20 # Len+alignment-8 = remaining len
+ or r28, r19, r28 # Pieces of source
+ mskqh r28, r16, r19 # Trim junk preceding source
+ ldq_u r28, 7(r18) # Get src QW again **
+ or r27, r19, r19 # Combine other source piece
+ extql r19, 4, r19 # Get the high LW
+ stl r19, (r21) # Store just that
+
+ # Now at a QW boundary. Is there a QW left to store?
+ # Is the source QW aligned?
+
+ andnot r21, 7, r21 # Adjust dst pointer to next-8
+ subq r20, 8, r19 # Got a QW more?
+ and r18, 7, r27 # Src aligned?
+ blt r19, short3 # Too short
+ addq r21, 8, r21
+ subq r20, 8, r20
+ ldq_u r28, 8(r18)
+ addq r18, 8, r18
+ subq r20, 32, r19 # Prepare for unrolled loop
+ beq r27, align # Alignment matches
+ or r31, r31, r27
+ or r31, r31, r16
+ br r31, misal
+
+shortu: addq r18, r20, r20 # Point to end-4 of src
+ ldq_u r20, 3(r20) # Get last QW of source
+ extql r28, r18, r28 # Fetch first QW of source
+ extqh r20, r18, r20 # Fetch last QW of source
+ mskql r27, r16, r18 # Clear from start thru end of dst
+ mskqh r27, r19, r27 # Clear from 0 to end of dst
+ or r28, r20, r28 # Combine src pieces
+ insql r28, r16, r28 # Position src
+ or r27, r18, r27 # Combine dst pieces
+ mskql r28, r19, r28 # Trim src
+ addq r21, r19, r20 # Final pointer for return
+ or r28, r27, r28 # Merge src & dst
+ stq_u r28, (r21) # Store it
+ addq r20, 8, r16
+ ret r31, (r26)
+
+quad: and r18, 7, r19 # Is src address now aligned?
+ subq r20, 4, r20 # Get length-8
+ bne r19, misal # Go handle mismatched alignment
+ mskqh r28, r16, r28 # Keep desired part of source
+ addq r20, r16, r20 # Adjust count for this partial move
+ mskql r27, r16, r27 # Keep desired part of destination QW
+ subq r20, 32, r19 # At least 4 quadwords left to move?
+ or r27, r28, r28 # Merge source and destination
+
+ # Src alignment matches.
+ # r16
+ # r17 = remaining length for return
+ # r18 = next src pointer -8
+ # r19 = remaining length -32
+ # r20
+ # r21 = dst pointer
+ # r27
+ # r28 = dst quadword
+
+align: and r19, 24, r20 # How many after a multiple of 4?
+ bge r19, unrol1 # Taken branch for long strings
+ nop
+short1: and r19, 7, r19 # How many odd bytes?
+ beq r20, last28 # Skip if no more whole QWs after r28
+ ldq r27, 8(r18) # Load next QW
+ addq r18, 8, r18
+ stq r28, (r21) # Store prior QW
+ subq r20, 16, r20 # Map 8/16/24 to -8/0/8
+ addq r21, 8, r21
+ blt r20, last27 # Skip if no more after r27
+ ldq r28, 8(r18) # Load next QW
+ addq r18, 8, r18
+ stq r27, (r21) # Store prior QW
+ addq r21, 8, r21
+ nop
+ beq r20, last28
+ ldq r27, 8(r18) # Load next QW
+ addq r18, 8, r18
+ stq r28, (r21) # Store prior QW
+ addq r21, 8, r21
+last27: beq r19, done27 # Skip if no odd bytes
+ ldq r28, 8(r18) # Load one more src QW
+ ldq r18, 8(r21) # Load last destination QW
+ subq r19, 4, r16 # More than 4 bytes to store?
+ stq r27, (r21) # Store prior QW
+ mskql r28, r19, r27 # Trim source
+ mskqh r18, r19, r18 # Trim destination
+ ble r16, lastl # Go store just a LW
+lastq: addq r21, r19, r21 # End-8 of dst for return
+ or r27, r18, r27 # Merge src & dst
+done27: stq_u r27, 7(r21) # Store last destination QW
+ addq r21, 8, r16 # End of dst for return
+ ret r31, (r26)
+
+short3: addq r18, r20, r16 # Point to end-8 of src
+ beq r20, donexx # Completely done?
+ ldq_u r19, 7(r16) # Load final src QW
+ subq r20, 4, r16 # Got more than a LW?
+ beq r27, joinx # Don't include prior src if aligned
+ extql r28, r18, r27 # Last part of prior src QW
+ extqh r19, r18, r19 # First part of this src QW
+	br	r31, joinx
+
+donexx: addq r21, r20, r16
+ addq r16, 8, r16
+ ret r31, (r26)
+
+last28: beq r19, done28 # Skip if no odd bytes
+ ldq r27, 8(r18) # Load one more src QW
+ ldq r18, 8(r21) # Load last destination QW
+ subq r19, 4, r16 # More than 4 bytes to store?
+ stq r28, (r21) # Store prior QW
+ mskql r27, r19, r27 # Trim source
+ mskqh r18, r19, r18 # Trim destination
+ bgt r16, lastq # Go store a QW
+lastl: addq r19, 8, r19 # Increment length for return
+ or r27, r18, r27 # Merge src & dst
+ stl r27, 8(r21) # Store last destination LW
+ addq r21, r19, r16 # End of dst for return
+ ret r31, (r26)
+
+shortq: addq r18, r20, r16 # Point to end-8 of src
+ ldq r27, (r21) # Get dst QW
+ extql r28, r18, r28 # Position first src QW
+ ldq_u r19, 7(r16) # Get last QW of src
+ mskqh r27, r20, r27 # Mask dst QW
+ extqh r19, r18, r19 # Position last src QW
+ or r19, r28, r28 # Merge
+ mskql r28, r20, r28 # Trim src QW
+done_u: addq r21, r20, r21 # End-8 of dst for return
+ or r28, r27, r28 # Combine pieces
+done28: stq_u r28, 7(r21) # Store last destination QW
+ addq r21, 8, r16 # End of dst for return
+ ret r31, (r26)
+
+ # Unrolled loop for long moves with matching alignment within QW.
+ # Each iteration moves two cache blocks.
+ # We try to schedule the cache misses to avoid a double miss
+ # in EV4 pass 2.1 chips. If the source alignment within a cache
+ # block is exactly 3, alter it, since that case runs slower.
+ #
+ # R16
+ # R17 = remaining length for return
+ # R18 = src pointer
+ # R19 = remaining length (to load) - 32
+ # R20 = length & 24 (needed at return)
+ # R21 = dst pointer
+ # R27
+ # R28 = QW from 0(R18) to store at 0(R21), both on input and at return
+ #
+
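+	# In C terms the matched-alignment loop is roughly the sketch below
+	# (illustrative only; the real code staggers its loads across two
+	# 32-byte cache blocks and keeps one quadword in flight in R28):
+	#
+	#   #include <stdint.h>
+	#   static void copy_qw_unrolled(uint64_t *dst, const uint64_t *src,
+	#                                long nqw)    /* nqw a multiple of 4 */
+	#   {
+	#       for (long i = 0; i < nqw; i += 4) {
+	#           uint64_t a = src[i],   b = src[i+1];
+	#           uint64_t c = src[i+2], d = src[i+3];
+	#           dst[i]   = a; dst[i+1] = b;
+	#           dst[i+2] = c; dst[i+3] = d;
+	#       }
+	#   }
+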
+ #.align quad
+
+unrol1: ldq r27, 32(r18) # Cache miss here; later loads hit.
+ subq r19, 48, r16 # Six more quadwords?
+ and r18, 16, r20 # Starting in 2nd half of cache block?
+ blt r16, uent1 # If not 6 more, don't adjust.
+ ldq r16, 8(r18)
+ beq r20, utop1 # If in 1st half, don't adjust.
+ ldq r27, 48(r18) # Cache miss here
+ addq r18, 16, r18
+ stq r28, (r21) # Adjust by going ahead 1/2 block.
+ addq r21, 16, r21
+ ldq r28, (r18)
+ subq r19, 16, r19
+ stq r16, -8(r21)
+ nop
+ ldq r16, 8(r18)
+utop1: subq r19, 32, r19
+
+uloop1: ldq r20, 64(r18) # Cache miss here
+ stq r28, (r21)
+ ldq r28, 16(r18)
+ stq r16, 8(r21)
+ ldq r16, 24(r18)
+ addq r18, 64, r18
+ stq r28, 16(r21)
+ mov r20, r28
+ stq r16, 24(r21)
+ addq r21, 64, r21
+ ldq r20, -24(r18)
+ subq r19, 32, r19
+ blt r19, uexit1
+ ldq r16, 32(r18) # Cache miss here
+ stq r27, -32(r21)
+ ldq r27, -16(r18)
+ stq r20, -24(r21)
+ ldq r20, -8(r18)
+ stq r27, -16(r21)
+ mov r16, r27
+ stq r20, -8(r21)
+uent1: subq r19, 32, r19
+ ldq r16, 8(r18)
+ bge r19, uloop1
+
+ # finish last block of 4 quadwords
+ #
+ubot1: stq r28, (r21)
+ mov r27, r28 # Position last QW for return
+ ldq r27, 16(r18)
+ addq r18, 32, r18
+ stq r16, 8(r21)
+ addq r21, 32, r21
+uex1a: ldq r16, -8(r18)
+ and r19, 24, r20 # Recover count of remaining QW's
+ stq r27, -16(r21)
+ stq r16, -8(r21)
+ br r31, short1
+
+ nop
+uexit1: stq r27, -32(r21) # Here if exit from middle of loop
+ ldq r27, -16(r18)
+ stq r20, -24(r21)
+ br r31, uex1a # Join common exit sequence
+
+ #.align quad
+
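+	# The unrolled loop for mismatched alignment is, in C terms, roughly
+	# the sketch below (see fetch_unaligned_qw above; off = src byte
+	# offset within a quadword, 1..7; the real loop moves 3 QWs per
+	# iteration and reuses each load for two stores):
+	#
+	#   #include <stdint.h>
+	#   static void copy_qw_unaligned(uint64_t *dst, const uint64_t *src_al,
+	#                                 unsigned off, long nqw)
+	#   {
+	#       for (long i = 0; i < nqw; i++)
+	#           dst[i] = (src_al[i] >> (8*off)) | (src_al[i+1] << (8*(8-off)));
+	#   }
+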
+unrol2: ldq_u r16, 16(r18) # Load next src QW
+ extql r19, r18, r19 # Get last part of prior one
+ or r28, r27, r28 # Combine pieces
+ stq r28, (r21) # Store prior dst QW
+ subq r20, 24, r20 # Update loop counter
+ extqh r16, r18, r28 # Get first part of a src QW
+ ldq_u r27, 24(r18) # Load next src QW
+ extql r16, r18, r16 # Get last part of prior one
+ or r28, r19, r28 # Combine pieces
+ stq r28, 8(r21) # Store prior dst QW
+ addq r21, 24, r21 # Update dst pointer
+ extqh r27, r18, r28 # Get first part of a src QW
+ ldq_u r19, 32(r18) # Load next src QW
+ extql r27, r18, r27 # Get last part of prior one
+ or r28, r16, r28 # Combine pieces
+ stq r28, -8(r21) # Store prior dst QW
+ addq r18, 24, r18 # Update src pointer
+ extqh r19, r18, r28 # Get first part of a src QW
+ bge r20, unrol2 # Repeat as needed
+ addq r20, 16, r16 # How many whole quadwords left?
+ br r31, short2 # Go handle leftovers
+ nop
+
+ # Must move in reverse order because of overlap.
+ # r16 = dst address
+ # r17 = remaining length for return
+ # r18 = src address
+ # r19
+ # r20 = len-4 (>= 0)
+ # r21
+ # r27
+ # r28
+
+ # Not yet LW-granularity...
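+	#
+	# A forward copy would overwrite source bytes that have not been read
+	# yet when src < dst < src+len, so this path runs from the end downward.
+	# The direction choice, as a C sketch (illustrative only):
+	#
+	#   static void overlap_copy(char *dst, const char *src, long n)
+	#   {
+	#       if (dst <= src || dst >= src + n)
+	#           for (long i = 0; i < n; i++) dst[i] = src[i];      /* forward */
+	#       else
+	#           for (long i = n - 1; i >= 0; i--) dst[i] = src[i]; /* reverse */
+	#   }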
+
+reverse:
+ subq r20, 4, r20 # This code expects len-8
+ addq r20, r18, r18 # Point to end-8 of source
+ addq r20, r16, r19 # Point to end-8 of destination
+ and r19, 7, r21 # Is destination aligned?
+ ldq_u r28, 7(r18) # Get source QW
+ addq r19, 8, r16 # Point to end of dst for return
+ bne r21, rpart # Skip if partial write needed
+ and r18, 7, r27 # Is source aligned too?
+ beq r27, ralign # Skip if so
+ ldq_u r21, (r18) # Handle aligned dst, unaligned src
+ subq r20, 8, r20
+ extqh r28, r18, r28
+ extql r21, r18, r27
+ br r31, rwhole
+
+rmis: ldq_u r21, (r18) # Load same or preceding src QW
+ extqh r28, r18, r28 # Get last part of source to store
+ mskqh r27, r16, r27 # Keep high-address part of dst
+ extql r21, r18, r21
+ subq r20, 8, r20 # How many more whole QW's?
+ or r21, r28, r28
+ ldq_u r21, (r18) # Reload source QW
+ mskql r28, r16, r28 # Trim source to length
+rwhole: blt r20, rlast2 # Skip if no more whole QW's
+rloop2: or r28, r27, r28 # Combine pieces
+ stq r28, (r19)
+rent2: extqh r21, r18, r27
+ ldq_u r21, -8(r18)
+ subq r20, 8, r20
+ subq r19, 8, r19
+ extql r21, r18, r28
+ subq r18, 8, r18
+ bge r20, rloop2
+rlast2: and r20, 7, r20
+ beq r20, rdone2
+ or r28, r27, r28
+ subq r18, r20, r27
+ stq r28, (r19)
+rl2ent: subq r31, r20, r20
+ ldq_u r27, (r27)
+ extqh r21, r18, r21
+ ldq r28, -8(r19)
+ subq r19, 8, r19
+ extql r27, r18, r27
+ mskql r28, r20, r28
+ or r27, r21, r27
+ mskqh r27, r20, r27
+ and r20, 4, r21 # Ending in high LW?
+ bne r21, rdone3 # Only longword store at the end
+rdone2: or r28, r27, r28
+ stq r28, (r19)
+ ret r31, (r26)
+
+rdone3: or r28, r27, r28
+ extql r28, 4, r28
+ stl r28, 4(r19)
+ ret r31, (r26)
+
+rpart: ldq_u r27, 7(r19) # Get dst QW
+ subq r21, 8, r21 # Get negative of bytes not moved
+ subq r18, r21, r18 # From src-8, get src after partial
+ subq r20, r21, r20 # Adjust length for partial move
+ subq r19, r21, r19 # Adjust dst pointer
+ addq r21, 4, r21 # End alignment - 4
+ ble r21, r_lw # Only storing the low longword?
+ and r18, 7, r21 # Src alignment now matching dst?
+ bne r21, rmis # Go back if not
+ mskql r28, r16, r28 # Keep low addresses of src QW
+ mskqh r27, r16, r27 # Keep high address of dst QW
+ralign: subq r20, 8, r20 # How many more whole QW's?
+ or r27, r28, r28 # Combine
+ blt r20, rlast1 # Skip if this is the end
+rloop1: stq r28, (r19) # Store one QW
+rent1: subq r20, 8, r20 # Decrement length
+ ldq r28, -8(r18) # Load preceding QW
+ subq r19, 8, r19 # Decrement dst pointer
+ subq r18, 8, r18 # Decrement src pointer
+ bge r20, rloop1 # Repeat for each whole QW
+rlast1: and r20, 7, r20 # How many odd bytes?
+ beq r20, rdone # Skip if none
+ ldq r27, -8(r18) # Get another source QW
+ subq r31, r20, r20 # Get byte # to end at
+ stq r28, (r19)
+rl_ent: ldq r28, -8(r19)
+ subq r19, 8, r19 # Adjust dst pointer again
+ mskqh r27, r20, r27 # Keep top of src QW
+ and r20, 4, r21 # Ending in high LW?
+ mskql r28, r20, r28 # Keep bottom of dst QW
+ bne r21, rdone4 # Only longword store at the end
+ or r27, r28, r28 # Combine
+rdone: stq r28, (r19) # Store last QW
+ ret r31, (r26)
+
+rdone4: or r27, r28, r28 # Combine
+ extql r28, 4, r28 # Get high part
+ stl r28, 4(r19) # Store last LW
+ ret r31, (r26)
+
+r_lw: and r18, 7, r21 # Src alignment now matching dst?
+ bne r21, rmislw # Go back if not
+ mskql r28, r16, r28 # Keep low addresses of src QW
+ mskqh r27, r16, r27 # Keep high address of dst QW
+ subq r20, 8, r20 # How many more whole QW's?
+ or r27, r28, r28 # Combine
+ blt r20, rlast1_lw # Skip if this is the end
+ stl r28, (r19) # Store one QW
+ br r31, rent1
+
+rlast1_lw:
+ and r20, 7, r20 # How many odd bytes?
+ ldq r27, -8(r18) # Get another source QW
+ subq r31, r20, r20 # Get byte # to end at
+ stl r28, (r19)
+	br	r31, rl_ent
+
+rmislw: ldq_u r21, (r18) # Load same or preceding src QW
+ extqh r28, r18, r28 # Get last part of source to store
+ mskqh r27, r16, r27 # Keep high-address part of dst
+ extql r21, r18, r21
+ subq r20, 8, r20 # How many more whole QW's?
+ or r21, r28, r28
+ ldq_u r21, (r18) # Reload source QW
+ mskql r28, r16, r28 # Trim source to length
+ blt r20, rlast2_lw # Skip if no more whole QW's
+ or r28, r27, r28 # Combine pieces
+ stl r28, (r19)
+ br r31, rent2
+
+rlast2_lw:
+ and r20, 7, r20
+ or r28, r27, r28
+ subq r18, r20, r27
+ stl r28, (r19)
+ br r31, rl2ent
+
+ .set at
+ .set reorder
+	.end	_OtsMoveMinimum