Diffstat (limited to 'private/crt32/misc/alpha/smovem.s')
-rw-r--r--  private/crt32/misc/alpha/smovem.s  667
1 file changed, 667 insertions, 0 deletions
diff --git a/private/crt32/misc/alpha/smovem.s b/private/crt32/misc/alpha/smovem.s
new file mode 100644
index 000000000..11f859f20
--- /dev/null
+++ b/private/crt32/misc/alpha/smovem.s
@@ -0,0 +1,667 @@
+ #++
+ # Copyright 1991, 1994, Digital Equipment Corporation
+ #
+ # ots_movem(char *dstptr INOUT, long dstlen INOUT,
+ #           char *srcptr, long srclen)
+ #
+ # Move min(dstlen, srclen) characters from *srcptr to *dstptr, possibly overlapping
+ #
+ # Special conventions: No stack space, r16-r21 and r27-r28 ONLY,
+ # no linkage pointer required, r16 is INOUT and points to the address
+ # following the move, r17 is INOUT and has the remaining destination
+ # length following the move.
+ # (Warning: The auto-loader potentially takes some regs across
+ # the call if this is being used in a shared lib. environment.)
+ #
+ # This is a GEM support routine for moving (possibly overlapping) memory
+ # from one address to another. This is optimized for extremely high
+ # performance both for small blocks and large moves. In order to reduce
+ # overhead for small cases, they are retired as quickly as possible;
+ # more case analysis is reserved for cases which will do more. Note
+ # that while overlapping moves are supported (unlike Sys V memcpy()
+ # routines), they are not quite as fast.
+ #
+ # Warning - This code is basically "expanded microcode". Since it is
+ # executed so frequently in many contexts, it has been extensively "hand-
+ # optimized"...
+ #
+ # Note that this routine and ots_move are basically similar in many
+ # respects (same basic code), so maintenance should be done in both
+ # places. This routine is primarily provided for lower overhead (for
+ # short strings).
+ # [Except for the first few instructions, the recipe for creating OTS_MOVEM
+ # from OTS_MOVE is to change uses of R19->R21 and then R17->R19.]
+ #
+ # This version of OTS_MOVEM provides longword granularity.
+ #
+ # 015  1 Sep 1994  WBN  Longword granularity version, based on
+ #      OTS_MOVEM_ALPHA.M64 version 014 and
+ #      OTS_MOVE_ALPHA_WNT.M64 version 015.
+ #--
+
+#include "ots_defs.hs"
+
+ # r16 = dst --> r16 = end
+ # r17 = dst_len --> r17 = remaining
+ # r18 = src
+ # r19 = src_len
+ # destroys r18-r21, r27-r28
+
+ .globl _OtsMoveMinimum
+ .ent _OtsMoveMinimum
+_OtsMoveMinimum:
+ .set noat
+ .set noreorder
+ .frame sp,0,r26
+ .prologue 0
+ subq r17, r19, r20 # Which length is larger?
+ cmovlt r20, r17, r19 # Min to r19
+ andnot r16, 3, r21 # LW-aligned dst pointer
+ subq r19, 4, r20 # Get length-4
+ beq r19, done # No memory accesses if length=0
+ ldq_u r28, (r18) # Load first QW of source
+ addq r19, r18, r27 # Point to end of source
+ subq r17, r19, r17 # Set remaining length for return
+ bge r20, geq4 # Go handle lengths >= 4
+ ldq_u r27, -1(r27) # Load last QW of source
+ and r16, 3, r16 # Get dst alignment within LW
+ ldl r19, (r21) # Load first LW of destination
+ addq r20, r16, r20 # Get alignment+length-4
+ extql r28, r18, r28 # Extract first bytes of source
+ bgt r20, double # Go handle LW crossing
+ extqh r27, r18, r27 # Extract last bytes of source
+ addq r20, 4, r20 # Get ending alignment in LW
+ or r27, r28, r28 # Combine halves of source
+ insql r28, r16, r28 # Position low part of source
+ mskql r19, r16, r18 # Keep low bytes of destination
+ mskql r28, r20, r28 # Trim off high bytes of source
+ mskqh r19, r20, r19 # Keep high bytes of destination
+ or r18, r28, r28 # Combine source with low dest
+ or r19, r28, r28 # Combine with high dest
+ stl r28, (r21) # Store to destination
+ addq r21, r20, r16 # Point to end of dest for return
+ ret r31, (r26)
+
+double: extqh r27, r18, r27 # Extract last bytes of source
+ ldl r18, 4(r21) # Load second LW of destination
+ mskql r19, r16, r19 # Keep low bytes of 1st dest LW
+ or r27, r28, r28 # Combine parts of source
+ insql r28, r16, r27 # Position start of source
+ addq r16, 4, r16 # Compute virtual start in LW
+ insqh r28, r16, r28 # Position end of source
+ addq r21, 4, r21 # Prepare to compute end address
+ mskqh r18, r20, r18 # Keep high bytes of 2nd dest LW
+ mskql r28, r20, r28 # Trim end of source to length
+ or r27, r19, r19 # Combine low source with 1st LW
+ stl r19, -4(r21)
+ or r28, r18, r18 # Combine high source with 2nd LW
+ stl r18, (r21)
+ addq r21, r20, r16 # Point to end of dest for return
+done: ret r31, (r26)
+
+ # Come here to move >= 4 bytes.
+ #
+ # r16-> dst
+ # r17 = remaining length for return
+ # r18-> src
+ # r19 = length
+ # r20 = len-4
+ # r21-> LW-aligned dst
+ # r27 = src+len
+ # r28 = first src QW
+
+geq4: subq r20, 4, r19 # At least 8 bytes to move?
+ subq r16, r27, r27 # Check if dst >= src+len
+ blt r19, lss8 # Move 4..7 bytes
+ subq r18, r16, r19 # Check if src >= dst
+ bge r27, ok1 # Forward OK if whole src precedes dst
+ blt r19, reverse # Go backwards if src < dst < src+len
+ok1: and r16, 7, r16
+ addq r16, r20, r27 # Alignment + length - 4
+ bne r16, part # Part of first QW to be skipped
+ subq r20, 4, r20 # At least 8 bytes to be stored?
+ beq r27, simple # Only low LW to be stored
+ and r18, 7, r27 # Is src address now aligned?
+ blt r20, shortq # Dst ends in first QW
+ subq r20, 32, r19 # At least 4 quadwords left to move?
+ beq r27, align # Go handle matching alignment
+
+ # Src alignment differs from dst alignment.
+ # r16 = dst alignment
+ # r17 = remaining length for return
+ # r18 = src-8 after 1st move
+ # r19
+ # r20 = initial length-8
+ # r21 = initial dst
+ # r27 = dst QW if dst wasn't aligned
+ # r28 = source QW
+
+misal: or r16, r21, r21 # Put alignment back with dst ptr ***
+ ldq_u r19, 8(r18) # Load same or next source QW
+ extql r28, r18, r28 # Get first part of source to store
+ addq r20, r16, r20 # Adjust length for partial move
+ mskql r27, r21, r27 # Trim destination for merge
+ extqh r19, r18, r16 # Get second part of source
+ subq r20, 24, r20 # At least 4 more quadwords?
+ or r28, r16, r28 # Combine pieces of source
+ mskqh r28, r21, r28 # Trim low junk off source
+ andnot r21, 7, r21 # Adjust dst for partial move
+ bge r20, unrol2 # Taken branch for long strings
+ addq r20, 16, r16 # Add back: how many whole QW's?
+ nop
+short2: and r20, 7, r20 # How many odd bytes?
+ blt r16, last # Skip if no more whole QW's
+ or r28, r27, r28 # Combine pieces
+ stq r28, (r21)
+ extql r19, r18, r27 # Get last part of prior src QW
+ ldq_u r19, 16(r18) # Load another src QW
+ addq r21, 8, r21 # Update dst
+ subq r16, 8, r16 # More whole QW's?
+ addq r18, 8, r18 # Update src
+ blt r16, lastx # Skip if no more whole QWs
+ extqh r19, r18, r28 # Get first part of this src QW
+ addq r18, 8, r18 # Update src again
+ or r28, r27, r28 # Combine pieces
+ stq r28, (r21)
+ extql r19, r18, r27 # Get last part of this src QW
+ ldq_u r19, 8(r18) # Load another src QW
+ addq r21, 8, r21 # Update dst
+lastx: extqh r19, r18, r28 # Get first part of this src QW
+last: addq r18, r20, r16 # Point to end-8 of src
+ beq r20, done_u # Skip if no odd bytes
+ or r28, r27, r28 # Combine parts of last whole QW
+ ldq_u r27, 7(r16) # Load final (maybe same) src QW
+ subq r20, 4, r16 # More than 4 bytes left?
+ stq r28, (r21) # Store last whole QW
+ extql r19, r18, r19 # Get last part of this src QW
+ extqh r27, r18, r27 # Get what we need from final src QW
+joinx: ldq r28, 8(r21) # Load last QW of destination
+ or r19, r27, r27 # Combine pieces of source
+ mskql r27, r20, r27 # Trim to length
+ mskqh r28, r20, r28 # Make room in destination
+ bgt r16, done_u # Go store a whole QW
+ addq r20, 8, r20 # Increment length for return
+ or r28, r27, r28 # Insert src into dst
+ stl r28, 8(r21) # Final LW
+ addq r21, r20, r16 # Point to end of dst for return
+ ret r31, (r26)
+
+ # Come here to move 4 thru 7 bytes.
+ #
+lss8: addq r18, r19, r27 # Recover src+len-8
+ and r16, 3, r16 # Dst alignment within LW
+ ldq_u r27, 7(r27) # Load last part of source
+ extql r28, r18, r28 # Extract first part of source
+ beq r16, lw # Handle LW-aligned dst
+ extqh r27, r18, r27 # Extract last part of source
+ ldl r18, (r21) # Load first LW of dst
+ addq r16, r20, r20 # align+len-4 of dst
+ or r28, r27, r28 # Complete source
+ mskql r28, r19, r28 # Trim source to length
+ mskql r18, r16, r18 # Make room in dst
+ insql r28, r16, r27 # Position src like dst
+ addq r16, r19, r19 # Align+len-8 of dst
+ or r27, r18, r18 # Merge
+ stl r18, (r21) # Store first LW of dst
+ extql r27, 4, r27 # Position next LW of src
+ blt r19, zz # Skip if not a whole LW
+ stl r27, 4(r21) # Store the whole LW
+ addq r21, 4, r21 # Adjust pointer
+ subq r20, 4, r20 # Adjust ending alignment
+ beq r19, donezz # Exit if done
+ insqh r28, r16, r27 # Position remainder of src
+zz: ldl r28, 4(r21) # Load last dst LW
+ mskqh r28, r20, r28 # Make room in dst
+ or r28, r27, r27 # Merge
+ stl r27, 4(r21) # Final store
+donezz: addq r21, r20, r16 # End address -4
+ addq r16, 4, r16
+ ret r31, (r26)
+
+lw: extqh r27, r18, r27 # Extract last part of source
+ addq r21, 4, r16 # Adjust for return value
+ beq r20, lwdone # Skip if exactly 4 bytes
+ ldl r19, 4(r21) # Load next dst LW
+ or r27, r28, r28 # Complete source
+ stl r28, (r21) # Store first LW
+ extql r28, 4, r28 # Position rest of source
+ mskqh r19, r20, r27 # Make room in dst
+ mskql r28, r20, r28 # Trim src
+ or r27, r28, r28 # Merge
+ stl r28, 4(r21)
+ addq r16, r20, r16 # Update return value
+ ret r31, (r26)
+
+lwdone: or r27, r28, r28 # Merge
+ stl r28, (r21)
+ ret r31, (r26)
+
+ # Move 4 bytes to an aligned LW.
+ #
+simple: ldq_u r27, 3(r18) # Load last QW of source
+ extql r28, r18, r28 # Position first QW
+ addq r21, 4, r16 # Point to end of dst for return
+ extqh r27, r18, r27 # Position last QW
+ or r28, r27, r28 # Merge
+ stl r28, (r21) # Store
+ ret r31, (r26)
+
+
+ # Dst is not aligned. Check whether first write is to a LW or a QW,
+ # and whether that finishes the move. Then see if src alignment
+ # matches, and read/rewrite the first dst quadword.
+ #
+ # r16 = dst alignment in QW
+ # r17 = remaining length for return
+ # r18-> src
+ # r19
+ # r20 = len-4
+ # r21-> LW-aligned dst
+ # r27 = QW_alignment + length - 4
+ # r28 = first src QW
+
+ #.align quad
+
+part: subq r27, 4, r19 # Does dst end in first QW?
+ ldq_u r27, (r21) # Load first dst QW
+ blt r19, shortu # Go handle short store
+ and r16, 4, r19 # Does it start in high LW?
+ subq r18, r16, r18 # Adjust src for this partial move
+ beq r19, quad # Whole QW to be touched
+ extql r28, r18, r19 # Position first part of source
+ ldq_u r28, 7(r18) # Get next (or same) src QW
+ mskql r27, r16, r27 # Trim destination for merge
+ addq r20, r16, r20 # Len + alignment...
+ extqh r28, r18, r28 # Position second part of source
+ subq r20, 4, r20 # Len+alignment-8 = remaining len
+ or r28, r19, r28 # Pieces of source
+ mskqh r28, r16, r19 # Trim junk preceding source
+ ldq_u r28, 7(r18) # Get src QW again **
+ or r27, r19, r19 # Combine other source piece
+ extql r19, 4, r19 # Get the high LW
+ stl r19, (r21) # Store just that
+
+ # Now at a QW boundary. Is there a QW left to store?
+ # Is the source QW aligned?
+
+ andnot r21, 7, r21 # Adjust dst pointer to next-8
+ subq r20, 8, r19 # Got a QW more?
+ and r18, 7, r27 # Src aligned?
+ blt r19, short3 # Too short
+ addq r21, 8, r21
+ subq r20, 8, r20
+ ldq_u r28, 8(r18)
+ addq r18, 8, r18
+ subq r20, 32, r19 # Prepare for unrolled loop
+ beq r27, align # Alignment matches
+ or r31, r31, r27
+ or r31, r31, r16
+ br r31, misal
+
+shortu: addq r18, r20, r20 # Point to end-4 of src
+ ldq_u r20, 3(r20) # Get last QW of source
+ extql r28, r18, r28 # Fetch first QW of source
+ extqh r20, r18, r20 # Fetch last QW of source
+ mskql r27, r16, r18 # Clear from start thru end of dst
+ mskqh r27, r19, r27 # Clear from 0 to end of dst
+ or r28, r20, r28 # Combine src pieces
+ insql r28, r16, r28 # Position src
+ or r27, r18, r27 # Combine dst pieces
+ mskql r28, r19, r28 # Trim src
+ addq r21, r19, r20 # Final pointer for return
+ or r28, r27, r28 # Merge src & dst
+ stq_u r28, (r21) # Store it
+ addq r20, 8, r16
+ ret r31, (r26)
+
+quad: and r18, 7, r19 # Is src address now aligned?
+ subq r20, 4, r20 # Get length-8
+ bne r19, misal # Go handle mismatched alignment
+ mskqh r28, r16, r28 # Keep desired part of source
+ addq r20, r16, r20 # Adjust count for this partial move
+ mskql r27, r16, r27 # Keep desired part of destination QW
+ subq r20, 32, r19 # At least 4 quadwords left to move?
+ or r27, r28, r28 # Merge source and destination
+
+ # Src alignment matches.
+ # r16
+ # r17 = remaining length for return
+ # r18 = next src pointer -8
+ # r19 = remaining length -32
+ # r20
+ # r21 = dst pointer
+ # r27
+ # r28 = dst quadword
+
+align: and r19, 24, r20 # How many after a multiple of 4?
+ bge r19, unrol1 # Taken branch for long strings
+ nop
+short1: and r19, 7, r19 # How many odd bytes?
+ beq r20, last28 # Skip if no more whole QWs after r28
+ ldq r27, 8(r18) # Load next QW
+ addq r18, 8, r18
+ stq r28, (r21) # Store prior QW
+ subq r20, 16, r20 # Map 8/16/24 to -8/0/8
+ addq r21, 8, r21
+ blt r20, last27 # Skip if no more after r27
+ ldq r28, 8(r18) # Load next QW
+ addq r18, 8, r18
+ stq r27, (r21) # Store prior QW
+ addq r21, 8, r21
+ nop
+ beq r20, last28
+ ldq r27, 8(r18) # Load next QW
+ addq r18, 8, r18
+ stq r28, (r21) # Store prior QW
+ addq r21, 8, r21
+last27: beq r19, done27 # Skip if no odd bytes
+ ldq r28, 8(r18) # Load one more src QW
+ ldq r18, 8(r21) # Load last destination QW
+ subq r19, 4, r16 # More than 4 bytes to store?
+ stq r27, (r21) # Store prior QW
+ mskql r28, r19, r27 # Trim source
+ mskqh r18, r19, r18 # Trim destination
+ ble r16, lastl # Go store just a LW
+lastq: addq r21, r19, r21 # End-8 of dst for return
+ or r27, r18, r27 # Merge src & dst
+done27: stq_u r27, 7(r21) # Store last destination QW
+ addq r21, 8, r16 # End of dst for return
+ ret r31, (r26)
+
+short3: addq r18, r20, r16 # Point to end-8 of src
+ beq r20, donexx # Completely done?
+ ldq_u r19, 7(r16) # Load final src QW
+ subq r20, 4, r16 # Got more than a LW?
+ beq r27, joinx # Don't include prior src if aligned
+ extql r28, r18, r27 # Last part of prior src QW
+ extqh r19, r18, r19 # First part of this src QW
+ br joinx
+
+donexx: addq r21, r20, r16
+ addq r16, 8, r16
+ ret r31, (r26)
+
+last28: beq r19, done28 # Skip if no odd bytes
+ ldq r27, 8(r18) # Load one more src QW
+ ldq r18, 8(r21) # Load last destination QW
+ subq r19, 4, r16 # More than 4 bytes to store?
+ stq r28, (r21) # Store prior QW
+ mskql r27, r19, r27 # Trim source
+ mskqh r18, r19, r18 # Trim destination
+ bgt r16, lastq # Go store a QW
+lastl: addq r19, 8, r19 # Increment length for return
+ or r27, r18, r27 # Merge src & dst
+ stl r27, 8(r21) # Store last destination LW
+ addq r21, r19, r16 # End of dst for return
+ ret r31, (r26)
+
+shortq: addq r18, r20, r16 # Point to end-8 of src
+ ldq r27, (r21) # Get dst QW
+ extql r28, r18, r28 # Position first src QW
+ ldq_u r19, 7(r16) # Get last QW of src
+ mskqh r27, r20, r27 # Mask dst QW
+ extqh r19, r18, r19 # Position last src QW
+ or r19, r28, r28 # Merge
+ mskql r28, r20, r28 # Trim src QW
+done_u: addq r21, r20, r21 # End-8 of dst for return
+ or r28, r27, r28 # Combine pieces
+done28: stq_u r28, 7(r21) # Store last destination QW
+ addq r21, 8, r16 # End of dst for return
+ ret r31, (r26)
+
+ # Unrolled loop for long moves with matching alignment within QW.
+ # Each iteration moves two cache blocks.
+ # We try to schedule the cache misses to avoid a double miss
+ # in EV4 pass 2.1 chips. If the source alignment within a cache
+ # block is exactly 3, alter it, since that case runs slower.
+ #
+ # R16
+ # R17 = remaining length for return
+ # R18 = src pointer
+ # R19 = remaining length (to load) - 32
+ # R20 = length & 24 (needed at return)
+ # R21 = dst pointer
+ # R27
+ # R28 = QW from 0(R18) to store at 0(R21), both on input and at return
+ #
+
+ #.align quad
+
+unrol1: ldq r27, 32(r18) # Cache miss here; later loads hit.
+ subq r19, 48, r16 # Six more quadwords?
+ and r18, 16, r20 # Starting in 2nd half of cache block?
+ blt r16, uent1 # If not 6 more, don't adjust.
+ ldq r16, 8(r18)
+ beq r20, utop1 # If in 1st half, don't adjust.
+ ldq r27, 48(r18) # Cache miss here
+ addq r18, 16, r18
+ stq r28, (r21) # Adjust by going ahead 1/2 block.
+ addq r21, 16, r21
+ ldq r28, (r18)
+ subq r19, 16, r19
+ stq r16, -8(r21)
+ nop
+ ldq r16, 8(r18)
+utop1: subq r19, 32, r19
+
+uloop1: ldq r20, 64(r18) # Cache miss here
+ stq r28, (r21)
+ ldq r28, 16(r18)
+ stq r16, 8(r21)
+ ldq r16, 24(r18)
+ addq r18, 64, r18
+ stq r28, 16(r21)
+ mov r20, r28
+ stq r16, 24(r21)
+ addq r21, 64, r21
+ ldq r20, -24(r18)
+ subq r19, 32, r19
+ blt r19, uexit1
+ ldq r16, 32(r18) # Cache miss here
+ stq r27, -32(r21)
+ ldq r27, -16(r18)
+ stq r20, -24(r21)
+ ldq r20, -8(r18)
+ stq r27, -16(r21)
+ mov r16, r27
+ stq r20, -8(r21)
+uent1: subq r19, 32, r19
+ ldq r16, 8(r18)
+ bge r19, uloop1
+
+ # finish last block of 4 quadwords
+ #
+ubot1: stq r28, (r21)
+ mov r27, r28 # Position last QW for return
+ ldq r27, 16(r18)
+ addq r18, 32, r18
+ stq r16, 8(r21)
+ addq r21, 32, r21
+uex1a: ldq r16, -8(r18)
+ and r19, 24, r20 # Recover count of remaining QW's
+ stq r27, -16(r21)
+ stq r16, -8(r21)
+ br r31, short1
+
+ nop
+uexit1: stq r27, -32(r21) # Here if exit from middle of loop
+ ldq r27, -16(r18)
+ stq r20, -24(r21)
+ br r31, uex1a # Join common exit sequence
+
+ #.align quad
+
+unrol2: ldq_u r16, 16(r18) # Load next src QW
+ extql r19, r18, r19 # Get last part of prior one
+ or r28, r27, r28 # Combine pieces
+ stq r28, (r21) # Store prior dst QW
+ subq r20, 24, r20 # Update loop counter
+ extqh r16, r18, r28 # Get first part of a src QW
+ ldq_u r27, 24(r18) # Load next src QW
+ extql r16, r18, r16 # Get last part of prior one
+ or r28, r19, r28 # Combine pieces
+ stq r28, 8(r21) # Store prior dst QW
+ addq r21, 24, r21 # Update dst pointer
+ extqh r27, r18, r28 # Get first part of a src QW
+ ldq_u r19, 32(r18) # Load next src QW
+ extql r27, r18, r27 # Get last part of prior one
+ or r28, r16, r28 # Combine pieces
+ stq r28, -8(r21) # Store prior dst QW
+ addq r18, 24, r18 # Update src pointer
+ extqh r19, r18, r28 # Get first part of a src QW
+ bge r20, unrol2 # Repeat as needed
+ addq r20, 16, r16 # How many whole quadwords left?
+ br r31, short2 # Go handle leftovers
+ nop
+
+ # Must move in reverse order because of overlap.
+ # r16 = dst address
+ # r17 = remaining length for return
+ # r18 = src address
+ # r19
+ # r20 = len-4 (>= 0)
+ # r21
+ # r27
+ # r28
+
+ # Not yet LW-granularity...
+
+reverse:
+ subq r20, 4, r20 # This code expects len-8
+ addq r20, r18, r18 # Point to end-8 of source
+ addq r20, r16, r19 # Point to end-8 of destination
+ and r19, 7, r21 # Is destination aligned?
+ ldq_u r28, 7(r18) # Get source QW
+ addq r19, 8, r16 # Point to end of dst for return
+ bne r21, rpart # Skip if partial write needed
+ and r18, 7, r27 # Is source aligned too?
+ beq r27, ralign # Skip if so
+ ldq_u r21, (r18) # Handle aligned dst, unaligned src
+ subq r20, 8, r20
+ extqh r28, r18, r28
+ extql r21, r18, r27
+ br r31, rwhole
+
+rmis: ldq_u r21, (r18) # Load same or preceding src QW
+ extqh r28, r18, r28 # Get last part of source to store
+ mskqh r27, r16, r27 # Keep high-address part of dst
+ extql r21, r18, r21
+ subq r20, 8, r20 # How many more whole QW's?
+ or r21, r28, r28
+ ldq_u r21, (r18) # Reload source QW
+ mskql r28, r16, r28 # Trim source to length
+rwhole: blt r20, rlast2 # Skip if no more whole QW's
+rloop2: or r28, r27, r28 # Combine pieces
+ stq r28, (r19)
+rent2: extqh r21, r18, r27
+ ldq_u r21, -8(r18)
+ subq r20, 8, r20
+ subq r19, 8, r19
+ extql r21, r18, r28
+ subq r18, 8, r18
+ bge r20, rloop2
+rlast2: and r20, 7, r20
+ beq r20, rdone2
+ or r28, r27, r28
+ subq r18, r20, r27
+ stq r28, (r19)
+rl2ent: subq r31, r20, r20
+ ldq_u r27, (r27)
+ extqh r21, r18, r21
+ ldq r28, -8(r19)
+ subq r19, 8, r19
+ extql r27, r18, r27
+ mskql r28, r20, r28
+ or r27, r21, r27
+ mskqh r27, r20, r27
+ and r20, 4, r21 # Ending in high LW?
+ bne r21, rdone3 # Only longword store at the end
+rdone2: or r28, r27, r28
+ stq r28, (r19)
+ ret r31, (r26)
+
+rdone3: or r28, r27, r28
+ extql r28, 4, r28
+ stl r28, 4(r19)
+ ret r31, (r26)
+
+rpart: ldq_u r27, 7(r19) # Get dst QW
+ subq r21, 8, r21 # Get negative of bytes not moved
+ subq r18, r21, r18 # From src-8, get src after partial
+ subq r20, r21, r20 # Adjust length for partial move
+ subq r19, r21, r19 # Adjust dst pointer
+ addq r21, 4, r21 # End alignment - 4
+ ble r21, r_lw # Only storing the low longword?
+ and r18, 7, r21 # Src alignment now matching dst?
+ bne r21, rmis # Go back if not
+ mskql r28, r16, r28 # Keep low addresses of src QW
+ mskqh r27, r16, r27 # Keep high address of dst QW
+ralign: subq r20, 8, r20 # How many more whole QW's?
+ or r27, r28, r28 # Combine
+ blt r20, rlast1 # Skip if this is the end
+rloop1: stq r28, (r19) # Store one QW
+rent1: subq r20, 8, r20 # Decrement length
+ ldq r28, -8(r18) # Load preceding QW
+ subq r19, 8, r19 # Decrement dst pointer
+ subq r18, 8, r18 # Decrement src pointer
+ bge r20, rloop1 # Repeat for each whole QW
+rlast1: and r20, 7, r20 # How many odd bytes?
+ beq r20, rdone # Skip if none
+ ldq r27, -8(r18) # Get another source QW
+ subq r31, r20, r20 # Get byte # to end at
+ stq r28, (r19)
+rl_ent: ldq r28, -8(r19)
+ subq r19, 8, r19 # Adjust dst pointer again
+ mskqh r27, r20, r27 # Keep top of src QW
+ and r20, 4, r21 # Ending in high LW?
+ mskql r28, r20, r28 # Keep bottom of dst QW
+ bne r21, rdone4 # Only longword store at the end
+ or r27, r28, r28 # Combine
+rdone: stq r28, (r19) # Store last QW
+ ret r31, (r26)
+
+rdone4: or r27, r28, r28 # Combine
+ extql r28, 4, r28 # Get high part
+ stl r28, 4(r19) # Store last LW
+ ret r31, (r26)
+
+r_lw: and r18, 7, r21 # Src alignment now matching dst?
+ bne r21, rmislw # Go back if not
+ mskql r28, r16, r28 # Keep low addresses of src QW
+ mskqh r27, r16, r27 # Keep high address of dst QW
+ subq r20, 8, r20 # How many more whole QW's?
+ or r27, r28, r28 # Combine
+ blt r20, rlast1_lw # Skip if this is the end
+ stl r28, (r19) # Store low LW
+ br r31, rent1
+
+rlast1_lw:
+ and r20, 7, r20 # How many odd bytes?
+ ldq r27, -8(r18) # Get another source QW
+ subq r31, r20, r20 # Get byte # to end at
+ stl r28, (r19)
+ br rl_ent
+
+rmislw: ldq_u r21, (r18) # Load same or preceding src QW
+ extqh r28, r18, r28 # Get last part of source to store
+ mskqh r27, r16, r27 # Keep high-address part of dst
+ extql r21, r18, r21
+ subq r20, 8, r20 # How many more whole QW's?
+ or r21, r28, r28
+ ldq_u r21, (r18) # Reload source QW
+ mskql r28, r16, r28 # Trim source to length
+ blt r20, rlast2_lw # Skip if no more whole QW's
+ or r28, r27, r28 # Combine pieces
+ stl r28, (r19)
+ br r31, rent2
+
+rlast2_lw:
+ and r20, 7, r20
+ or r28, r27, r28
+ subq r18, r20, r27
+ stl r28, (r19)
+ br r31, rl2ent
+
+ .set at
+ .set reorder
+ .end _OtsMoveMinimum
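
For reference, the INOUT contract described in the routine's header can be modeled in C. This is a minimal sketch, not part of the original source: it assumes only what the header comment and the first two instructions state, namely that the routine performs an overlap-safe move of min(dstlen, srclen) bytes (cmovlt computes that minimum into r19) and returns the advanced destination pointer in r16 and the remaining destination length in r17. The type and function names below are hypothetical.

    #include <stddef.h>
    #include <string.h>

    /* Hypothetical C model of _OtsMoveMinimum's register contract. */
    typedef struct {
        char *dstptr;   /* r16 on return: address following the move   */
        long  dstlen;   /* r17 on return: remaining destination length */
    } OtsMoveResult;

    static OtsMoveResult ots_move_minimum_model(char *dstptr, long dstlen,
                                                const char *srcptr, long srclen)
    {
        long n = (dstlen < srclen) ? dstlen : srclen;  /* min, as cmovlt does */
        memmove(dstptr, srcptr, (size_t)n);            /* overlap-safe move   */
        OtsMoveResult r = { dstptr + n, dstlen - n };
        return r;
    }

The assembly earns its keep over this model by retiring short moves in a handful of instructions and by unrolling and scheduling the long-move loops around EV4 cache behavior.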
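Throughout the routine, unaligned source data is read with the classic Alpha idiom ldq_u/extql/extqh/or: two aligned quadword loads plus byte-extract instructions reconstruct an unaligned quadword without taking alignment traps. A little-endian C model of that idiom, offered as an illustration and assuming (as ldq_u itself does) that both enclosing aligned quadwords lie in mapped memory:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative model of reading an unaligned quadword at p. */
    static uint64_t load_unaligned_quad(const uint8_t *p)
    {
        uintptr_t a = (uintptr_t)p, ofs = a & 7;
        uint64_t q0, q1;
        memcpy(&q0, (const void *)(a & ~(uintptr_t)7), 8);       /* ldq_u (p)   */
        memcpy(&q1, (const void *)((a + 7) & ~(uintptr_t)7), 8); /* ldq_u 7(p)  */
        uint64_t low  = q0 >> (8 * ofs);                         /* extql q0, p */
        uint64_t high = ofs ? q1 << (64 - 8 * ofs) : 0;          /* extqh q1, p */
        return low | high;                                       /* or          */
    }

On the store side, mskql/mskqh and insql/insqh play the complementary role, zeroing or positioning bytes within a quadword so the source can be merged into a read-modify-write of the destination; that merge is how the routine achieves longword granularity at the edges of the move.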