author      Adam <you@example.com>  2020-05-17 05:51:50 +0200
committer   Adam <you@example.com>  2020-05-17 05:51:50 +0200
commit      e611b132f9b8abe35b362e5870b74bce94a1e58e (patch)
tree        a5781d2ec0e085eeca33cf350cf878f2efea6fe5 /private/ntos/rtl/alpha/mvmem.s
Diffstat (limited to 'private/ntos/rtl/alpha/mvmem.s')
-rw-r--r--  private/ntos/rtl/alpha/mvmem.s  1920
1 file changed, 1920 insertions, 0 deletions
diff --git a/private/ntos/rtl/alpha/mvmem.s b/private/ntos/rtl/alpha/mvmem.s
new file mode 100644
index 000000000..c5ccc9a81
--- /dev/null
+++ b/private/ntos/rtl/alpha/mvmem.s
@@ -0,0 +1,1920 @@
+//      TITLE("Compare, Move, Zero, and Fill Memory Support")
+//++
+//
+// Copyright (c) 1992  Digital Equipment Corporation
+//
+// Module Name:
+//
+//    mvmem.s
+//
+// Abstract:
+//
+//    This module implements functions to compare, move, zero, and fill
+//    blocks of memory. If the memory is aligned, then these functions
+//    are very efficient.
+//
+//    N.B. These routines MUST preserve all floating state since they are
+//         frequently called from interrupt service routines that normally
+//         do not save or restore floating state.
+//
+// Author:
+//
+//    Joe Notarangelo 21-May-1992
+//
+// Environment:
+//
+//    User or Kernel mode.
+//
+// Revision History:
+//
+//    Monty VanderBilt 14-Feb-1996 Avoid memory loads and branch takens between
+//                                 load lock and store conditional instructions
+//                                 to conform with all alpha architecture rules.
+//    Monty VanderBilt 27-Feb-1996 Added RtlZeroBytes and RtlFillBytes to support
+//                                 byte granularity access when necessary.
+//--
+
+#include "ksalpha.h"
+
+        SBTTL("Compare Memory")
+//++
+//
+// ULONG
+// RtlCompareMemory (
+//    IN PVOID Source1,
+//    IN PVOID Source2,
+//    IN ULONG Length
+//    )
+//
+// Routine Description:
+//
+//    This function compares two blocks of memory and returns the number
+//    of bytes that compared equal.
+//
+// Arguments:
+//
+//    Source1 (a0) - Supplies a pointer to the first block of memory to
+//       compare.
+//
+//    Source2 (a1) - Supplies a pointer to the second block of memory to
+//       compare.
+//
+//    Length (a2) - Supplies the length, in bytes, of the memory to be
+//       compared.
+//
+// Return Value:
+//
+//    The number of bytes that compared equal is returned as the function
+//    value. If all bytes compared equal, then the length of the original
+//    block of memory is returned.
+//
+//--
+
+        LEAF_ENTRY(RtlCompareMemory)
+
+        bis     a2, zero, v0            // save length of comparison
+        beq     a2, 90f                 // quit if nothing to compare
+        xor     a0, a1, t0              // check for compatible alignment
+        and     t0, 0x7, t0             // low bits only
+        bne     t0, CompareUnaligned    // if ne, incompatible alignment
+
+//
+// Compare memory aligned
+//
+
+CompareAligned:                         //
+
+//
+// compare memory until sources are aligned
+//
+        and     a0, 0x7, t0             // get low bits
+        bne     t0, 10f                 // if ne, sources not aligned yet
+        br      zero, 30f               // already aligned, predicted
+
+10:
+        ldq_u   t1, 0(a0)               // get unaligned quad at source 1
+        ldq_u   t2, 0(a1)               // get unaligned quad at source 2
+
+20:
+        extbl   t1, t0, t4              // byte at t0 in source 1 quad
+        extbl   t2, t0, t5              // byte at t0 in source 2 quad
+        xor     t4, t5, t3              // t1 = t2 ?
+        bne     t3, 110f                // not equal, miscompare
+        subq    a2, 1, a2               // decrement bytes to compare
+        beq     a2, 90f                 // if eq, compare success
+        addq    t0, 1, t0               // increment pointer within quad
+        cmpeq   t0, 8, t3               // t0 = 8?, if so first quadword done
+        beq     t3, 20b                 // continue while t0 < 8
+
+        addq    a0, 8, a0               // increment to next quadword
+        addq    a1, 8, a1               // increment source 2 to next also
+        bic     a0, 7, a0               // align source 1 quadword
+        bic     a1, 7, a1               // align source 2 quadword
+
+//
+// aligned block compare, compare blocks of 64 bytes
+//
+
+30:
+        srl     a2, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 50f                 // if eq, no 64 byte blocks
+
+//
+// N.B. Loads from each of the sources were separated in case these
+//      blocks are fighting for the cache
+//
+        .set    noat
+40:
+        ldq     t1, 0(a0)               // t1 = source 1, quad 0
+        ldq     t2, 8(a0)               // t2 = source 1, quad 1
+        ldq     t3, 16(a0)              // t3 = source 1, quad 2
+        addq    a1, 64, a1              // increment source 2 pointer
+        ldq     t4, 24(a0)              // t4 = source 1, quad 3
+
+        ldq     t5, -64(a1)             // t5 = source 2, quad 0
+        ldq     a4, -56(a1)             // a4 = source 2, quad 1
+        ldq     a5, -48(a1)             // a5 = source 2, quad 2
+        xor     t1, t5, $at             // quad 0 match?
+        bne     $at, 200f               // if ne[false], miscompare
+        ldq     t5, -40(a1)             // t5 = source 2, quad 3
+        ldq     t1, 32(a0)              // t1 = source 1, quad 4
+        xor     t2, a4, $at             // quad 1 match?
+        bne     $at, 122f               // if ne[false], miscompare
+        ldq     t2, 40(a0)              // t2 = source 1, quad 5
+        xor     t3, a5, $at             // quad 2 match?
+        bne     $at, 124f               // if ne[false], miscompare
+        ldq     t3, 48(a0)              // t3 = source 1, quad 6
+        xor     t4, t5, $at             // quad 3 match?
+        bne     $at, 126f               // if ne[false], miscompare
+        ldq     t4, 56(a0)              // t4 = source 1, quad 7
+
+        ldq     t5, -32(a1)             // t5 = source 2, quad 4
+        addq    a0, 64, a0              // increment source 1 pointer
+        ldq     a4, -24(a1)             // a4 = source 2, quad 5
+        subq    t0, 1, t0               // decrement blocks to compare
+        ldq     a5, -16(a1)             // a5 = source 2, quad 6
+        xor     t1, t5, $at             // quad 4 match?
+        bne     $at, 130f               // if ne[false], miscompare
+        ldq     t5, -8(a1)              // t5 = source 2, quad 7
+        xor     t2, a4, $at             // quad 5 match?
+        bne     $at, 132f               // if ne[false], miscompare
+        xor     t3, a5, $at             // quad 6 match?
+        bne     $at, 134f               // if ne[false], miscompare
+        xor     t4, t5, $at             // quad 7 match?
+        bne     $at, 136f               // if ne[false], miscompare
+        subq    a2, 64, a2              // decrement bytes to compare
+        bne     t0, 40b                 // if ne, more blocks to compare
+        .set    at
+
+//
+// Compare quadwords
+//
+
+50:
+        srl     a2, 3, t0               // t0 = number of quadwords to compare
+        beq     t0, 70f                 // if eq, no quadwords to compare
+
+        .set    noat
+60:
+        ldq     t1, 0(a0)               // t1 = quad from source 1
+        lda     a0, 8(a0)               // increment source 1 pointer
+        ldq     t2, 0(a1)               // t2 = quad from source 2
+        lda     a1, 8(a1)               // increment source 2 pointer
+        xor     t1, t2, $at             // are quadwords equal?
+        bne     $at, 200f               // if ne, miscompare
+        subq    t0, 1, t0               // decrement quads to compare
+        subq    a2, 8, a2               // decrement bytes to compare
+        bne     t0, 60b                 // if ne, more quads to compare
+
+        .set    at
+
+//
+// Compare bytes in last quadword
+//
+
+// a2 = number of bytes to compare, less than 8, greater than zero
+// a0, a1, quad-aligned to last quadword
+
+        beq     a2, 80f                 // if eq, all bytes compared
+
+        .set    noat
+70:
+        ldq     t1, 0(a0)               // t1 = quad at source 1
+        ldq     t2, 0(a1)               // t2 = quad at source 2
+        bis     zero, 0xff, t0          // zap mask
+        sll     t0, a2, t0              //
+        zap     t1, t0, t1              // zero bytes not compared
+        zap     t2, t0, t2              // same for source 2
+        xor     t1, t2, $at             // compare quadwords
+        bne     $at, 200f               // if ne, miscompare
+
+        .set    at
+//
+// Successful compare
+// v0 already contains full length
+//
+
+80:
+        ret     zero, (ra)              // return
+
+//
+// Sources have incompatible alignment
+//
+CompareUnaligned:
+
+//
+// Compare until source 1 (a0) is aligned
+//
+
+        and     a0, 0x7, t0             // get byte position of pointer
+        beq     t0, 30f                 // if eq, already aligned
+
+        ldq_u   t1, 0(a0)               // get unaligned quad at a0
+
+10:
+        ldq_u   t2, 0(a1)               // get unaligned quad at a1
+        extbl   t1, t0, t4              // get byte to compare from source 1
+        extbl   t2, a1, t2              // get byte to compare from source 2
+        xor     t4, t2, t3              // do bytes match?
+        bne     t3, 110f                // if ne, miscompare
+        subq    a2, 1, a2               // decrement bytes to compare
+        beq     a2, 90f                 // quit if nothing left to compare
+        addq    t0, 1, t0               // increment byte within source 1
+        addq    a1, 1, a1               // increment source 2 pointer
+        cmpeq   t0, 8, t3               // finished with source 1 quad?
+        beq     t3, 10b                 // if eq[false], more to compare
+
+        addq    a0, 7, a0               // point to next source 1 quad
+        bic     a0, 7, a0               // align to quadword
+
+//
+// Compare 64-byte blocks
+//
+
+30:
+        srl     a2, 6, t0               // t0 = number of blocks to compare
+        beq     t0, 50f                 // if eq, no blocks to move
+
+        ldq_u   t1, 0(a1)               // get source 2 unaligned quad 1
+
+        .set    noat
+40:
+        ldq_u   t2, 7(a1)               // get source 2 unaligned quad 2
+        addq    a0, 64, a0              // increment source 1 pointer
+        ldq_u   t3, 15(a1)              // get source 2 unaligned quad 3
+        extql   t1, a1, t1              // bytes from unaligned quad 1
+        extqh   t2, a1, $at             // bytes from unaligned quad 2
+        ldq_u   t4, 23(a1)              // get source 2 unaligned quad 4
+        bis     t1, $at, t1             // t1 = quadword 1 (source 2)
+        ldq_u   t5, 31(a1)              // get source 2 unaligned quad 5
+        extql   t2, a1, t2              // bytes from unaligned quad 2
+        extqh   t3, a1, $at             // bytes from unaligned quad 3
+        ldq     a3, -64(a0)             // a3 = quadword 1 (source 1)
+        bis     t2, $at, t2             // t2 = quadword 2 (source 2)
+        ldq     a4, -56(a0)             // a4 = quadword 2 (source 1)
+        extql   t3, a1, t3              // bytes from unaligned quad 3
+        extqh   t4, a1, $at             // bytes from unaligned quad 4
+        ldq     a5, -48(a0)             // a5 = quadword 3 (source 1)
+        bis     t3, $at, t3             // t3 = quadword 3 (source 2)
+        extql   t4, a1, t4              // bytes from unaligned quad 4
+        extqh   t5, a1, $at             // bytes from unaligned quad 5
+        subq    t0, 1, t0               // decrement blocks to compare
+        bis     t4, $at, t4             // t4 = quadword 4 (source 2)
+
+        xor     t1, a3, $at             // match on quadword 1?
+        ldq     a3, -40(a0)             // a3 = quadword 4 (source 1)
+        bne     $at, 200f               // if ne, miscompare quad 1
+        xor     t2, a4, $at             // match on quadword 2?
+        ldq_u   t2, 39(a1)              // get source 2 unaligned quad 6
+        bne     $at, 122f               // if ne, miscompare quad 2
+        xor     t3, a5, $at             // match on quadword 3?
+        ldq_u   t3, 47(a1)              // get source 2 unaligned quad 7
+        bne     $at, 124f               // if ne, miscompare quad 3
+        xor     t4, a3, $at             // match on quadword 4?
+        ldq_u   t4, 55(a1)              // get source 2 unaligned quad 8
+        bne     $at, 126f               // if ne, miscompare quad 4
+        ldq_u   t1, 63(a1)              // get source 2 unaligned quad 9
+
+        ldq     a3, -32(a0)             // a3 = quadword 5 (source 1)
+        extql   t5, a1, t5              // bytes from unaligned quad 5
+        extqh   t2, a1, $at             // bytes from unaligned quad 6
+        ldq     a4, -24(a0)             // a4 = quadword 6 (source 1)
+        ldq     a5, -16(a0)             // a5 = quadword 7 (source 1)
+        bis     t5, $at, t5             // t5 = quadword 5 (source 2)
+
+        xor     t5, a3, $at             // match on quadword 5?
+        ldq     a3, -8(a0)              // a3 = quadword 8 (source 1)
+        bne     $at, 130f               // if ne, miscompare quad 5
+        extql   t2, a1, t2              // bytes from unaligned quad 6
+        extqh   t3, a1, $at             // bytes from unaligned quad 7
+        extql   t3, a1, t3              // bytes from unaligned quad 7
+        bis     t2, $at, t2             // t2 = quadword 6 (source 2)
+        xor     t2, a4, $at             // match on quadword 6?
+        bne     $at, 132f               // if ne, miscompare quad 6
+        extqh   t4, a1, $at             // bytes from unaligned quad 8
+        extql   t4, a1, t4              // bytes from unaligned quad 8
+        bis     t3, $at, t3             // t3 = quadword 7 (source 2)
+        xor     t3, a5, $at             // match on quadword 7?
+        bne     $at, 134f               // if ne, miscompare quad 7
+        extqh   t1, a1, $at             // bytes from unaligned quad 9
+        addq    a1, 64, a1              // increment source 2 pointer
+        bis     t4, $at, t4             // t4 = quadword 8 (source 2)
+        xor     t4, a3, $at             // match on quadword 8?
+        bne     $at, 136f               // if ne, miscompare quad 8
+        subq    a2, 64, a2              // decrement number of bytes to compare
+        bne     t0, 40b                 // if ne, more blocks to compare
+
+        .set    at
+
+//
+// Compare quadwords
+//
+
+50:
+        srl     a2, 3, t0               // t0 = number of quads to compare
+        beq     t0, 70f                 // if eq, no quads to compare
+        ldq_u   t1, 0(a1)               // get unaligned quad 1 (source 2)
+
+        .set    noat
+60:
+        ldq_u   t2, 7(a1)               // get unaligned quad 2 (source 2)
+        ldq     t3, 0(a0)               // t3 = quadword 1 (source 1)
+        extql   t1, a1, t1              // get bytes from unaligned quad 1
+        extqh   t2, a1, $at             // get bytes from unaligned quad 2
+        addq    a1, 8, a1               // increment source 2 pointer
+        bis     t1, $at, t1             // t1 = quadword 1 (source 2)
+        xor     t1, t3, $at             // match on quadword?
+        bne     $at, 200f               // if ne, miscompare
+        subq    t0, 1, t0               // decrement quadwords to compare
+        addq    a0, 8, a0               // increment source 1 pointer
+        subq    a2, 8, a2               // decrement bytes to compare
+        bis     t2, zero, t1            // save low quadword for next loop
+        bne     t0, 60b                 // if ne, more quads to compare
+
+        .set    at
+
+//
+// Compare bytes for final quadword
+//
+
+70:
+        beq     a2, 90f                 // if eq, comparison complete
+
+        ldq     t1, 0(a0)               // get quadword from source 1
+        bis     zero, zero, t0          // t0 = byte position to compare
+
+        .set    noat
+80:
+        ldq_u   t2, 0(a1)               // get unaligned quad from source 2
+        extbl   t1, t0, t3              // t3 = byte from source 1
+        extbl   t2, a1, t2              // t2 = byte from source 2
+        xor     t3, t2, $at             // match on byte?
+        bne     $at, 100f               // if ne, miscompare on byte
+        addq    t0, 1, t0               // increment byte position
+        addq    a1, 1, a1               // increment source 2 pointer
+        subq    a2, 1, a2               // decrement bytes to compare
+        bne     a2, 80b                 // if ne, more bytes to compare
+
+        .set    at
+//
+// Successful full comparison
+//
+
+90:
+        ret     zero, (ra)              // return, v0 already set
+
+//
+// Miscompare on last quadword
+//
+
+100:
+        subq    v0, a2, v0              // subtract bytes not compared
+        ret     zero, (ra)              // return
+
+//
+// Miscompare on first quadword, unaligned case
+//
+// v0 = total bytes to compare
+// a2 = bytes remaining to compare
+//
+
+110:
+        subq    v0, a2, v0              // bytes compared successfully
+        ret     zero, (ra)              // return
+
+//
+// Miscompare on 64-byte block compare
+//
+
+122:
+        subq    a2, 8, a2               // miscompare on quad 2
+        br      zero, 200f              // finish in common code
+
+124:
+        subq    a2, 16, a2              // miscompare on quad 3
+        br      zero, 200f              // finish in common code
+
+126:
+        subq    a2, 24, a2              // miscompare on quad 4
+        br      zero, 200f              // finish in common code
+
+130:
+        subq    a2, 32, a2              // miscompare on quad 5
+        br      zero, 200f              // finish in common code
+
+132:
+        subq    a2, 40, a2              // miscompare on quad 6
+        br      zero, 200f              // finish in common code
+
+134:
+        subq    a2, 48, a2              // miscompare on quad 7
+        br      zero, 200f              // finish in common code
+
+136:
+        subq    a2, 56, a2               // miscompare on quad 8
+        br      zero, 200f              // finish in common code
+
+//
+// Miscompare, determine number of bytes that successfully compared
+// $at = xor of relevant quads from sources, must be non-zero
+// a2 = number of bytes left to compare
+//
+        .set    noat
+200:
+        cmpbge  zero, $at, $at          // $at = mask of zero (equal) bytes
+
+        //
+        // look for the first bit cleared in $at, this is the
+        // number of the first byte which differed
+        //
+        bis     zero, zero, t0          // bit position to look for clear
+
+210:
+        blbc    $at, 220f               // if low clear, found difference
+        srl     $at, 1, $at             // check next bit
+        addq    t0, 1, t0               // count bit position checked
+        br      zero, 210b
+
+220:
+        subq    v0, a2, v0              // subtract bytes yet to compare
+        addq    v0, t0, v0              // add bytes that matched on last quad
+
+        ret     zero, (ra)
+
+        .set    at
+
+        .end    RtlCompareMemory
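The portable semantics of the routine above fit in a few lines of C. This is a reference model only (names are invented, not the shipped implementation); the assembly earns its speed by XOR-comparing whole quadwords and using CMPBGE to locate the first differing byte instead of looping byte by byte:

    #include <stddef.h>

    /* Model of RtlCompareMemory: count leading bytes that compare equal;
       returns length when the two blocks match completely. */
    static size_t compare_memory_model(const void *s1, const void *s2, size_t length)
    {
        const unsigned char *p = (const unsigned char *)s1;
        const unsigned char *q = (const unsigned char *)s2;
        size_t i;

        for (i = 0; i < length; i += 1) {
            if (p[i] != q[i])
                break;                  /* first mismatch ends the count */
        }
        return i;
    }

    /* Model of the CMPBGE trick at label 200: given x = quad1 ^ quad2 with
       x != 0, byte i of x is zero exactly when the quads agree in byte i,
       so the lowest nonzero byte of x marks the first difference. */
    static unsigned first_difference(unsigned long long x)
    {
        unsigned i = 0;

        while (((x >> (8 * i)) & 0xff) == 0)
            i += 1;                     /* mirrors the blbc/srl scan above */
        return i;
    }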
+        SBTTL("Move Memory")
+//++
+//
+// VOID
+// RtlMoveMemory (
+//    IN PVOID Destination,
+//    IN PVOID Source,
+//    IN ULONG Length
+//    )
+//
+// Routine Description:
+//
+//    This function moves memory either forward or backward, aligned or
+//    unaligned, in 64-byte blocks, followed by 8-byte blocks, followed
+//    by any remaining bytes.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the destination address of
+//       the move operation.
+//
+//    Source (a1) - Supplies a pointer to the source address of the move
+//       operation.
+//
+//    Length (a2) - Supplies the length, in bytes, of the memory to be moved.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        LEAF_ENTRY(RtlMoveMemory)
+
+        beq     a2, 80f                 // if eq, no bytes to move
+
+//
+// If the source address is less than the destination address and source
+// address plus the length of the move is greater than the destination
+// address, then the source and destination overlap such that the move
+// must be performed backwards.
+//
+
+        cmpult  a0, a1, t0              // is destination less than source
+        bne     t0, MoveForward         // if ne [true], no overlap possible
+        addq    a1, a2, t0              // compute source ending address
+        cmpult  t0, a0, t1              // is source end less than dest.
+        beq     t1, MoveBackward        // if eq [false], overlap
+
+//
+// Move memory forward aligned and unaligned.
+//
+
+MoveForward:                            //
+        xor     a0, a1, t0              // compare alignment bits
+        and     t0, 0x7, t0             // isolate alignment comparison
+        bne     t0, MoveForwardUnaligned // if ne, incompatible alignment
+
+//
+// Move memory forward aligned.
+//
+
+MoveForwardAligned:                     //
+
+//
+// Move bytes until source and destination are quadword aligned
+//
+
+        and     a0, 0x7, t0             // t0 = unaligned bits
+        bne     t0, 5f                  // if ne, not quad aligned
+        br      zero, 20f               // predicted taken
+
+5:
+        ldq_u   t2, 0(a0)               // get unaligned quad from dest.
+        ldq_u   t1, 0(a1)               // get unaligned quadword from source
+10:
+        beq     a2, 15f                 // if eq, all bytes moved
+        extbl   t1, t0, t3              // t3 = byte from source
+        insbl   t3, t0, t3              // t3 = byte from source, in position
+        mskbl   t2, t0, t2              // clear position in dest. quad
+        bis     t2, t3, t2              // merge in byte from source
+        subq    a2, 1, a2               // decrement bytes to move
+        addq    t0, 1, t0               // increment byte within quad
+        cmpeq   t0, 8, t3               // finished the quadword?
+        beq     t3, 10b                 // if eq [false], do next byte
+15:
+        stq_u   t2, 0(a0)               // store merged destination bytes
+
+        addq    a0, 7, a0               // move to next quadword
+        bic     a0, 7, a0               // aligned quadword
+
+        addq    a1, 7, a1               // move to next quadword
+        bic     a1, 7, a1               // aligned quadword
+
+//
+// Check for 64-byte block moves
+//
+
+20:
+        srl     a2, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 40f                 // if eq no blocks to move
+        and     a2, 64-1, a2            // a2 = residual bytes
+
+30:
+        ldq     t1, 0(a1)               // load 64 bytes from source
+        addq    a0, 64, a0              // increment destination pointer
+        ldq     v0, 56(a1)              //
+        ldq     a3, 32(a1)              //
+        stq     t1, -64(a0)             // write to destination
+        ldq     t2, 8(a1)               // into volatile registers
+        ldq     t3, 16(a1)              //
+        ldq     t4, 24(a1)              //
+        subq    t0, 1, t0               // decrement number of blocks
+        stq     t2, -56(a0)             //
+        ldq     a4, 40(a1)              //
+        stq     t3, -48(a0)             //
+        ldq     a5, 48(a1)              //
+        stq     t4, -40(a0)             //
+        addq    a1, 64, a1              // increment source pointer
+        stq     a3, -32(a0)             //
+        stq     a4, -24(a0)             //
+        stq     a5, -16(a0)             //
+        stq     v0, -8(a0)              //
+        bne     t0, 30b                 // if ne, more blocks to copy
+
+//
+// Copy quadwords
+//
+
+40:
+        srl     a2, 3, t0               // t0 = number of quadwords to move
+        beq     t0, 60f                 // if eq no quadwords to move
+        and     a2, 8-1, a2             // a2 = residual bytes
+
+50:
+        ldq     t1, 0(a1)               // load quadword from source
+        addq    a1, 8, a1               // increment source pointer
+        stq     t1, 0(a0)               // store quadword to destination
+        addq    a0, 8, a0               // increment destination pointer
+        subq    t0, 1, t0               // decrement number of quadwords
+        bne     t0, 50b                 // if ne, more quadwords to move
+
+//
+// Move final residual bytes
+//
+
+60:
+        beq     a2, 80f                 // if eq, no more bytes to move
+        ldq     t1, 0(a1)               // get last source quadword
+        ldq     t2, 0(a0)               // get last dest. quadword
+        bis     zero, zero, t0          // t0 = next byte number to move
+
+70:
+        extbl   t1, t0, t3              // extract byte from source
+        insbl   t3, t0, t3              // t3 = source byte, in position
+        mskbl   t2, t0, t2              // clear byte position for dest.
+        bis     t2, t3, t2              // merge in source byte
+        addq    t0, 1, t0               // increment byte position
+        subq    a2, 1, a2               // decrement bytes to move
+        bne     a2, 70b                 // if ne => more bytes to move
+
+        stq     t2, 0(a0)               // store merged data
+
+//
+// Finish aligned MoveForward
+//
+
+80:
+        ret     zero, (ra)              // return
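The unaligned paths that follow rest on one idiom: a misaligned quadword spans at most two aligned quadwords, and the EXTQL/EXTQH pair extracts the bytes each aligned load contributes. A little-endian C model of that idiom (a sketch with an invented helper name; the real LDQ_U/EXTQL/EXTQH sequence needs no shift-count guard):

    #include <stdint.h>

    /* Compose the quadword at unaligned address p from the two aligned
       quadwords that straddle it, as ldq_u + extql/extqh do. */
    static uint64_t load_unaligned_quad(const unsigned char *p)
    {
        uintptr_t addr = (uintptr_t)p;
        const uint64_t *lo = (const uint64_t *)(addr & ~(uintptr_t)7);       /* ldq_u 0(p) */
        const uint64_t *hi = (const uint64_t *)((addr + 7) & ~(uintptr_t)7); /* ldq_u 7(p) */
        unsigned shift = (unsigned)(addr & 7) * 8;

        if (shift == 0)
            return *lo;                   /* already aligned: one load suffices */
        return (*lo >> shift)             /* extql: bytes from the low quad  */
             | (*hi << (64 - shift));     /* extqh: bytes from the high quad */
    }

Note how the block loops below reuse the high quadword of one step as the low quadword of the next (the "bis t2, zero, t1" at each loop bottom), so every additional output quadword costs only a single new load.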
+//
+// Move memory forward unaligned.
+//
+
+MoveForwardUnaligned:                   //
+
+//
+// Move bytes until the destination is aligned
+//
+
+        and     a0, 0x7, t0             // t0 = unaligned bits
+        beq     t0, 100f                // if eq, destination quad aligned
+
+        ldq_u   t2, 0(a0)               // get unaligned quad from dest
+
+90:
+        beq     a2, 95f                 // if eq no more bytes to move
+        ldq_u   t1, 0(a1)               // get unaligned quad from source
+        extbl   t1, a1, t1              // extract source byte
+        insbl   t1, t0, t1              // t1 = source byte, in position
+        mskbl   t2, t0, t2              // clear byte position in dest.
+        bis     t2, t1, t2              // merge in source byte
+        addq    t0, 1, t0               // increment byte position
+        addq    a1, 1, a1               // increment source pointer
+        subq    a2, 1, a2               // decrement bytes to move
+        cmpeq   t0, 8, t3               // t0 = 8? => quad finished
+        beq     t3, 90b                 // if eq [false], more bytes to move
+95:
+        stq_u   t2, 0(a0)               // store merged quadword
+        addq    a0, 7, a0               // increment to next quad
+        bic     a0, 7, a0               // align next quadword
+
+//
+// Check for 64-byte blocks to move
+//
+
+100:
+        srl     a2, 6, t0               // t0 = number of blocks to move
+        beq     t0, 120f                // if eq no blocks to move
+        and     a2, 64-1, a2            // a2 = residual bytes to move
+
+        ldq_u   t1, 0(a1)               // t1 = first unaligned quad
+
+110:
+                                        // get source data and merge it
+                                        // as we go
+        ldq_u   t2, 7(a1)               // t2 = second unaligned quad
+        extql   t1, a1, t1              // extract applicable bytes from t1
+        extqh   t2, a1, v0              // extract applicable bytes from t2
+        bis     t1, v0, t1              // t1 = quad #1
+        ldq_u   t3, 15(a1)              // t3 = third unaligned quad
+        extql   t2, a1, t2              // extract applicable bytes from t2
+        extqh   t3, a1, v0              // extract applicable bytes from t3
+        stq     t1, 0(a0)               // store quad #1
+        bis     t2, v0, t2              // t2 = quad #2
+        ldq_u   t4, 23(a1)              // t4 = fourth unaligned quad
+        extql   t3, a1, t3              // extract applicable bytes from t3
+        extqh   t4, a1, v0              // extract applicable bytes from t4
+        stq     t2, 8(a0)               // store quad #2
+        bis     t3, v0, t3              // t3 = quad #3
+        ldq_u   t5, 31(a1)              // t5 = fifth unaligned quad
+        extql   t4, a1, t4              // extract applicable bytes from t4
+        extqh   t5, a1, v0              // extract applicable bytes from t5
+        stq     t3, 16(a0)              // store quad #3
+        bis     t4, v0, t4              // t4 = quad #4
+        ldq_u   a3, 39(a1)              // a3 = sixth unaligned quad
+        extql   t5, a1, t5              // extract applicable bytes from t5
+        extqh   a3, a1, v0              // extract applicable bytes from a3
+        stq     t4, 24(a0)              // store quad #4
+        bis     t5, v0, t5              // t5 = quad #5
+        ldq_u   a4, 47(a1)              // a4 = seventh unaligned quad
+        extql   a3, a1, a3              // extract applicable bytes from a3
+        extqh   a4, a1, v0              // extract applicable bytes from a4
+        stq     t5, 32(a0)              // store quad #5
+        bis     a3, v0, a3              // a3 = quad #6
+        ldq_u   a5, 55(a1)              // a5 = eighth unaligned quad
+        extql   a4, a1, a4              // extract applicable bytes from a4
+        extqh   a5, a1, v0              // extract applicable bytes from a5
+        stq     a3, 40(a0)              // store quad #6
+        bis     a4, v0, a4              // a4 = quad #7
+        ldq_u   t1, 63(a1)              // t1 = ninth unaligned = 1st of next
+        extql   a5, a1, a5              // extract applicable bytes from a5
+        extqh   t1, a1, v0              // extract applicable bytes from t1
+        stq     a4, 48(a0)              // store quad #7
+        bis     a5, v0, a5              // a5 = quad #8
+        addq    a1, 64, a1              // increment source pointer
+        stq     a5, 56(a0)              // store quad #8
+        addq    a0, 64, a0              // increment destination pointer
+        subq    t0, 1, t0               // decrement number of blocks
+        bne     t0, 110b                // if ne, more blocks to move
+
+//
+// Move unaligned source quads to aligned destination quads
+//
+
+120:
+        srl     a2, 3, t0               // t0 = number of quads to move
+        beq     t0, 140f                // if eq no quads to move
+        and     a2, 8-1, a2             // a2 = residual bytes
+
+        ldq_u   t1, 0(a1)               // t1 = first unaligned quad
+130:
+        ldq_u   t2, 7(a1)               // t2 = second unaligned quad
+        addq    a0, 8, a0               // increment destination pointer
+        extql   t1, a1, t1              // extract applicable bytes from t1
+        extqh   t2, a1, v0              // extract applicable bytes from t2
+        bis     t1, v0, t1              // t1 = quadword of data
+        stq     t1, -8(a0)              // store data to destination
+        addq    a1, 8, a1               // increment source pointer
+        subq    t0, 1, t0               // decrement quads to move
+        bis     t2, zero, t1            // t1 = first of next unaligned pair
+        bne     t0, 130b                // if ne, more quads to move
+
+//
+// Move remaining bytes to final quadword
+//
+
+140:
+        beq     a2, 160f                // if eq no more bytes to move
+        ldq     t2, 0(a0)               // t2 = destination quadword
+        bis     zero, zero, t3          // t3 = position for next insertion
+
+150:
+        ldq_u   t1, 0(a1)               // get unaligned source quad
+        extbl   t1, a1, t1              // t1 = source byte
+        insbl   t1, t3, t1              // t1 = source byte, in position
+        mskbl   t2, t3, t2              // clear byte in destination
+        bis     t2, t1, t2              // merge in source byte
+        addq    a1, 1, a1               // increment source pointer
+        subq    a2, 1, a2               // decrement bytes to move
+        addq    t3, 1, t3               // increment destination position
+        bne     a2, 150b                // more bytes to move
+
+        stq     t2, 0(a0)               // store merged data
+
+//
+// Finish unaligned MoveForward
+//
+
+160:
+        ret     zero, (ra)              // return
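Before the backward path below, it helps to restate the dispatch at the top of RtlMoveMemory: a backward copy is required only when the source lies below the destination and the two regions overlap; in every other case the forward loops above are safe. A C sketch of that decision (a model of the entry test, not the shipped code):

    #include <stddef.h>
    #include <stdint.h>

    /* Direction dispatch used at RtlMoveMemory entry (model). */
    static void move_memory_model(void *dst, const void *src, size_t n)
    {
        unsigned char *d = (unsigned char *)dst;
        const unsigned char *s = (const unsigned char *)src;

        if ((uintptr_t)s < (uintptr_t)d && (uintptr_t)s + n > (uintptr_t)d) {
            /* Overlap with source below destination: copy the highest byte
               first so no source byte is overwritten before it is moved. */
            for (size_t i = n; i > 0; i -= 1)
                d[i - 1] = s[i - 1];
        } else {
            /* No hazardous overlap: plain forward copy. */
            for (size_t i = 0; i < n; i += 1)
                d[i] = s[i];
        }
    }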
+//
+// Move memory backward.
+//
+
+MoveBackward:                           //
+
+        addq    a0, a2, a0              // compute ending destination address
+        addq    a1, a2, a1              // compute ending source address
+        subq    a0, 1, a0               // point to last destination byte
+        subq    a1, 1, a1               // point to last source byte
+        xor     a0, a1, t0              // compare alignment bits
+        and     t0, 0x7, t0             // isolate alignment comparison
+        bne     t0, MoveBackwardUnaligned // if ne, incompatible alignment
+
+//
+// Move memory backward aligned.
+//
+
+MoveBackwardAligned:                    //
+
+//
+// Move bytes until source and destination are quadword aligned
+//
+
+        and     a0, 0x7, t0             // t0 = unaligned bits
+        cmpeq   t0, 7, t1               // last byte position 7?
+        beq     t1, 5f                  // if eq [false], not quad aligned
+        subq    a0, 7, a0               // point to beginning of last quad
+        subq    a1, 7, a1               // point to beginning of last quad
+        br      zero, 30f               // predicted taken
+
+5:
+        ldq_u   t1, 0(a0)               // get unaligned quad from dest.
+        ldq_u   t2, 0(a1)               // get unaligned quad from source
+
+10:
+        beq     a2, 20f                 // if eq, all bytes moved
+        extbl   t2, t0, t3              // t3 = byte from source
+        insbl   t3, t0, t3              // t3 = byte from source, in position
+        mskbl   t1, t0, t1              // clear position in destination
+        bis     t1, t3, t1              // merge in byte from source
+        subq    a2, 1, a2               // decrement bytes to move
+        subq    t0, 1, t0               // decrement byte within quadword
+        cmplt   t0, zero, t3            // finished the quadword?
+        beq     t3, 10b                 // if eq [false], do next byte
+
+20:
+        stq_u   t1, 0(a0)               // store merged destination bytes
+
+        subq    a0, 8, a0               // move to previous quadword
+        bic     a0, 7, a0               // aligned quadword
+
+        subq    a1, 8, a1               // move to previous quadword
+        bic     a1, 7, a1               // aligned quadword
+
+//
+// Check for 64-byte block moves
+//
+
+30:
+        srl     a2, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 50f                 // if eq, no blocks to move
+        and     a2, 64-1, a2            // a2 = residual bytes
+
+40:
+        ldq     t1, 0(a1)               // load 64 bytes from source into
+        subq    a0, 64, a0              // decrement destination pointer
+        ldq     v0, -56(a1)             //
+        ldq     a3, -32(a1)             //
+        stq     t1, 64(a0)              // write to destination
+        ldq     t2, -8(a1)              // into volatile registers
+        ldq     a5, -48(a1)             //
+        ldq     a4, -40(a1)             //
+        stq     t2, 56(a0)              //
+        ldq     t3, -16(a1)             //
+        ldq     t4, -24(a1)             //
+        subq    a1, 64, a1              // decrement source pointer
+        stq     t3, 48(a0)              //
+        stq     t4, 40(a0)              //
+        stq     a3, 32(a0)              //
+        subq    t0, 1, t0               // decrement number of blocks
+        stq     a4, 24(a0)              //
+        stq     a5, 16(a0)              //
+        stq     v0, 8(a0)               //
+        bne     t0, 40b                 // if ne, more blocks to copy
+
+//
+// Copy quadwords
+//
+
+50:
+        srl     a2, 3, t0               // t0 = number of quadwords to move
+        beq     t0, 70f                 // if eq no quadwords to move
+        and     a2, 8-1, a2             // a2 = residual bytes
+
+60:
+        ldq     t1, 0(a1)               // load quadword from source
+        subq    a1, 8, a1               // decrement source pointer
+        stq     t1, 0(a0)               // store quadword to destination
+        subq    a0, 8, a0               // decrement destination pointer
+        subq    t0, 1, t0               // decrement quadwords to move
+        bne     t0, 60b                 // if ne, more quadwords to move
+
+//
+// Move final residual bytes
+//
+
+70:
+        beq     a2, 90f                 // if eq, no more bytes to move
+        ldq     t1, 0(a1)               // get last source quadword
+        ldq     t2, 0(a0)               // get last destination quadword
+        bis     zero, 7, t0             // t0 = next byte number to move
+
+80:
+        extbl   t1, t0, t3              // extract byte from source
+        insbl   t3, t0, t3              // t3 = source byte, in position
+        mskbl   t2, t0, t2              // clear byte position for dest.
+        bis     t2, t3, t2              // merge in source byte
+        subq    t0, 1, t0               // decrement byte position
+        subq    a2, 1, a2               // decrement bytes to move
+        bne     a2, 80b                 // if ne, more bytes to move
+
+        stq     t2, 0(a0)               // write destination data
+//
+// Finish aligned MoveBackward
+//
+
+90:
+        ret     zero, (ra)              // return
+
+//
+// Move memory backward unaligned.
+//
+
+MoveBackwardUnaligned:                  //
+
+//
+// Move bytes until the destination is aligned
+//
+
+        and     a0, 0x7, t0             // t0 = unaligned bits
+        cmpeq   t0, 7, t1               // last byte of a quadword?
+        beq     t1, 95f                 // if eq[false], not aligned
+        subq    a0, 7, a0               // align pointer to beginning of quad
+        br      zero, 120f              //
+
+95:
+        ldq_u   t2, 0(a0)               // get unaligned quad from dest.
+
+100:
+        beq     a2, 110f                // if eq, no more bytes to move
+        ldq_u   t1, 0(a1)               // get unaligned quad from source
+        extbl   t1, a1, t1              // extract source byte
+        insbl   t1, t0, t1              // t1 = source byte in position
+        mskbl   t2, t0, t2              // clear byte position in dest.
+        bis     t2, t1, t2              // merge source byte
+        subq    t0, 1, t0               // decrement byte position
+        subq    a1, 1, a1               // decrement source pointer
+        subq    a2, 1, a2               // decrement number of bytes to move
+        cmplt   t0, zero, t3            // t0 < 0? => quad finished
+        beq     t3, 100b                // if eq [false], more bytes to move
+
+110:
+        stq_u   t2, 0(a0)               // store merged quadword
+
+        subq    a0, 8, a0               // decrement dest. to previous quad
+        bic     a0, 7, a0               // align previous quadword
+
+//
+// Check for 64-byte blocks to move
+//
+
+120:
+        srl     a2, 6, t0               // t0 = number of blocks to move
+        subq    a1, 7, a1               // point to beginning of last quad
+        beq     t0, 140f                // if eq no blocks to move
+        and     a2, 64-1, a2            // a2 = residual bytes to move
+
+        ldq_u   t1, 7(a1)               // t1 = first unaligned quad
+
+130:
+                                        // get source data and merge it
+                                        // as we go
+        ldq_u   t2, 0(a1)               // t2 = second unaligned quad
+        extqh   t1, a1, t1              // extract applicable bytes from t1
+        extql   t2, a1, v0              // extract applicable bytes from t2
+        bis     t1, v0, t1              // t1 = quad #1
+        ldq_u   t3, -8(a1)              // t3 = third unaligned quad
+        extqh   t2, a1, t2              // extract applicable bytes from t2
+        extql   t3, a1, v0              // extract applicable bytes from t3
+        stq     t1, 0(a0)               // store quad #1
+        bis     t2, v0, t2              // t2 = quad #2
+        ldq_u   t4, -16(a1)             // t4 = fourth unaligned quad
+        extqh   t3, a1, t3              // extract applicable bytes from t3
+        extql   t4, a1, v0              // extract applicable bytes from t4
+        stq     t2, -8(a0)              // store quad #2
+        bis     t3, v0, t3              // t3 = quad #3
+        ldq_u   t5, -24(a1)             // t5 = fifth unaligned quad
+        extqh   t4, a1, t4              // extract applicable bytes from t4
+        extql   t5, a1, v0              // extract applicable bytes from t5
+        stq     t3, -16(a0)             // store quad #3
+        bis     t4, v0, t4              // t4 = quad #4
+        ldq_u   a3, -32(a1)             // a3 = sixth unaligned quad
+        extqh   t5, a1, t5              // extract applicable bytes from t5
+        extql   a3, a1, v0              // extract applicable bytes from a3
+        stq     t4, -24(a0)             // store quad #4
+        bis     t5, v0, t5              // t5 = quad #5
+        ldq_u   a4, -40(a1)             // a4 = seventh unaligned quad
+        extqh   a3, a1, a3              // extract applicable bytes from a3
+        extql   a4, a1, v0              // extract applicable bytes from a4
+        stq     t5, -32(a0)             // store quad #5
+        bis     a3, v0, a3              // a3 = quad #6
+        ldq_u   a5, -48(a1)             // a5 = eighth unaligned quad
+        extqh   a4, a1, a4              // extract applicable bytes from a4
+        extql   a5, a1, v0              // extract applicable bytes from a5
+        stq     a3, -40(a0)             // store quad #6
+        bis     a4, v0, a4              // a4 = quad #7
+        ldq_u   t1, -56(a1)             // t1 = ninth unaligned = 1st of next
+        extqh   a5, a1, a5              // extract applicable bytes from a5
+        extql   t1, a1, v0              // extract applicable bytes from t1
+        stq     a4, -48(a0)             // store quad #7
+        bis     a5, v0, a5              // a5 = quad #8
+        subq    a1, 64, a1              // decrement source pointer
+        stq     a5, -56(a0)             // store quad #8
+        subq    a0, 64, a0              // decrement destination pointer
+        subq    t0, 1, t0               // decrement number of blocks
+        bne     t0, 130b                // if ne, more blocks to move
+
+//
+// Move unaligned source quads to aligned destination quads
+//
+
+140:
+        srl     a2, 3, t0               // t0 = number of quads to move
+        beq     t0, 160f                // if eq no quads to move
+        and     a2, 8-1, a2             // a2 = residual bytes
+
+        ldq_u   t1, 7(a1)               // t1 = first unaligned quad
+
+150:
+        ldq_u   t2, 0(a1)               // t2 = second unaligned quad
+        subq    a0, 8, a0               // decrement destination pointer
+        extqh   t1, a1, t1              // extract applicable bytes from t1
+        extql   t2, a1, v0              // extract applicable bytes from t2
+        bis     t1, v0, t1              // t1 = quadword of data
+        stq     t1, 8(a0)               // store data to destination
+        subq    a1, 8, a1               // decrement source pointer
+        subq    t0, 1, t0               // decrement quads to move
+        bis     t2, zero, t1            // t1 = first of next unaligned pair
+        bne     t0, 150b                // if ne, more quads to move
+
+//
+// Move remaining bytes to final quadword
+//
+
+160:
+        beq     a2, 180f                // if eq, no more bytes to move
+        ldq     t2, 0(a0)               // t2 = destination quadword
+        bis     zero, 7, t0             // t0 = position for next insertion
+
+170:
+        subq    a1, 1, a1               // decrement source pointer
+        ldq_u   t1, 8(a1)               // get unaligned source quad
+        extbl   t1, a1, t1              // t1 = source byte
+        insbl   t1, t0, t1              // t1 = source byte, in position
+        mskbl   t2, t0, t2              // clear byte position
+        bis     t2, t1, t2              // merge in source byte
+        subq    t0, 1, t0               // decrement byte position for dest.
+        subq    a2, 1, a2               // decrement bytes to move
+        bne     a2, 170b                // if ne, more bytes to move
+
+        stq     t2, 0(a0)               //
+
+//
+// Finish unaligned MoveBackward
+//
+
+180:
+        ret     zero, (ra)              // return
+
+        .end    RtlMoveMemory
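The zero and fill routines below widen the fill byte into a full quadword (RtlFillMemoryUlong widens a longword the same way) so the inner loops can store eight bytes per instruction. The replication step in C (a sketch of the shift/or doubling done at the RtlFillMemory entry):

    #include <stdint.h>

    /* Widen a fill byte to a 64-bit pattern in three doublings, as the
       sll/bis sequence at the RtlFillMemory entry does. */
    static uint64_t replicate_byte(uint8_t fill)
    {
        uint64_t pattern = fill;

        pattern |= pattern << 8;        /* generate fill word     */
        pattern |= pattern << 16;       /* generate fill longword */
        pattern |= pattern << 32;       /* generate fill quadword */
        return pattern;                 /* 0xAB -> 0xABABABABABABABAB */
    }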
+        SBTTL("Zero Memory")
+//++
+//
+// VOID
+// RtlZeroMemory (
+//    IN PVOID Destination,
+//    IN ULONG Length
+//    )
+//
+// Routine Description:
+//
+//    This function zeros memory by first aligning the destination address to
+//    a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
+//    blocks, followed by any remaining bytes.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to zero.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        LEAF_ENTRY(RtlZeroMemory)
+
+        bis     zero, zero, a2          // set fill pattern
+        br      zero, RtlpFillMemory    //
+
+        SBTTL("Fill Memory")
+//++
+//
+// VOID
+// RtlFillMemory (
+//    IN PVOID Destination,
+//    IN ULONG Length,
+//    IN UCHAR Fill
+//    )
+//
+// Routine Description:
+//
+//    This function fills memory by first aligning the destination address to
+//    a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
+//    blocks, followed by any remaining bytes.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to fill.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
+//
+//    Fill (a2) - Supplies the fill byte.
+//
+//    N.B. The alternate entry memset expects the length and fill arguments
+//         to be reversed. It also returns the Destination pointer.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        ALTERNATE_ENTRY(memset)
+
+        bis     a0, zero, v0            // set return value
+        bis     a1, zero, a3            // swap length and fill arguments
+        bis     a2, zero, a1            //
+        bis     a3, zero, a2            //
+
+        ALTERNATE_ENTRY(RtlFillMemory)
+
+        and     a2, 0xff, a2            // clear excess bits
+        sll     a2, 8, t0               // duplicate fill byte
+        bis     a2, t0, a2              // generate fill word
+        sll     a2, 16, t0              // duplicate fill word
+        bis     a2, t0, a2              // generate fill longword
+        sll     a2, 32, t0              // duplicate fill longword
+        bis     a2, t0, a2              // generate fill quadword
+
+.align 3                                // ensure quadword aligned target
+//
+// Fill memory with the pattern specified in register a2.
+//
+
+RtlpFillMemory:                         //
+
+//
+// Align destination to quadword
+//
+
+        beq     a1, 80f                 // anything to fill? (paranoia)
+        and     a0, 8-1, t0             // t0 = unaligned bits
+        bne     t0, 5f                  // if ne, then not quad aligned
+        br      zero, 20f               // if eq, then quad aligned
+
+5:
+        ldq_u   t1, 0(a0)               // get unaligned quadword
+                                        // for first group of bytes
+10:
+        beq     a1, 15f                 // if eq no more bytes to fill
+        insbl   a2, t0, t2              // get fill byte into position
+        mskbl   t1, t0, t1              // clear byte for fill
+        bis     t1, t2, t1              // put in fill byte
+        addq    t0, 1, t0               // increment to next byte position
+        subq    a1, 1, a1               // decrement bytes to fill
+        cmpeq   t0, 8, t2               // t0 = 8?
+        beq     t2, 10b                 // if eq [false] more bytes to do
+
+15:
+        stq_u   t1, 0(a0)               // store modified bytes
+        addq    a0, 7, a0               // move a0 to next quadword
+        bic     a0, 7, a0               // align a0 to quadword
+
+//
+// Check for 64-byte blocks
+//
+
+20:
+        srl     a1, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 40f                 // if eq then no 64 byte blocks
+        and     a1, 64-1, a1            // a1 = residual bytes to fill
+
+30:
+        stq     a2, 0(a0)               // store 64 bytes
+        stq     a2, 8(a0)               //
+        stq     a2, 16(a0)              //
+        stq     a2, 24(a0)              //
+        stq     a2, 32(a0)              //
+        stq     a2, 40(a0)              //
+        stq     a2, 48(a0)              //
+        stq     a2, 56(a0)              //
+
+        subq    t0, 1, t0               // decrement blocks remaining
+        addq    a0, 64, a0              // increment destination pointer
+        bne     t0, 30b                 // more blocks to write
+
+//
+// Fill aligned quadwords
+//
+
+40:
+        srl     a1, 3, t0               // t0 = number of quadwords
+        bne     t0, 55f                 // if ne quadwords left to fill
+        br      zero, 60f               // if eq no quadwords left
+
+55:
+        and     a1, 8-1, a1             // a1 = residual bytes to fill
+
+50:
+        stq     a2, 0(a0)               // store quadword
+        subq    t0, 1, t0               // decrement quadwords remaining
+        addq    a0, 8, a0               // next quadword
+        bne     t0, 50b                 // more quadwords to write
+
+//
+// Fill bytes for last quadword
+//
+
+60:
+        bne     a1, 65f                 // if ne bytes remain to be filled
+        br      zero, 80f               // if eq no more bytes to fill
+
+65:
+        ldq     t1, 0(a0)               // get last quadword
+        bis     zero, zero, t0          // t0 = byte position to start fill
+
+70:
+        beq     a1, 75f                 // if eq, no more bytes to fill
+        insbl   a2, t0, t2              // get fill byte into position
+        mskbl   t1, t0, t1              // clear fill byte position
+        bis     t1, t2, t1              // insert fill byte
+        addq    t0, 1, t0               // increment byte within quad
+        subq    a1, 1, a1               // decrement bytes to fill
+        cmpeq   t0, 8, t3               // t0 = 8? => finished quad
+        beq     t3, 70b                 // if eq [false] more bytes to fill
+
+75:
+        stq     t1, 0(a0)               // write merged quadword
+
+//
+// Finish up
+//
+
+80:
+        ret     zero, (ra)              // return
+
+        .end    RtlZeroMemory
+
+        SBTTL("Fill Memory Ulong")
+//++
+//
+// VOID
+// RtlFillMemoryUlong (
+//    IN PVOID Destination,
+//    IN ULONG Length,
+//    IN ULONG Pattern
+//    )
+//
+// Routine Description:
+//
+//    This function fills memory with the specified longword pattern by
+//    filling 64-byte blocks followed by 8-byte blocks and finally
+//    4-byte blocks.
+//
+//    N.B. This routine assumes that the destination address is aligned
+//         on a longword boundary and that the length is an even multiple
+//         of longwords.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to fill.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
+//
+//    Pattern (a2) - Supplies the fill pattern.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        LEAF_ENTRY(RtlFillMemoryUlong)
+
+        bic     a1, 3, a1               // make sure length is an even number
+                                        // of longwords
+        sll     a2, 32, a3              // a3 = long pattern in upper 32 bits
+        srl     a3, 32, t0              // clear upper bits, pattern in lower 32
+        bis     a3, t0, a3              // a3 = quad version of fill pattern
+
+//
+// Make destination address quad-aligned
+//
+
+        and     a0, 4, t0               // is a0 quad aligned?
+        beq     t0, 10f                 // if eq, then a0 quad aligned
+        stl     a2, 0(a0)               // fill first longword
+        addq    a0, 4, a0               // quad align a0
+        subq    a1, 4, a1               // bytes remaining to store
+
+//
+// Check for 64-byte blocks to fill
+//
+
+10:
+        srl     a1, 6, t0               // t0 = # 64-byte blocks to fill
+        beq     t0, 30f                 // if eq no 64 byte blocks
+        and     a1, 64-1, a1            // a1 = residual bytes
+
+20:
+        stq     a3, 0(a0)               // store 64 bytes
+        stq     a3, 8(a0)               //
+        stq     a3, 16(a0)              //
+        stq     a3, 24(a0)              //
+        stq     a3, 32(a0)              //
+        stq     a3, 40(a0)              //
+        stq     a3, 48(a0)              //
+        stq     a3, 56(a0)              //
+        subq    t0, 1, t0               // t0 = blocks remaining
+        addq    a0, 64, a0              // increment address pointer
+        bne     t0, 20b                 // if ne more blocks to fill
+
+//
+// Fill 8 bytes at a time while we can, a1 = bytes remaining
+//
+
+30:
+        srl     a1, 3, t0               // t0 = # quadwords to fill
+        beq     t0, 50f                 // if eq no quadwords left
+        and     a1, 8-1, a1             // a1 = residual bytes
+40:
+        stq     a3, 0(a0)               // store quadword
+        subq    t0, 1, t0               // t0 = quadwords remaining
+        addq    a0, 8, a0               // increment address pointer
+        bne     t0, 40b                 // if ne more quadwords to fill
+
+//
+// Fill last 4 bytes
+//
+
+50:
+        beq     a1, 60f                 // if eq no longwords remain
+        stl     a2, 0(a0)               // fill last longword
+
+//
+// Finish up
+//
+
+60:
+        ret     zero, (ra)              // return to caller
+
+        .end    RtlFillMemoryUlong
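RtlCopyBytes, next, differs from RtlMoveMemory only at the ragged head and tail of the buffer: instead of a plain read-modify-write of the bordering destination quadwords, it builds a byte mask and merges under LDQ_L/STQ_C so that neighboring bytes owned by other code are never transiently overwritten. The mask arithmetic in C (a model; the zap/zapnot instructions apply such byte masks in hardware):

    #include <stdint.h>

    /* Mask selecting the low n bytes of a quadword, 1 <= n <= 7, built the
       way RtlCopyBytes does at its tail: shift all-ones down, then back up. */
    static uint64_t low_bytes_mask(unsigned n)
    {
        uint64_t high = ~(uint64_t)0;   /* mov -1, t1    */
        unsigned bits = n * 8;          /* sll t0, 3, t0 */

        high = (high >> bits) << bits;  /* srl then sll: low n bytes cleared */
        return ~high;                   /* complement: low n bytes selected  */
    }

    /* The merge step: take the masked bytes from src, the rest from dst. */
    static uint64_t merge_bytes(uint64_t dst, uint64_t src, uint64_t mask)
    {
        return (dst & ~mask) | (src & mask);
    }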
Move +// bytes until done or source and destination are quadword aligned +// + + and a0, 0x7, t0 // t0 = unaligned bits + bne t0, 5f // if ne, not quad aligned + br zero, 20f // predicted taken +5: + bis zero, zero, t1 // t4 = destination byte zap mask + bis zero, 1, t2 + sll t2, t0, t2 // t2 = next bit to set in zap mask +10: + beq a2, 15f // if eq, all bits set + bis t1, t2, t1 // set bit in zap mask + sll t2, 1, t2 // set next higher bit for zap mask + subq a2, 1, a2 // decrement bytes to move + addq t0, 1, t0 // increment byte within quad + cmpeq t0, 8, t3 // finished the quadword? + beq t3, 10b // if eq [false], do next byte +15: + ldq_u t2, 0(a1) // get unaligned quadword from source + zapnot t2, t1, t2 // clear source bytes + bic a0, 7, a3 // a3 = quadword base of destination +retry1: + ldq_l t0, 0(a3) // load destination quadword + zap t0, t1, t0 // clear destination bytes + or t0, t2, t0 // merge in bytes from source + stq_c t0, 0(a3) // store merged quadword conditional + beq t0, retry1f // if eq, retry failed interlock + + addq a0, 7, a0 // move to next quadword + bic a0, 7, a0 // aligned quadword + + addq a1, 7, a1 // move to next quadword + bic a1, 7, a1 // aligned quadword + +// +// Check for 64-byte block moves +// + +20: + srl a2, 6, t0 // t0 = number of 64 byte blocks + beq t0, 40f // if eq no blocks to move + and a2, 64-1, a2 // a2 = residual bytes + +30: + ldq t1, 0(a1) // load 64 bytes from source + addq a0, 64, a0 // increment destination pointer + ldq v0, 56(a1) // + ldq a3, 32(a1) // + stq t1, -64(a0) // write to destination + ldq t2, 8(a1) // into volatile registers + ldq t3, 16(a1) // + ldq t4, 24(a1) // + subq t0, 1, t0 // decrement number of blocks + stq t2, -56(a0) // + ldq a4, 40(a1) // + stq t3, -48(a0) // + ldq a5, 48(a1) // + stq t4, -40(a0) // + addq a1, 64, a1 // increment source pointer + stq a3, -32(a0) // + stq a4, -24(a0) // + stq a5, -16(a0) // + stq v0, -8(a0) // + bne t0, 30b // if ne, more blocks to copy + +// +// Copy quadwords +// + +40: + srl a2, 3, t0 // t0 = number of quadwords to move + beq t0, 60f // if eq no quadwords to move + and a2, 8-1, a2 // a2 = residual bytes + +50: + ldq t1, 0(a1) // load quadword from source + addq a1, 8, a1 // increment source pointer + stq t1, 0(a0) // store quadword to destination + addq a0, 8, a0 // increment destination pointer + subq t0, 1, t0 // decrement number of quadwords + bne t0, 50b // if ne, more quadwords to move + +// +// Move final residual bytes +// + +60: + beq a2, 80f // if eq, no more bytes to move + mov a2, t0 // t0 = number of bytes to move + mov -1, t1 // t1 = bit mask + sll t0, 3, t0 // # of bytes to # of bits + srl t1, t0, t1 // clear t0 bits + sll t1, t0, t0 // move it back + ldq t1, 0(a1) // get last source quadword + bic t1, t0, t1 // clear bytes not copied + not t0, t0 // complement to clear destination +retry2: + ldq_l t2, 0(a0) // get last destination quadword locked + bic t2, t0, t2 // clear bytes to be copied + bis t2, t1, t2 // move bytes from source + stq_c t2, 0(a0) // store merged quadword conditional + beq t2, retry2f // if eq, retry failed interlock + +// +// Finish aligned MoveForward +// + +80: + ret zero, (ra) // return + +// +// Move memory forward unaligned. 
+// + +CopyForwardUnaligned: // + +// +// Move bytes until the destination is aligned +// + + and a0, 0x7, t0 // t0 = unaligned bits + beq t0, 100f // if eq, destination quad aligned + bis zero, zero, t1 // t4 = destination byte zap mask + bis zero, 1, t2 + sll t2, t0, t2 // t2 = next bit to set in zap mask + mov zero, t4 // assemble destination bytes here +90: + beq a2, 95f // if eq no more bytes to move + bis t1, t2, t1 // set bit in zap mask + sll t2, 1, t2 // set next higher bit for zap mask + ldq_u t5, 0(a1) // get unaligned quad from source + extbl t5, a1, t5 // extract source byte + insbl t5, t0, t5 // t5 = source byte, in position + or t4, t5, t4 // merge in source byte + addq t0, 1, t0 // increment byte position + addq a1, 1, a1 // increment source pointer + subq a2, 1, a2 // decrement bytes to move + cmpeq t0, 8, t3 // t0 = 8? => quad finished + beq t3, 90b // if eq [false], more bytes to move +95: + bic a0, 0x7, a3 // a3 = quadword base of destination +retry3: + ldq_l t0, 0(a3) // load destination quadword + zap t0, t1, t0 // clear destination bytes + or t0, t4, t0 // merge in bytes from source + stq_c t0, 0(a3) // store merged quadword conditional + beq t0, retry3f // if eq, retry failed interlock + + addq a0, 7, a0 // increment to next quad + bic a0, 7, a0 // align next quadword + +// +// Check for 64-byte blocks to move +// + +100: + srl a2, 6, t0 // t0 = number of blocks to move + beq t0, 120f // if eq no blocks to move + and a2, 64-1, a2 // a2 = residual bytes to move + + ldq_u t1, 0(a1) // t1 = first unaligned quad +110: + // get source data and merge it + // as we go + ldq_u t2, 7(a1) // t2 = second unaligned quad + extql t1, a1, t1 // extract applicable bytes from t1 + extqh t2, a1, v0 // extract applicable bytes from t2 + bis t1, v0, t1 // t1 = quad #1 + ldq_u t3, 15(a1) // t3 = third unaligned quad + extql t2, a1, t2 // extract applicable bytes from t2 + extqh t3, a1, v0 // extract applicable bytes from t3 + stq t1, 0(a0) // store quad #1 + bis t2, v0, t2 // t2 = quad #2 + ldq_u t4, 23(a1) // t4 = fourth unaligned quad + extql t3, a1, t3 // extract applicable bytes from t3 + extqh t4, a1, v0 // extract applicable bytes from t4 + stq t2, 8(a0) // store quad #2 + bis t3, v0, t3 // t3 = quad #3 + ldq_u t5, 31(a1) // t5 = fifth unaligned quad + extql t4, a1, t4 // extract applicable bytes from t4 + extqh t5, a1, v0 // extract applicable bytes from t5 + stq t3, 16(a0) // store quad #3 + bis t4, v0, t4 // t4 = quad #4 + ldq_u a3, 39(a1) // a3 = sixth unaligned quad + extql t5, a1, t5 // extract applicable bytes from t5 + extqh a3, a1, v0 // extract applicable bytes from a3 + stq t4, 24(a0) // store quad #4 + bis t5, v0, t5 // t5 = quad #5 + ldq_u a4, 47(a1) // a4 = seventh unaligned quad + extql a3, a1, a3 // extract applicable bytes from a3 + extqh a4, a1, v0 // extract applicable bytes from a4 + stq t5, 32(a0) // store quad #5 + bis a3, v0, a3 // a3 = quad #6 + ldq_u a5, 55(a1) // a5 = eighth unaligned quad + extql a4, a1, a4 // extract applicable bytes from a4 + extqh a5, a1, v0 // extract applicable bytes from a5 + stq a3, 40(a0) // store quad #6 + bis a4, v0, a4 // a4 = quad #7 + ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next + extql a5, a1, a5 // extract applicable bytes from a5 + extqh t1, a1, v0 // extract applicable bytes from t1 + stq a4, 48(a0) // store quad #7 + bis a5, v0, a5 // a5 = quad #8 + addq a1, 64, a1 // increment source pointer + stq a5, 56(a0) // store quad #8 + addq a0, 64, a0 // increment destination pointer + subq t0, 1, t0 // decrement 
number of blocks + bne t0, 110b // if ne, more blocks to move + +// +// Move unaligned source quads to aligned destination quads +// + +120: + srl a2, 3, t0 // t0 = number of quads to move + beq t0, 140f // if eq no quads to move + and a2, 8-1, a2 // a2 = residual bytes + + + ldq_u t1, 0(a1) // t1 = first unaligned quad +130: + ldq_u t2, 7(a1) // t2 = second unaligned quad + addq a0, 8, a0 // increment destination pointer + extql t1, a1, t1 // extract applicable bytes from t1 + extqh t2, a1, v0 // extract applicable bytes from t2 + bis t1, v0, t1 // t1 = quadword of data + stq t1, -8(a0) // store data to destination + addq a1, 8, a1 // increment source pointer + subq t0, 1, t0 // decrement quads to move + bis t2, zero, t1 // t1 = first of next unaligned pair + bne t0, 130b // if ne, more quads to move + +// +// Move remaining bytes to final quadword +// + +140: + beq a2, 160f // if eq no more bytes to move + + mov zero, t3 // t3 = position for next insertion + mov zero, t4 // assemble destination bytes here + mov a2, t0 // t0 = number of bytes to move + mov -1, t1 // t1 = bit mask + sll t0, 3, t0 // # of bytes to # of bits + srl t1, t0, t1 // clear t0 bits + sll t1, t0, t0 // move it back + not t0, t0 // complement for destination clear mask +150: + ldq_u t1, 0(a1) // get unaligned source quad + extbl t1, a1, t1 // t1 = source byte + insbl t1, t3, t1 // t1 = source byte, in position + bis t4, t1, t4 // merge in source byte + addq a1, 1, a1 // increment source pointer + subq a2, 1, a2 // decrement bytes to move + addq t3, 1, t3 // increment destination position + bne a2, 150b // more bytes to move +retry4: + ldq_l t2, 0(a0) // get last destination quadword locked + bic t2, t0, t2 // clear bytes to be copied + bis t2, t4, t2 // move bytes from source + stq_c t2, 0(a0) // store merged quadword conditional + beq t2, retry4f // if eq, retry failed interlock + +// +// Finish unaligned MoveForward +// + +160: + ret zero, (ra) // return + +// +// Out of line branches for failed store conditional. +// Don't need to restore anything, just try again. +// + +retry1f: + br retry1 +retry2f: + br retry2 +retry3f: + br retry3 +retry4f: + br retry4 + + .end RtlCopyBytes + + SBTTL("Zero Bytes") +//++ +// +// VOID +// RtlZeroBytes ( +// IN PVOID Destination, +// IN ULONG Length +// ) +// +// Routine Description: +// +// This function zeros memory by first aligning the destination address to +// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte +// blocks, followed by any remaining bytes. Unlike RtlZeroMemory the copy is +// done such that byte granularity is assured for all platforms. +// +// Arguments: +// +// Destination (a0) - Supplies a pointer to the memory to zero. +// +// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed. +// +// Return Value: +// +// None. +// +//-- + + LEAF_ENTRY(RtlZeroBytes) + + bis zero, zero, a2 // set fill pattern + br zero, RtlpFillBytes // + + + SBTTL("Fill Bytes") +//++ +// +// VOID +// RtlFillBytes ( +// IN PVOID Destination, +// IN ULONG Length, +// IN UCHAR Fill +// ) +// +// Routine Description: +// +// This function fills memory by first aligning the destination address to +// a longword boundary, and then filling 32-byte blocks, followed by 4-byte +// blocks, followed by any remaining bytes. Unlike RtlFillMemory the copy is +// done such that byte granularity is assured for all platforms. +// +// Arguments: +// +// Destination (a0) - Supplies a pointer to the memory to fill. 
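The retry1..retry4 stubs above (and retry5/retry6 below) are the Alpha load-locked/store-conditional pattern: if any other agent writes the quadword between LDQ_L and STQ_C, the conditional store fails and the merge is simply redone, which is what makes the byte-granularity guarantee hold on multiprocessors. A C11 sketch of the same guarantee expressed with compare-and-swap (an analogy to, not a transliteration of, the Alpha sequence):

    #include <stdatomic.h>
    #include <stdint.h>

    /* Store only the bytes selected by mask into a shared quadword, redoing
       the merge if the word changed underneath us (ldq_l/stq_c analogue). */
    static void store_bytes_atomic(_Atomic uint64_t *quad, uint64_t src, uint64_t mask)
    {
        uint64_t old = atomic_load(quad);               /* ldq_l            */
        uint64_t merged;

        do {
            merged = (old & ~mask) | (src & mask);      /* zap + or merge   */
        } while (!atomic_compare_exchange_weak(quad, &old, merged));
                                                        /* stq_c; beq retry */
    }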
+        SBTTL("Zero Bytes")
+//++
+//
+// VOID
+// RtlZeroBytes (
+//    IN PVOID Destination,
+//    IN ULONG Length
+//    )
+//
+// Routine Description:
+//
+//    This function zeros memory by first aligning the destination address to
+//    a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
+//    blocks, followed by any remaining bytes. Unlike RtlZeroMemory the copy is
+//    done such that byte granularity is assured for all platforms.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to zero.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        LEAF_ENTRY(RtlZeroBytes)
+
+        bis     zero, zero, a2          // set fill pattern
+        br      zero, RtlpFillBytes     //
+
+        SBTTL("Fill Bytes")
+//++
+//
+// VOID
+// RtlFillBytes (
+//    IN PVOID Destination,
+//    IN ULONG Length,
+//    IN UCHAR Fill
+//    )
+//
+// Routine Description:
+//
+//    This function fills memory by first aligning the destination address to
+//    a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
+//    blocks, followed by any remaining bytes. Unlike RtlFillMemory the copy is
+//    done such that byte granularity is assured for all platforms.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to fill.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
+//
+//    Fill (a2) - Supplies the fill byte.
+//
+//    N.B. The alternate entry memset expects the length and fill arguments
+//         to be reversed. It also returns the Destination pointer.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        ALTERNATE_ENTRY(RtlFillBytes)
+
+        and     a2, 0xff, a2            // clear excess bits
+        sll     a2, 8, t0               // duplicate fill byte
+        bis     a2, t0, a2              // generate fill word
+        sll     a2, 16, t0              // duplicate fill word
+        bis     a2, t0, a2              // generate fill longword
+        sll     a2, 32, t0              // duplicate fill longword
+        bis     a2, t0, a2              // generate fill quadword
+
+.align 3                                // ensure quadword aligned target
+//
+// Fill memory with the pattern specified in register a2.
+//
+
+RtlpFillBytes:                          //
+
+//
+// Align destination to quadword
+//
+
+        beq     a1, 80f                 // anything to fill? (paranoia)
+        and     a0, 8-1, t0             // t0 = unaligned bits
+        bne     t0, 5f                  // if ne, then not quad aligned
+        br      zero, 20f               // if eq, then quad aligned
+
+5:
+        bis     zero, zero, t1          // t1 = destination byte zap mask
+        bis     zero, 1, t2             //
+        sll     t2, t0, t2              // t2 = next bit to set in zap mask
+10:
+        beq     a1, 15f                 // if eq, all bits set
+        bis     t1, t2, t1              // set bit in zap mask
+        sll     t2, 1, t2               // set next higher bit for zap mask
+        subq    a1, 1, a1               // decrement bytes to fill
+        addq    t0, 1, t0               // increment byte within quad
+        cmpeq   t0, 8, t3               // finished the quadword?
+        beq     t3, 10b                 // if eq [false], do next byte
+15:
+        zapnot  a2, t1, t2              // clear fill bytes
+        bic     a0, 7, a3               // a3 = quadword base of destination
+retry5:
+        ldq_l   t0, 0(a3)               // load destination quadword
+        zap     t0, t1, t0              // clear destination bytes
+        or      t0, t2, t0              // merge in fill bytes
+        stq_c   t0, 0(a3)               // store merged quadword conditional
+        beq     t0, retry5f             // if eq, retry failed interlock
+
+        addq    a0, 7, a0               // move a0 to next quadword
+        bic     a0, 7, a0               // align a0 to quadword
+
+//
+// Check for 64-byte blocks
+//
+
+20:
+        srl     a1, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 40f                 // if eq then no 64 byte blocks
+        and     a1, 64-1, a1            // a1 = residual bytes to fill
+
+30:
+        stq     a2, 0(a0)               // store 64 bytes
+        stq     a2, 8(a0)               //
+        stq     a2, 16(a0)              //
+        stq     a2, 24(a0)              //
+        stq     a2, 32(a0)              //
+        stq     a2, 40(a0)              //
+        stq     a2, 48(a0)              //
+        stq     a2, 56(a0)              //
+
+        subq    t0, 1, t0               // decrement blocks remaining
+        addq    a0, 64, a0              // increment destination pointer
+        bne     t0, 30b                 // more blocks to write
+
+//
+// Fill aligned quadwords
+//
+
+40:
+        srl     a1, 3, t0               // t0 = number of quadwords
+        bne     t0, 55f                 // if ne quadwords left to fill
+        br      zero, 60f               // if eq no quadwords left
+
+55:
+        and     a1, 8-1, a1             // a1 = residual bytes to fill
+
+50:
+        stq     a2, 0(a0)               // store quadword
+        subq    t0, 1, t0               // decrement quadwords remaining
+        addq    a0, 8, a0               // next quadword
+        bne     t0, 50b                 // more quadwords to write
+
+//
+// Fill bytes for last quadword
+//
+
+60:
+        beq     a1, 80f                 // if eq no more bytes to fill
+
+        mov     a1, t0                  // t0 = number of bytes to move
+        mov     -1, t1                  // t1 = bit mask
+        sll     t0, 3, t0               // # of bytes to # of bits
+        srl     t1, t0, t1              // clear t0 bits
+        sll     t1, t0, t0              // move it back
+        bic     a2, t0, t1              // clear fill bytes not copied
+        not     t0, t0                  // complement to clear destination
+retry6:
+        ldq_l   t2, 0(a0)               // get last destination quadword locked
+        bic     t2, t0, t2              // clear bytes to be copied
+        bis     t2, t1, t2              // move bytes from source
+        stq_c   t2, 0(a0)               // store merged quadword conditional
+        beq     t2, retry6f             // if eq, retry failed interlock
+
+//
+// Finish up
+//
+
+80:
+        ret     zero, (ra)              // return
+
+//
+// Out of line branches for failed store conditional.
+// Don't need to restore anything, just try again.
+//
+
+retry5f:
+        br      retry5
+retry6f:
+        br      retry6
+
+        .end    RtlZeroBytes