Diffstat (limited to 'private/ntos/rtl/alpha/mvmem.s')
-rw-r--r--  private/ntos/rtl/alpha/mvmem.s | 1920
1 file changed, 1920 insertions, 0 deletions
diff --git a/private/ntos/rtl/alpha/mvmem.s b/private/ntos/rtl/alpha/mvmem.s
new file mode 100644
index 000000000..c5ccc9a81
--- /dev/null
+++ b/private/ntos/rtl/alpha/mvmem.s
@@ -0,0 +1,1920 @@
+// TITLE("Compare, Move, Zero, and Fill Memory Support")
+//++
+//
+// Copyright (c) 1992 Digital Equipment Corporation
+//
+// Module Name:
+//
+// mvmem.s
+//
+// Abstract:
+//
+// This module implements functions to compare, move, zero, and fill
+// blocks of memory. If the memory is aligned, then these functions
+// are very efficient.
+//
+// N.B. These routines MUST preserve all floating state since they are
+// frequently called from interrupt service routines that normally
+// do not save or restore floating state.
+//
+// Author:
+//
+// Joe Notarangelo 21-May-1992
+//
+// Environment:
+//
+// User or Kernel mode.
+//
+// Revision History:
+//
+// Monty VanderBilt 14-Feb-1996 Avoid memory loads and taken branches between
+// load-locked and store-conditional instructions
+// to conform with the Alpha architecture rules.
+// Monty VanderBilt 27-Feb-1996 Added RtlZeroBytes and RtlFillBytes to support
+// byte granularity access when necessary.
+//--
+
+#include "ksalpha.h"
+
+ SBTTL("Compare Memory")
+//++
+//
+// ULONG
+// RtlCompareMemory (
+// IN PVOID Source1,
+// IN PVOID Source2,
+// IN ULONG Length
+// )
+//
+// Routine Description:
+//
+// This function compares two blocks of memory and returns the number
+// of bytes that compared equal.
+//
+// Arguments:
+//
+// Source1 (a0) - Supplies a pointer to the first block of memory to
+// compare.
+//
+// Source2 (a1) - Supplies a pointer to the second block of memory to
+// compare.
+//
+// Length (a2) - Supplies the length, in bytes, of the memory to be
+// compared.
+//
+// Return Value:
+//
+// The number of bytes that compared equal is returned as the function
+// value. If all bytes compared equal, then the length of the original
+// block of memory is returned.
+//
+//--
+
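+// Added illustration (not part of the original source): a minimal C sketch of
+// the contract described above, using a hypothetical helper name and standard
+// C types. It scans until the first mismatching byte and returns the count of
+// bytes that compared equal.
+//
+//     size_t RtlCompareMemorySketch(const void *s1, const void *s2, size_t n) {
+//         const unsigned char *p = s1, *q = s2;
+//         size_t i;
+//         for (i = 0; i < n; i += 1) {
+//             if (p[i] != q[i]) break;    // stop at the first difference
+//         }
+//         return i;                       // equals n if the blocks match
+//     }
+//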
+
+ LEAF_ENTRY(RtlCompareMemory)
+
+ bis a2, zero, v0 // save length of comparison
+ beq a2, 90f // (JAE) quit if nothing to compare
+ xor a0, a1, t0 // check for compatible alignment
+ and t0, 0x7, t0 // low bits only
+ bne t0, CompareUnaligned // if ne, incompatible alignment
+
+//
+// Compare memory aligned
+//
+
+CompareAligned: //
+
+//
+// compare memory until sources are aligned
+//
+ and a0, 0x7, t0 // get low bits
+ bne t0, 10f // if ne, sources not aligned yet
+ br zero, 30f // already aligned, predicted
+
+
+10:
+ ldq_u t1, 0(a0) // get unaligned quad at source 1
+ ldq_u t2, 0(a1) // get unaligned quad at source 2
+
+20:
+ extbl t1, t0, t4 // byte at t0 in source 1 quad
+ extbl t2, t0, t5 // byte at t0 in source 2 quad
+ xor t4, t5, t3 // t1 = t2 ?
+ bne t3, 110f // not equal, miscompare
+ subq a2, 1, a2 // decrement bytes to compare
+ beq a2, 90f // if eq, compare success
+ addq t0, 1, t0 // increment pointer within quad
+ cmpeq t0, 8, t3 // t0 = 8?, if so first quadword done
+ beq t3, 20b // continue while t0 < 8
+
+
+ addq a0, 8, a0 // increment to next quadword
+ addq a1, 8, a1 // increment source 2 to next also
+ bic a0, 7, a0 // align source 1 quadword
+ bic a1, 7, a1 // align source 2 quadword
+
+
+//
+// aligned block compare, compare blocks of 64 bytes
+//
+
+30:
+ srl a2, 6, t0 // t0 = number of 64 byte blocks
+ beq t0, 50f // if eq, no 64 byte blocks
+
+//
+// N.B. loads from each of the sources were separated in case these
+// blocks are fighting for the cache
+//
+ .set noat
+40:
+ ldq t1, 0(a0) // t1 = source 1, quad 0
+ ldq t2, 8(a0) // t2 = source 1, quad 1
+ ldq t3, 16(a0) // t3 = source 1, quad 2
+ addq a1, 64, a1 // increment source 2 pointer
+ ldq t4, 24(a0) // t4 = source 1, quad 3
+
+ ldq t5, -64(a1) // t5 = source 2, quad 0
+ ldq a4, -56(a1) // a4 = source 2, quad 1
+ ldq a5, -48(a1) // a5 = source 2, quad 2
+ xor t1, t5, $at // quad 0 match?
+ bne $at, 200f // if ne[false], miscompare
+ ldq t5, -40(a1) // t5 = source 2, quad 3
+ ldq t1, 32(a0) // t1 = source 1, quad 4
+ xor t2, a4, $at // quad 1 match?
+ bne $at, 122f // if ne[false], miscompare
+ ldq t2, 40(a0) // t2 = source 1, quad 5
+ xor t3, a5, $at // quad 2 match?
+ bne $at, 124f // if ne[false], miscompare
+ ldq t3, 48(a0) // t3 = source 1, quad 6
+ xor t4, t5, $at // quad 3 match?
+ bne $at, 126f // if ne[false], miscompare
+ ldq t4, 56(a0) // t4 = source 1, quad 7
+
+ ldq t5, -32(a1) // t5 = source 2, quad 4
+ addq a0, 64, a0 // increment source 1 pointer
+ ldq a4, -24(a1) // a4 = source 2, quad 5
+ subq t0, 1, t0 // decrement blocks to compare
+ ldq a5, -16(a1) // a5 = source 2, quad 6
+ xor t1, t5, $at // quad 4 match?
+ bne $at, 130f // if ne[false], miscompare
+ ldq t5, -8(a1) // t5 = source 2, quad 7
+ xor t2, a4, $at // quad 5 match?
+ bne $at, 132f // if ne[false], miscompare
+ xor t3, a5, $at // quad 6 match?
+ bne $at, 134f // if ne[false], miscompare
+ xor t4, t5, $at // quad 7 match?
+ bne $at, 136f // if ne[false], miscompare
+ subq a2, 64, a2 // decrement bytes to compare
+ bne t0, 40b // if ne, more blocks to compare
+ .set at
+
+
+//
+// Compare quadwords
+//
+
+50:
+ srl a2, 3, t0 // t0 = number of quadwords to compare
+ beq t0, 70f // if eq, no quadwords to compare
+
+ .set noat
+60:
+ ldq t1, 0(a0) // t1 = quad from source 1
+ lda a0, 8(a0) // increment source 1 pointer
+ ldq t2, 0(a1) // t2 = quad from source 2
+ lda a1, 8(a1) // increment source 2 pointer
+ xor t1, t2, $at // are quadwords equal?
+ bne $at, 200f // if ne, miscompare
+ subq t0, 1, t0 // decrement quads to compare
+ subq a2, 8, a2 // decrement bytes to compare
+ bne t0, 60b // if ne, more quads to compare
+
+ .set at
+
+//
+// Compare bytes in last quadword
+//
+
+// a2 = number of bytes to compare, less than 8, greater than zero
+// a0, a1, quad-aligned to last quadword
+
+ beq a2, 80f // if eq, all bytes compared
+
+ .set noat
+70:
+ ldq t1, 0(a0) // t1 = quad at source 1
+ ldq t2, 0(a1) // t2 = quad at source 2
+ bis zero, 0xff, t0 // zap mask
+ sll t0, a2, t0 //
+ zap t1, t0, t1 // zero bytes not compared
+ zap t2, t0, t2 // same for source 2
+ xor t1, t2, $at // compare quadwords
+ bne $at, 200f // if ne, miscompare
+
+ .set at
+//
+// Successful compare
+// v0 already contains full length
+//
+
+80:
+ ret zero, (ra) // return
+
+
+//
+// Sources have incompatible alignment
+//
+CompareUnaligned:
+
+
+//
+// Compare until source 1 (a0) is aligned
+//
+
+ and a0, 0x7, t0 // get byte position of pointer
+ beq t0, 30f // if eq, already aligned
+
+ ldq_u t1, 0(a0) // get unaligned quad at a0
+
+10:
+ ldq_u t2, 0(a1) // get unaligned quad at a1
+ extbl t1, t0, t4 // get byte to compare from source 1
+ extbl t2, a1, t2 // get byte to compare from source 2
+ xor t4, t2, t3 // do bytes match?
+ bne t3, 110f // if ne, miscompare
+ subq a2, 1, a2 // decrement bytes to compare
+ beq a2, 90f // (JAE) quit if nothing left to compare
+ addq t0, 1, t0 // increment byte within source 1
+ addq a1, 1, a1 // increment source 2 pointer
+ cmpeq t0, 8, t3 // finished with source 1 quad?
+ beq t3, 10b // if eq[false], more to compare
+
+ addq a0, 7, a0 // point to next source 1 quad
+ bic a0, 7, a0 // align to quadword
+
+
+//
+// Compare 64-byte blocks
+//
+
+30:
+ srl a2, 6, t0 // t0 = number of blocks to compare
+        beq     t0, 50f                 // if eq, no blocks to compare
+
+ ldq_u t1, 0(a1) // get source 2 unaligned quad 1
+
+ .set noat
+40:
+ ldq_u t2, 7(a1) // get source 2 unaligned quad 2
+ addq a0, 64, a0 // increment source 1 pointer
+ ldq_u t3, 15(a1) // get source 2 unaligned quad 3
+ extql t1, a1, t1 // bytes from unaligned quad 1
+ extqh t2, a1, $at // bytes from unaligned quad 2
+ ldq_u t4, 23(a1) // get source 2 unaligned quad 4
+ bis t1, $at, t1 // t1 = quadword 1 (source 2)
+ ldq_u t5, 31(a1) // get source 2 unaligned quad 5
+ extql t2, a1, t2 // bytes from unaligned quad 2
+ extqh t3, a1, $at // bytes from unaligned quad 3
+ ldq a3, -64(a0) // a3 = quadword 1 (source 1)
+ bis t2, $at, t2 // t2 = quadword 2 (source 2)
+ ldq a4, -56(a0) // a4 = quadword 2 (source 1)
+ extql t3, a1, t3 // bytes from unaligned quad 3
+ extqh t4, a1, $at // bytes from unaligned quad 4
+ ldq a5, -48(a0) // a5 = quadword 3 (source 1)
+ bis t3, $at, t3 // t3 = quadword 3 (source 2)
+ extql t4, a1, t4 // bytes from unaligned quad 4
+ extqh t5, a1, $at // bytes from unaligned quad 5
+ subq t0, 1, t0 // decrement blocks to compare
+ bis t4, $at, t4 // t4 = quadword 4 (source 2)
+
+ xor t1, a3, $at // match on quadword 1?
+ ldq a3, -40(a0) // a3 = quadword 4 (source 1)
+ bne $at, 200f // if ne, miscompare quad 1
+ xor t2, a4, $at // match on quadword 2?
+ ldq_u t2, 39(a1) // get source 2 unaligned quad 6
+ bne $at, 122f // if ne, miscompare quad 2
+ xor t3, a5, $at // match on quadword 3?
+ ldq_u t3, 47(a1) // get source 2 unaligned quad 7
+ bne $at, 124f // if ne, miscompare quad 3
+ xor t4, a3, $at // match on quadword 4?
+ ldq_u t4, 55(a1) // get source 2 unaligned quad 8
+ bne $at, 126f // if ne, miscompare quad 4
+ ldq_u t1, 63(a1) // get source 2 unaligned quad 9
+
+ ldq a3, -32(a0) // a3 = quadword 5 (source 1)
+ extql t5, a1, t5 // bytes from unaligned quad 5
+ extqh t2, a1, $at // bytes from unaligned quad 6
+ ldq a4, -24(a0) // a4 = quadword 6 (source 1)
+ ldq a5, -16(a0) // a5 = quadword 7 (source 1)
+ bis t5, $at, t5 // t5 = quadword 5 (source 2)
+
+ xor t5, a3, $at // match on quadword 5?
+ ldq a3, -8(a0) // a3 = quadword 8 (source 1)
+ bne $at, 130f // if ne, miscompare quad 5
+ extql t2, a1, t2 // bytes from unaligned quad 6
+ extqh t3, a1, $at // bytes from unaligned quad 7
+ extql t3, a1, t3 // bytes from unaligned quad 7
+ bis t2, $at, t2 // t2 = quadword 6 (source 2)
+ xor t2, a4, $at // match on quadword 6?
+ bne $at, 132f // if ne, miscompare quad 6
+ extqh t4, a1, $at // bytes from unaligned quad 8
+ extql t4, a1, t4 // bytes from unaligned quad 8
+ bis t3, $at, t3 // t3 = quadword 7 (source 2)
+ xor t3, a5, $at // match on quadword 7?
+ bne $at, 134f // if ne, miscompare quad 7
+ extqh t1, a1, $at // bytes from unaligned quad 9
+ addq a1, 64, a1 // increment source 2 pointer
+ bis t4, $at, t4 // t4 = quadword 8 (source 2)
+ xor t4, a3, $at // match on quadword 8?
+ bne $at, 136f // if ne, miscompare quad 8
+ subq a2, 64, a2 // decrement number of bytes to compare
+ bne t0, 40b // if ne, more blocks to compare
+
+ .set at
+
+//
+// Compare quadwords
+//
+
+
+50:
+ srl a2, 3, t0 // t0 = number of quads to compare
+ beq t0, 70f // if eq, no quads to compare
+ ldq_u t1, 0(a1) // get unaligned quad 1 (source 2)
+
+ .set noat
+60:
+ ldq_u t2, 7(a1) // get unaligned quad 2 (source 2)
+ ldq t3, 0(a0) // t3 = quadword 1 (source 1)
+ extql t1, a1, t1 // get bytes from unaligned quad 1
+ extqh t2, a1, $at // get bytes from unaligned quad 2
+ addq a1, 8, a1 // increment source 2 pointer
+ bis t1, $at, t1 // t1 = quadword 1 (source 2)
+ xor t1, t3, $at // match on quadword?
+ bne $at, 200f // if ne, miscompare
+ subq t0, 1, t0 // decrement quadwords to compare
+ addq a0, 8, a0 // increment source 1 pointer
+ subq a2, 8, a2 // decrement bytes to compare
+ bis t2, zero, t1 // save low quadword for next loop
+ bne t0, 60b // if ne, more quads to compare
+
+ .set at
+
+//
+// Compare bytes for final quadword
+//
+
+70:
+ beq a2, 90f // if eq, comparison complete
+
+ ldq t1, 0(a0) // get quadword from source 1
+ bis zero, zero, t0 // t0 = byte position to compare
+
+ .set noat
+80:
+ ldq_u t2, 0(a1) // get unaligned quad from source 2
+ extbl t1, t0, t3 // t3 = byte from source 1
+ extbl t2, a1, t2 // t2 = byte from source 2
+ xor t3, t2, $at // match on byte?
+ bne $at, 100f // if ne, miscompare on byte
+ addq t0, 1, t0 // increment byte position
+ addq a1, 1, a1 // increment source 2 pointer
+ subq a2, 1, a2 // decrement bytes to compare
+ bne a2, 80b // if ne, more bytes to compare
+
+ .set at
+//
+// Successful full comparison
+//
+
+90:
+ ret zero, (ra) // return, v0 already set
+
+
+//
+// Miscompare on last quadword
+//
+
+100:
+ subq v0, a2, v0 // subtract bytes not compared
+ ret zero, (ra) // return
+
+//
+// Miscompare on first quadword, unaligned case
+//
+// v0 = total bytes to compare
+// a2 = bytes remaining to compare
+//
+
+110:
+ subq v0, a2, v0 // bytes compared successfully
+ ret zero, (ra) // return
+
+//
+// Miscompare on 64-byte block compare
+//
+
+122:
+ subq a2, 8, a2 // miscompare on quad 2
+ br zero, 200f // finish in common code
+
+124:
+ subq a2, 16, a2 // miscompare on quad 3
+ br zero, 200f // finish in common code
+
+126:
+ subq a2, 24, a2 // miscompare on quad 4
+ br zero, 200f // finish in common code
+
+130:
+ subq a2, 32, a2 // miscompare on quad 5
+ br zero, 200f // finish in common code
+
+132:
+ subq a2, 40, a2 // miscompare on quad 6
+ br zero, 200f // finish in common code
+
+134:
+ subq a2, 48, a2 // miscompare on quad 7
+ br zero, 200f // finish in common code
+
+136:
+ subq a2, 56, a2 // miscompare on quad 8
+ br zero, 200f // finish in common code
+
+//
+// Miscompare, determine number of bytes that successfully compared
+// $at = xor of relevant quads from sources, must be non-zero
+// a2 = number of bytes left to compare
+//
+ .set noat
+200:
+ cmpbge zero, $at, $at // $at = mask of non-zero bytes
+
+ //
+ // look for the first bit cleared in $at, this is the
+ // number of the first byte which differed
+ //
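+        //
+        // Worked example (added note): if the quadwords first differ in byte 3,
+        // the xor result has zero bytes 0-2 and a non-zero byte 3, so
+        // cmpbge zero, $at sets mask bits 0-2 and clears bit 3; the scan below
+        // then stops with t0 = 3, the count of matching low-order bytes.
+        //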
+ bis zero, zero, t0 // bit position to look for clear
+
+210:
+ blbc $at, 220f // if low clear, found difference
+ srl $at, 1, $at // check next bit
+ addq t0, 1, t0 // count bit position checked
+ br zero, 210b
+
+220:
+ subq v0, a2, v0 // subtract bytes yet to compare
+ addq v0, t0, v0 // add bytes that matched on last quad
+
+ ret zero, (ra)
+
+ .set at
+
+ .end RtlCompareMemory
+
+
+
+ SBTTL("Move Memory")
+//++
+//
+// VOID
+// RtlMoveMemory (
+// IN PVOID Destination,
+// IN PVOID Source,
+// IN ULONG Length
+// )
+//
+// Routine Description:
+//
+// This function moves memory either forward or backward, aligned or
+// unaligned, in 64-byte blocks, followed by 8-byte blocks, followed
+// by any remaining bytes.
+//
+// Arguments:
+//
+// Destination (a0) - Supplies a pointer to the destination address of
+// the move operation.
+//
+// Source (a1) - Supplies a pointer to the source address of the move
+// operation.
+//
+// Length (a2) - Supplies the length, in bytes, of the memory to be moved.
+//
+// Return Value:
+//
+// None.
+//
+//--
+
+ LEAF_ENTRY(RtlMoveMemory)
+
+ beq a2, 80f // if eq, no bytes to move
+//
+// If the source address is less than the destination address and source
+// address plus the length of the move is greater than the destination
+// address, then the source and destination overlap such that the move
+// must be performed backwards.
+//
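+//
+// Added illustration (not part of the original source): a hedged C sketch of
+// that overlap test, assuming standard pointer arithmetic on char pointers.
+//
+//     if ((char *)Source < (char *)Destination &&
+//         (char *)Source + Length > (char *)Destination) {
+//         /* destructive forward overlap: copy backward from the end */
+//     } else {
+//         /* no destructive overlap: copy forward */
+//     }
+//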
+
+ cmpult a0, a1, t0 // is destination less than source
+        bne     t0, MoveForward         // if ne [true], no overlap possible
+ addq a1, a2, t0 // compute source ending address
+ cmpult t0, a0, t1 // is source end less than dest.
+ beq t1, MoveBackward // if eq [false], overlap
+
+//
+// Move memory forward aligned and unaligned.
+//
+
+MoveForward: //
+ xor a0, a1, t0 // compare alignment bits
+        and     t0, 0x7, t0             // isolate alignment comparison
+ bne t0, MoveForwardUnaligned // if ne, incompatible alignment
+
+//
+// Move memory forward aligned.
+//
+
+MoveForwardAligned: //
+
+//
+// Move bytes until source and destination are quadword aligned
+//
+
+ and a0, 0x7, t0 // t0 = unaligned bits
+ bne t0, 5f // if ne, not quad aligned
+ br zero, 20f // predicted taken
+
+5:
+ ldq_u t2, 0(a0) // get unaligned quad from dest.
+ ldq_u t1, 0(a1) // get unaligned quadword from source
+10:
+ beq a2, 15f // if eq, all bytes moved
+ extbl t1, t0, t3 // t3 = byte from source
+ insbl t3, t0, t3 // t3 = byte from source, in position
+ mskbl t2, t0, t2 // clear position in dest. quad
+ bis t2, t3, t2 // merge in byte from source
+ subq a2, 1, a2 // decrement bytes to move
+ addq t0, 1, t0 // increment byte within quad
+ cmpeq t0, 8, t3 // finished the quadword?
+ beq t3, 10b // if eq [false], do next byte
+15:
+ stq_u t2, 0(a0) // store merged destination bytes
+
+ addq a0, 7, a0 // move to next quadword
+ bic a0, 7, a0 // aligned quadword
+
+ addq a1, 7, a1 // move to next quadword
+ bic a1, 7, a1 // aligned quadword
+
+//
+// Check for 64-byte block moves
+//
+
+20:
+ srl a2, 6, t0 // t0 = number of 64 byte blocks
+ beq t0, 40f // if eq no blocks to move
+ and a2, 64-1, a2 // a2 = residual bytes
+
+30:
+ ldq t1, 0(a1) // load 64 bytes from source
+ addq a0, 64, a0 // increment destination pointer
+ ldq v0, 56(a1) //
+ ldq a3, 32(a1) //
+ stq t1, -64(a0) // write to destination
+ ldq t2, 8(a1) // into volatile registers
+ ldq t3, 16(a1) //
+ ldq t4, 24(a1) //
+ subq t0, 1, t0 // decrement number of blocks
+ stq t2, -56(a0) //
+ ldq a4, 40(a1) //
+ stq t3, -48(a0) //
+ ldq a5, 48(a1) //
+ stq t4, -40(a0) //
+ addq a1, 64, a1 // increment source pointer
+ stq a3, -32(a0) //
+ stq a4, -24(a0) //
+ stq a5, -16(a0) //
+ stq v0, -8(a0) //
+ bne t0, 30b // if ne, more blocks to copy
+
+//
+// Copy quadwords
+//
+
+40:
+ srl a2, 3, t0 // t0 = number of quadwords to move
+ beq t0, 60f // if eq no quadwords to move
+ and a2, 8-1, a2 // a2 = residual bytes
+
+50:
+ ldq t1, 0(a1) // load quadword from source
+ addq a1, 8, a1 // increment source pointer
+ stq t1, 0(a0) // store quadword to destination
+ addq a0, 8, a0 // increment destination pointer
+ subq t0, 1, t0 // decrement number of quadwords
+ bne t0, 50b // if ne, more quadwords to move
+
+//
+// Move final residual bytes
+//
+
+60:
+ beq a2, 80f // if eq, no more bytes to move
+ ldq t1, 0(a1) // get last source quadword
+ ldq t2, 0(a0) // get last dest. quadword
+ bis zero, zero, t0 // t0 = next byte number to move
+
+70:
+ extbl t1, t0, t3 // extract byte from source
+ insbl t3, t0, t3 // t3 = source byte, in position
+ mskbl t2, t0, t2 // clear byte position for dest.
+ bis t2, t3, t2 // merge in source byte
+ addq t0, 1, t0 // increment byte position
+ subq a2, 1, a2 // decrement bytes to move
+ bne a2, 70b // if ne => more bytes to move
+
+ stq t2, 0(a0) // store merged data
+
+//
+// Finish aligned MoveForward
+//
+
+80:
+ ret zero, (ra) // return
+
+
+
+//
+// Move memory forward unaligned.
+//
+
+MoveForwardUnaligned: //
+
+
+//
+// Move bytes until the destination is aligned
+//
+
+ and a0, 0x7, t0 // t0 = unaligned bits
+ beq t0, 100f // if eq, destination quad aligned
+
+ ldq_u t2, 0(a0) // get unaligned quad from dest
+
+90:
+ beq a2, 95f // if eq no more bytes to move
+ ldq_u t1, 0(a1) // get unaligned quad from source
+ extbl t1, a1, t1 // extract source byte
+ insbl t1, t0, t1 // t1 = source byte, in position
+ mskbl t2, t0, t2 // clear byte position in dest.
+ bis t2, t1, t2 // merge in source byte
+ addq t0, 1, t0 // increment byte position
+ addq a1, 1, a1 // increment source pointer
+ subq a2, 1, a2 // decrement bytes to move
+ cmpeq t0, 8, t3 // t0 = 8? => quad finished
+ beq t3, 90b // if eq [false], more bytes to move
+95:
+ stq_u t2, 0(a0) // store merged quadword
+ addq a0, 7, a0 // increment to next quad
+ bic a0, 7, a0 // align next quadword
+
+//
+// Check for 64-byte blocks to move
+//
+
+100:
+ srl a2, 6, t0 // t0 = number of blocks to move
+ beq t0, 120f // if eq no blocks to move
+ and a2, 64-1, a2 // a2 = residual bytes to move
+
+
+ ldq_u t1, 0(a1) // t1 = first unaligned quad
+
+110:
+ // get source data and merge it
+ // as we go
+ ldq_u t2, 7(a1) // t2 = second unaligned quad
+ extql t1, a1, t1 // extract applicable bytes from t1
+ extqh t2, a1, v0 // extract applicable bytes from t2
+ bis t1, v0, t1 // t1 = quad #1
+ ldq_u t3, 15(a1) // t3 = third unaligned quad
+ extql t2, a1, t2 // extract applicable bytes from t2
+ extqh t3, a1, v0 // extract applicable bytes from t3
+ stq t1, 0(a0) // store quad #1
+ bis t2, v0, t2 // t2 = quad #2
+ ldq_u t4, 23(a1) // t4 = fourth unaligned quad
+ extql t3, a1, t3 // extract applicable bytes from t3
+ extqh t4, a1, v0 // extract applicable bytes from t4
+ stq t2, 8(a0) // store quad #2
+ bis t3, v0, t3 // t3 = quad #3
+ ldq_u t5, 31(a1) // t5 = fifth unaligned quad
+ extql t4, a1, t4 // extract applicable bytes from t4
+ extqh t5, a1, v0 // extract applicable bytes from t5
+ stq t3, 16(a0) // store quad #3
+ bis t4, v0, t4 // t4 = quad #4
+ ldq_u a3, 39(a1) // a3 = sixth unaligned quad
+ extql t5, a1, t5 // extract applicable bytes from t5
+ extqh a3, a1, v0 // extract applicable bytes from a3
+ stq t4, 24(a0) // store quad #4
+ bis t5, v0, t5 // t5 = quad #5
+ ldq_u a4, 47(a1) // a4 = seventh unaligned quad
+ extql a3, a1, a3 // extract applicable bytes from a3
+ extqh a4, a1, v0 // extract applicable bytes from a4
+ stq t5, 32(a0) // store quad #5
+ bis a3, v0, a3 // a3 = quad #6
+ ldq_u a5, 55(a1) // a5 = eighth unaligned quad
+ extql a4, a1, a4 // extract applicable bytes from a4
+ extqh a5, a1, v0 // extract applicable bytes from a5
+ stq a3, 40(a0) // store quad #6
+ bis a4, v0, a4 // a4 = quad #7
+ ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next
+ extql a5, a1, a5 // extract applicable bytes from a5
+ extqh t1, a1, v0 // extract applicable bytes from t1
+ stq a4, 48(a0) // store quad #7
+ bis a5, v0, a5 // a5 = quad #8
+ addq a1, 64, a1 // increment source pointer
+ stq a5, 56(a0) // store quad #8
+ addq a0, 64, a0 // increment destination pointer
+ subq t0, 1, t0 // decrement number of blocks
+ bne t0, 110b // if ne, more blocks to move
+
+//
+// Move unaligned source quads to aligned destination quads
+//
+
+120:
+ srl a2, 3, t0 // t0 = number of quads to move
+ beq t0, 140f // if eq no quads to move
+ and a2, 8-1, a2 // a2 = residual bytes
+
+
+ ldq_u t1, 0(a1) // t1 = first unaligned quad
+130:
+ ldq_u t2, 7(a1) // t2 = second unaligned quad
+ addq a0, 8, a0 // increment destination pointer
+ extql t1, a1, t1 // extract applicable bytes from t1
+ extqh t2, a1, v0 // extract applicable bytes from t2
+ bis t1, v0, t1 // t1 = quadword of data
+ stq t1, -8(a0) // store data to destination
+ addq a1, 8, a1 // increment source pointer
+ subq t0, 1, t0 // decrement quads to move
+ bis t2, zero, t1 // t1 = first of next unaligned pair
+ bne t0, 130b // if ne, more quads to move
+
+//
+// Move remaining bytes to final quadword
+//
+
+
+140:
+ beq a2, 160f // if eq no more bytes to move
+ ldq t2, 0(a0) // t2 = destination quadword
+ bis zero, zero, t3 // t3 = position for next insertion
+
+150:
+ ldq_u t1, 0(a1) // get unaligned source quad
+ extbl t1, a1, t1 // t1 = source byte
+ insbl t1, t3, t1 // t1 = source byte, in position
+ mskbl t2, t3, t2 // clear byte in destination
+ bis t2, t1, t2 // merge in source byte
+ addq a1, 1, a1 // increment source pointer
+ subq a2, 1, a2 // decrement bytes to move
+ addq t3, 1, t3 // increment destination position
+ bne a2, 150b // more bytes to move
+
+ stq t2, 0(a0) // store merged data
+
+//
+// Finish unaligned MoveForward
+//
+
+160:
+ ret zero, (ra) // return
+
+
+//
+// Move memory backward.
+//
+
+MoveBackward: //
+
+ addq a0, a2, a0 // compute ending destination address
+ addq a1, a2, a1 // compute ending source address
+ subq a0, 1, a0 // point to last destination byte
+ subq a1, 1, a1 // point to last source byte
+ xor a0, a1, t0 // compare alignment bits
+ and t0, 0x7, t0 // isolate alignment comparison
+ bne t0, MoveBackwardUnaligned // if ne, incompatible alignment
+
+//
+// Move memory backward aligned.
+//
+
+MoveBackwardAligned: //
+
+//
+// Move bytes until source and destination are quadword aligned
+//
+
+ and a0, 0x7, t0 // t0 = unaligned bits
+ cmpeq t0, 7, t1 // last byte position 7?
+ beq t1, 5f // if eq [false], not quad aligned
+ subq a0, 7, a0 // point to beginning of last quad
+ subq a1, 7, a1 // point to beginning of last quad
+ br zero, 30f // predicted taken
+
+5:
+ ldq_u t1, 0(a0) // get unaligned quad from dest.
+ ldq_u t2, 0(a1) // get unaligned quad from source
+
+10:
+ beq a2, 20f // if eq, all bytes moved
+ extbl t2, t0, t3 // t3 = byte from source
+ insbl t3, t0, t3 // t3 = byte from source, in position
+ mskbl t1, t0, t1 // clear position in destination
+ bis t1, t3, t1 // merge in byte from source
+ subq a2, 1, a2 // decrement bytes to move
+ subq t0, 1, t0 // decrement byte within quadword
+ cmplt t0, zero, t3 // finished the quadword?
+ beq t3, 10b // if eq [false], do next byte
+
+20:
+ stq_u t1, 0(a0) // store merged destination bytes
+
+ subq a0, 8, a0 // move to previous quadword
+ bic a0, 7, a0 // aligned quadword
+
+ subq a1, 8, a1 // move to previous quadword
+ bic a1, 7, a1 // aligned quadword
+
+//
+// Check for 64-byte block moves
+//
+
+30:
+
+ srl a2, 6, t0 // t0 = number of 64 byte blocks
+ beq t0, 50f // if eq, no blocks to move
+ and a2, 64-1, a2 // a2 = residual bytes
+
+40:
+ ldq t1, 0(a1) // load 64 bytes from source into
+ subq a0, 64, a0 // decrement destination pointer
+ ldq v0, -56(a1) //
+ ldq a3, -32(a1) //
+ stq t1, 64(a0) // write to destination
+ ldq t2, -8(a1) // into volatile registers
+ ldq a5, -48(a1) //
+ ldq a4, -40(a1) //
+ stq t2, 56(a0) //
+ ldq t3, -16(a1) //
+ ldq t4, -24(a1) //
+ subq a1, 64, a1 // decrement source pointer
+ stq t3, 48(a0) //
+ stq t4, 40(a0) //
+ stq a3, 32(a0) //
+ subq t0, 1, t0 // decrement number of blocks
+ stq a4, 24(a0) //
+ stq a5, 16(a0) //
+ stq v0, 8(a0) //
+ bne t0, 40b // if ne, more blocks to copy
+
+//
+// Copy quadwords
+//
+
+50:
+ srl a2, 3, t0 // t0 = number of quadwords to move
+ beq t0, 70f // if eq no quadwords to move
+ and a2, 8-1, a2 // a2 = residual bytes
+
+60:
+ ldq t1, 0(a1) // load quadword from source
+ subq a1, 8, a1 // decrement source pointer
+ stq t1, 0(a0) // store quadword to destination
+ subq a0, 8, a0 // decrement destination pointer
+ subq t0, 1, t0 // decrement quadwords to move
+ bne t0, 60b // if ne, more quadwords to move
+
+//
+// Move final residual bytes
+//
+
+70:
+ beq a2, 90f // if eq, no more bytes to move
+ ldq t1, 0(a1) // get last source quadword
+ ldq t2, 0(a0) // get last destination quadword
+ bis zero, 7, t0 // t0 = next byte number to move
+
+80:
+ extbl t1, t0, t3 // extract byte from source
+ insbl t3, t0, t3 // t3 = source byte, in position
+ mskbl t2, t0, t2 // clear byte position for dest.
+ bis t2, t3, t2 // merge in source byte
+ subq t0, 1, t0 // decrement byte position
+ subq a2, 1, a2 // decrement bytes to move
+ bne a2, 80b // if ne, more bytes to move
+
+ stq t2, 0(a0) // write destination data
+//
+// Finish aligned MoveBackward
+//
+
+90:
+
+ ret zero, (ra) // return
+
+
+//
+// Move memory backward unaligned.
+//
+
+MoveBackwardUnaligned: //
+
+
+//
+// Move bytes until the destination is aligned
+//
+
+ and a0, 0x7, t0 // t0 = unaligned bits
+ cmpeq t0, 7, t1 // last byte of a quadword
+ beq t1, 95f // if eq[false], not aligned
+ subq a0, 7, a0 // align pointer to beginning of quad
+ br zero, 120f //
+
+95:
+ ldq_u t2, 0(a0) // get unaligned quad from dest.
+
+100:
+ beq a2, 110f // if eq, no more bytes to move
+ ldq_u t1, 0(a1) // get unaligned quad from source
+ extbl t1, a1, t1 // extract source byte
+ insbl t1, t0, t1 // t1 = source byte in position
+ mskbl t2, t0, t2 // clear byte position in dest.
+ bis t2, t1, t2 // merge source byte
+ subq t0, 1, t0 // decrement byte position
+ subq a1, 1, a1 // decrement source pointer
+ subq a2, 1, a2 // decrement number of bytes to move
+ cmplt t0, zero, t3 // t0 < 0? => quad finished
+ beq t3, 100b // if eq [false], more bytes to move
+
+110:
+ stq_u t2, 0(a0) // store merged quadword
+
+ subq a0, 8, a0 // decrement dest. to previous quad
+ bic a0, 7, a0 // align previous quadword
+
+//
+// Check for 64-byte blocks to move
+//
+
+120:
+
+ srl a2, 6, t0 // t0 = number of blocks to move
+ subq a1, 7, a1 // point to beginning of last quad
+ beq t0, 140f // if eq no blocks to move
+ and a2, 64-1, a2 // a2 = residual bytes to move
+
+ ldq_u t1, 7(a1) // t1 = first unaligned quad
+
+130:
+ // get source data and merge it
+ // as we go
+ ldq_u t2, 0(a1) // t2 = second unaligned quad
+ extqh t1, a1, t1 // extract applicable bytes from t1
+ extql t2, a1, v0 // extract applicable bytes from t2
+ bis t1, v0, t1 // t1 = quad #1
+ ldq_u t3, -8(a1) // t3 = third unaligned quad
+ extqh t2, a1, t2 // extract applicable bytes from t2
+ extql t3, a1, v0 // extract applicable bytes from t3
+ stq t1, 0(a0) // store quad #1
+ bis t2, v0, t2 // t2 = quad #2
+ ldq_u t4, -16(a1) // t4 = fourth unaligned quad
+ extqh t3, a1, t3 // extract applicable bytes from t3
+ extql t4, a1, v0 // extract applicable bytes from t4
+ stq t2, -8(a0) // store quad #2
+ bis t3, v0, t3 // t3 = quad #3
+ ldq_u t5, -24(a1) // t5 = fifth unaligned quad
+ extqh t4, a1, t4 // extract applicable bytes from t4
+ extql t5, a1, v0 // extract applicable bytes from t5
+ stq t3, -16(a0) // store quad #3
+ bis t4, v0, t4 // t4 = quad #4
+ ldq_u a3, -32(a1) // a3 = sixth unaligned quad
+ extqh t5, a1, t5 // extract applicable bytes from t5
+ extql a3, a1, v0 // extract applicable bytes from a3
+ stq t4, -24(a0) // store quad #4
+ bis t5, v0, t5 // t5 = quad #5
+ ldq_u a4, -40(a1) // a4 = seventh unaligned quad
+ extqh a3, a1, a3 // extract applicable bytes from a3
+ extql a4, a1, v0 // extract applicable bytes from a4
+ stq t5, -32(a0) // store quad #5
+ bis a3, v0, a3 // a3 = quad #6
+ ldq_u a5, -48(a1) // a5 = eighth unaligned quad
+ extqh a4, a1, a4 // extract applicable bytes from a4
+ extql a5, a1, v0 // extract applicable bytes from a5
+ stq a3, -40(a0) // store quad #6
+ bis a4, v0, a4 // a4 = quad #7
+ ldq_u t1, -56(a1) // t1 = ninth unaligned = 1st of next
+ extqh a5, a1, a5 // extract applicable bytes from a5
+ extql t1, a1, v0 // extract applicable bytes from t1
+ stq a4, -48(a0) // store quad #7
+ bis a5, v0, a5 // a5 = quad #8
+        subq    a1, 64, a1              // decrement source pointer
+        stq     a5, -56(a0)             // store quad #8
+        subq    a0, 64, a0              // decrement destination pointer
+ subq t0, 1, t0 // decrement number of blocks
+ bne t0, 130b // if ne, more blocks to move
+
+
+//
+// Move unaligned source quads to aligned destination quads
+//
+
+140:
+ srl a2, 3, t0 // t0 = number of quads to move
+ beq t0, 160f // if eq no quads to move
+ and a2, 8-1, a2 // a2 = residual bytes
+
+ ldq_u t1, 7(a1) // t1 = first unaligned quad
+
+150:
+ ldq_u t2, 0(a1) // t2 = second unaligned quad
+ subq a0, 8, a0 // decrement destination pointer
+ extqh t1, a1, t1 // extract applicable bytes from t1
+ extql t2, a1, v0 // extract applicable bytes from t2
+ bis t1, v0, t1 // t1 = quadword of data
+ stq t1, 8(a0) // store data to destination
+ subq a1, 8, a1 // decrement source pointer
+ subq t0, 1, t0 // decrement quads to move
+ bis t2, zero, t1 // t1 = first of next unaligned pair
+ bne t0, 150b // if ne, more quads to move
+
+//
+// Move remaining bytes to final quadword
+//
+
+160:
+ beq a2, 180f // if eq, no more bytes to move
+ ldq t2, 0(a0) // t2 = destination quadword
+ bis zero, 7, t0 // t0 = position for next insertion
+
+170:
+ subq a1, 1, a1 // decrement source pointer
+ ldq_u t1, 8(a1) // get unaligned source quad
+ extbl t1, a1, t1 // t1 = source byte
+ insbl t1, t0, t1 // t1 = source byte, in position
+ mskbl t2, t0, t2 // clear byte position
+ bis t2, t1, t2 // merge in source byte
+ subq t0, 1, t0 // decrement byte position for dest.
+ subq a2, 1, a2 // decrement bytes to move
+ bne a2, 170b // if ne, more bytes to move
+
+ stq t2, 0(a0) //
+
+//
+// Finish unaligned MoveBackward
+//
+
+180:
+ ret zero, (ra) // return
+
+ .end RtlMoveMemory
+
+ SBTTL("Zero Memory")
+//++
+//
+// VOID
+// RtlZeroMemory (
+// IN PVOID Destination,
+// IN ULONG Length
+// )
+//
+// Routine Description:
+//
+// This function zeros memory by first aligning the destination address to
+// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
+// blocks, followed by any remaining bytes.
+//
+// Arguments:
+//
+// Destination (a0) - Supplies a pointer to the memory to zero.
+//
+// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
+//
+// Return Value:
+//
+// None.
+//
+//--
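+
+// Added note: the body below simply loads a zero fill pattern and falls into
+// the shared RtlpFillMemory code, so RtlZeroMemory(p, n) behaves like
+// RtlFillMemory(p, n, 0).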
+
+ LEAF_ENTRY(RtlZeroMemory)
+
+ bis zero, zero, a2 // set fill pattern
+ br zero, RtlpFillMemory //
+
+
+ SBTTL("Fill Memory")
+//++
+//
+// VOID
+// RtlFillMemory (
+// IN PVOID Destination,
+// IN ULONG Length,
+// IN UCHAR Fill
+// )
+//
+// Routine Description:
+//
+// This function fills memory by first aligning the destination address to
+// a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
+// blocks, followed by any remaining bytes.
+//
+// Arguments:
+//
+// Destination (a0) - Supplies a pointer to the memory to fill.
+//
+// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
+//
+// Fill (a2) - Supplies the fill byte.
+//
+// N.B. The alternate entry memset expects the length and fill arguments
+// to be reversed. It also returns the Destination pointer
+//
+// Return Value:
+//
+// None.
+//
+//--
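+
+// Added illustration (not part of the original source): the argument order
+// noted above, assuming the standard C memset prototype.
+//
+//     memset(Destination, Fill, Length);          /* CRT order: dest, fill, length */
+//     RtlFillMemory(Destination, Length, Fill);   /* Rtl order:  dest, length, fill */
+//
+// memset also returns Destination, which is why v0 is set below before the
+// length and fill arguments are swapped.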
+
+ ALTERNATE_ENTRY(memset)
+
+ bis a0, zero, v0 // set return value
+ bis a1, zero, a3 // swap length and fill arguments
+ bis a2, zero, a1 //
+ bis a3, zero, a2 //
+
+ ALTERNATE_ENTRY(RtlFillMemory)
+
+ and a2, 0xff, a2 // clear excess bits
+ sll a2, 8, t0 // duplicate fill byte
+ bis a2, t0, a2 // generate fill word
+ sll a2, 16, t0 // duplicate fill word
+ bis a2, t0, a2 // generate fill longword
+ sll a2, 32, t0 // duplicate fill longword
+ bis a2, t0, a2 // generate fill quadword
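+        //
+        // Worked example (added note): a fill byte of 0xAB is widened in three
+        // doubling steps, 0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB,
+        // so every byte of the quadword pattern in a2 holds the fill value.
+        //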
+
+.align 3 // ensure quadword aligned target
+//
+// Fill memory with the pattern specified in register a2.
+//
+
+RtlpFillMemory: //
+
+//
+// Align destination to quadword
+//
+
+ beq a1, 80f // anything to fill? (paranoia)
+ and a0, 8-1, t0 // t0 = unaligned bits
+ bne t0, 5f // if ne, then not quad aligned
+ br zero, 20f // if eq, then quad aligned
+
+5:
+ ldq_u t1, 0(a0) // get unaligned quadword
+ // for first group of bytes
+10:
+ beq a1, 15f // if eq no more bytes to fill
+ insbl a2, t0, t2 // get fill byte into position
+ mskbl t1, t0, t1 // clear byte for fill
+ bis t1, t2, t1 // put in fill byte
+ addq t0, 1, t0 // increment to next byte position
+ subq a1, 1, a1 // decrement bytes to fill
+ cmpeq t0, 8, t2 // t0 = 8?
+ beq t2, 10b // if eq [false] more bytes to do
+
+15:
+ stq_u t1, 0(a0) // store modified bytes
+ addq a0, 7, a0 // move a0 to next quadword
+ bic a0, 7, a0 // align a0 to quadword
+
+//
+// Check for 64-byte blocks
+//
+
+20:
+ srl a1, 6, t0 // t0 = number of 64 byte blocks
+ beq t0, 40f // if eq then no 64 byte blocks
+ and a1, 64-1, a1 // a1 = residual bytes to fill
+
+30:
+ stq a2, 0(a0) // store 64 bytes
+ stq a2, 8(a0) //
+ stq a2, 16(a0) //
+ stq a2, 24(a0) //
+ stq a2, 32(a0) //
+ stq a2, 40(a0) //
+ stq a2, 48(a0) //
+ stq a2, 56(a0) //
+
+ subq t0, 1, t0 // decrement blocks remaining
+ addq a0, 64, a0 // increment destination pointer
+ bne t0, 30b // more blocks to write
+
+
+
+//
+// Fill aligned quadwords
+//
+
+40:
+ srl a1, 3, t0 // t0 = number of quadwords
+ bne t0, 55f // if ne quadwords left to fill
+ br zero, 60f // if eq no quadwords left
+
+55:
+ and a1, 8-1, a1 // a1 = residual bytes to fill
+
+50:
+ stq a2, 0(a0) // store quadword
+ subq t0, 1, t0 // decrement quadwords remaining
+ addq a0, 8, a0 // next quadword
+ bne t0, 50b // more quadwords to write
+
+
+//
+// Fill bytes for last quadword
+//
+
+60:
+ bne a1, 65f // if ne bytes remain to be filled
+ br zero, 80f // if eq no more bytes to fill
+
+65:
+ ldq t1, 0(a0) // get last quadword
+ bis zero, zero, t0 // t0 = byte position to start fill
+
+70:
+ beq a1, 75f // if eq, no more bytes to fill
+ insbl a2, t0, t2 // get fill byte into position
+ mskbl t1, t0, t1 // clear fill byte position
+ bis t1, t2, t1 // insert fill byte
+ addq t0, 1, t0 // increment byte within quad
+ subq a1, 1, a1 // decrement bytes to fill
+ cmpeq t0, 8, t3 // t0 = 8? => finished quad
+ beq t3, 70b // if eq [false] more bytes to fill
+
+75:
+ stq t1, 0(a0) // write merged quadword
+
+//
+// Finish up
+//
+
+80:
+ ret zero, (ra) // return
+
+
+ .end RtlZeroMemory
+
+ SBTTL("Fill Memory Ulong")
+//++
+//
+// VOID
+// RtlFillMemoryUlong (
+// IN PVOID Destination,
+// IN ULONG Length,
+// IN ULONG Pattern
+// )
+//
+// Routine Description:
+//
+// This function fills memory with the specified longword pattern by
+// filling 64-byte blocks followed by 8-byte blocks and finally
+// 4-byte blocks.
+//
+// N.B. This routine assumes that the destination address is aligned
+// on a longword boundary and that the length is an even multiple
+// of longwords.
+//
+// Arguments:
+//
+// Destination (a0) - Supplies a pointer to the memory to fill.
+//
+// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
+//
+// Pattern (a2) - Supplies the fill pattern.
+//
+// Return Value:
+//
+// None.
+//
+//--
+
+ LEAF_ENTRY(RtlFillMemoryUlong)
+
+ bic a1, 3, a1 // make sure length is an even number
+ // of longwords
+ sll a2, 32, a3 // a3 = long pattern in upper 32 bits
+ srl a3, 32, t0 // clear upper bits, pattern in lower 32
+ bis a3, t0, a3 // a3 = quad version of fill pattern
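+
+// Worked example (added note): with Pattern = 0xDEADBEEF the shifts above give
+// a3 = 0xDEADBEEF00000000 and t0 = 0x00000000DEADBEEF, and the final OR yields
+// a3 = 0xDEADBEEFDEADBEEF, the longword pattern replicated into a quadword for
+// the stq stores below.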
+
+//
+// Make destination address quad-aligned
+//
+
+ and a0, 4, t0 // is a0 quad aligned?
+ beq t0, 10f // if eq, then a0 quad aligned
+ stl a2, 0(a0) // fill first longword
+ addq a0, 4, a0 // quad align a0
+ subq a1, 4, a1 // bytes remaining to store
+
+//
+// Check for 64-byte blocks to fill
+//
+
+10:
+ srl a1, 6, t0 // t0 = # 64-byte blocks to fill
+ beq t0, 30f // if eq no 64 byte blocks
+ and a1, 64-1, a1 // a1 = residual bytes
+
+20:
+ stq a3, 0(a0) // store 64 bytes
+ stq a3, 8(a0) //
+ stq a3, 16(a0) //
+ stq a3, 24(a0) //
+ stq a3, 32(a0) //
+ stq a3, 40(a0) //
+ stq a3, 48(a0) //
+ stq a3, 56(a0) //
+ subq t0, 1, t0 // t0 = blocks remaining
+ addq a0, 64, a0 // increment address pointer
+ bne t0, 20b // if ne more blocks to fill
+
+//
+// Fill 8 bytes at a time while we can, a1 = bytes remaining
+//
+
+30:
+ srl a1, 3, t0 // t0 = # quadwords to fill
+ beq t0, 50f // if eq no quadwords left
+ and a1, 8-1, a1 // a1 = residual bytes
+40:
+ stq a3, 0(a0) // store quadword
+ subq t0, 1, t0 // t0 = quadwords remaining
+ addq a0, 8, a0 // increment address pointer
+ bne t0, 40b // if ne more quadwords to fill
+
+//
+// Fill last 4 bytes
+//
+
+50:
+ beq a1, 60f // if eq no longwords remain
+ stl a2, 0(a0) // fill last longword
+
+//
+// Finish up
+//
+
+60:
+ ret zero, (ra) // return to caller
+
+
+ .end RtlFillMemoryUlong
+
+ SBTTL("Copy Memory With Byte Granularity")
+//++
+//
+// VOID
+// RtlCopyBytes (
+// IN PVOID Destination,
+// IN PVOID Source,
+// IN ULONG Length
+// )
+//
+// Routine Description:
+//
+// This function copies non-overlapping memory, aligned or unaligned, in
+// 64-byte blocks, followed by 8-byte blocks, followed by any remaining
+// bytes. Unlike RtlCopyMemory or RtlMoveMemory the copy is done such
+// that byte granularity is assured for all platforms.
+//
+// Arguments:
+//
+// Destination (a0) - Supplies a pointer to the destination address of
+// the move operation.
+//
+// Source (a1) - Supplies a pointer to the source address of the move
+// operation.
+//
+// Length (a2) - Supplies the length, in bytes, of the memory to be moved.
+//
+// Return Value:
+//
+// None.
+//
+//--
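+
+// Added illustration (not part of the original source): a hedged C sketch of
+// the partial-quadword merge that gives byte granularity, using a hypothetical
+// helper and uint64_t from <stdint.h>. The real code performs this
+// read-modify-write under ldq_l/stq_c so that a concurrent write to a
+// neighbouring byte by another processor is not lost.
+//
+//     uint64_t MergeBytes(uint64_t dst, uint64_t src, unsigned byte_mask) {
+//         uint64_t result = 0;
+//         for (int i = 0; i < 8; i++) {
+//             uint64_t lane = 0xffull << (8 * i);
+//             /* take byte i from src if its mask bit is set, else keep dst */
+//             result |= (byte_mask & (1u << i)) ? (src & lane) : (dst & lane);
+//         }
+//         return result;
+//     }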
+
+ LEAF_ENTRY(RtlCopyBytes)
+
+//
+// Move memory forward aligned and unaligned.
+//
+
+ xor a0, a1, t0 // compare alignment bits
+ and t0, 0x7, t0 // isolate alignment comparison
+ bne t0, CopyForwardUnaligned // if ne, incompatible alignment
+
+//
+// Source and Destination buffers have the same alignment. Move
+// bytes until done or source and destination are quadword aligned
+//
+
+ and a0, 0x7, t0 // t0 = unaligned bits
+ bne t0, 5f // if ne, not quad aligned
+ br zero, 20f // predicted taken
+5:
+        bis     zero, zero, t1          // t1 = destination byte zap mask
+ bis zero, 1, t2
+ sll t2, t0, t2 // t2 = next bit to set in zap mask
+10:
+ beq a2, 15f // if eq, all bits set
+ bis t1, t2, t1 // set bit in zap mask
+ sll t2, 1, t2 // set next higher bit for zap mask
+ subq a2, 1, a2 // decrement bytes to move
+ addq t0, 1, t0 // increment byte within quad
+ cmpeq t0, 8, t3 // finished the quadword?
+ beq t3, 10b // if eq [false], do next byte
+15:
+ ldq_u t2, 0(a1) // get unaligned quadword from source
+ zapnot t2, t1, t2 // clear source bytes
+ bic a0, 7, a3 // a3 = quadword base of destination
+retry1:
+ ldq_l t0, 0(a3) // load destination quadword
+ zap t0, t1, t0 // clear destination bytes
+ or t0, t2, t0 // merge in bytes from source
+ stq_c t0, 0(a3) // store merged quadword conditional
+ beq t0, retry1f // if eq, retry failed interlock
+
+ addq a0, 7, a0 // move to next quadword
+ bic a0, 7, a0 // aligned quadword
+
+ addq a1, 7, a1 // move to next quadword
+ bic a1, 7, a1 // aligned quadword
+
+//
+// Check for 64-byte block moves
+//
+
+20:
+ srl a2, 6, t0 // t0 = number of 64 byte blocks
+ beq t0, 40f // if eq no blocks to move
+ and a2, 64-1, a2 // a2 = residual bytes
+
+30:
+ ldq t1, 0(a1) // load 64 bytes from source
+ addq a0, 64, a0 // increment destination pointer
+ ldq v0, 56(a1) //
+ ldq a3, 32(a1) //
+ stq t1, -64(a0) // write to destination
+ ldq t2, 8(a1) // into volatile registers
+ ldq t3, 16(a1) //
+ ldq t4, 24(a1) //
+ subq t0, 1, t0 // decrement number of blocks
+ stq t2, -56(a0) //
+ ldq a4, 40(a1) //
+ stq t3, -48(a0) //
+ ldq a5, 48(a1) //
+ stq t4, -40(a0) //
+ addq a1, 64, a1 // increment source pointer
+ stq a3, -32(a0) //
+ stq a4, -24(a0) //
+ stq a5, -16(a0) //
+ stq v0, -8(a0) //
+ bne t0, 30b // if ne, more blocks to copy
+
+//
+// Copy quadwords
+//
+
+40:
+ srl a2, 3, t0 // t0 = number of quadwords to move
+ beq t0, 60f // if eq no quadwords to move
+ and a2, 8-1, a2 // a2 = residual bytes
+
+50:
+ ldq t1, 0(a1) // load quadword from source
+ addq a1, 8, a1 // increment source pointer
+ stq t1, 0(a0) // store quadword to destination
+ addq a0, 8, a0 // increment destination pointer
+ subq t0, 1, t0 // decrement number of quadwords
+ bne t0, 50b // if ne, more quadwords to move
+
+//
+// Move final residual bytes
+//
+
+60:
+ beq a2, 80f // if eq, no more bytes to move
+ mov a2, t0 // t0 = number of bytes to move
+ mov -1, t1 // t1 = bit mask
+ sll t0, 3, t0 // # of bytes to # of bits
+ srl t1, t0, t1 // clear t0 bits
+ sll t1, t0, t0 // move it back
+ ldq t1, 0(a1) // get last source quadword
+ bic t1, t0, t1 // clear bytes not copied
+ not t0, t0 // complement to clear destination
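+        //
+        // Worked example (added note): with 3 residual bytes, t0 becomes
+        // 0xFFFFFFFFFF000000 after the shifts, t1 keeps only the low 3 source
+        // bytes, and the complemented t0 (0x0000000000FFFFFF) marks the
+        // destination bytes replaced inside the ldq_l/stq_c sequence below.
+        //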
+retry2:
+ ldq_l t2, 0(a0) // get last destination quadword locked
+ bic t2, t0, t2 // clear bytes to be copied
+ bis t2, t1, t2 // move bytes from source
+ stq_c t2, 0(a0) // store merged quadword conditional
+ beq t2, retry2f // if eq, retry failed interlock
+
+//
+// Finish aligned MoveForward
+//
+
+80:
+ ret zero, (ra) // return
+
+//
+// Move memory forward unaligned.
+//
+
+CopyForwardUnaligned: //
+
+//
+// Move bytes until the destination is aligned
+//
+
+ and a0, 0x7, t0 // t0 = unaligned bits
+ beq t0, 100f // if eq, destination quad aligned
+        bis     zero, zero, t1          // t1 = destination byte zap mask
+ bis zero, 1, t2
+ sll t2, t0, t2 // t2 = next bit to set in zap mask
+ mov zero, t4 // assemble destination bytes here
+90:
+ beq a2, 95f // if eq no more bytes to move
+ bis t1, t2, t1 // set bit in zap mask
+ sll t2, 1, t2 // set next higher bit for zap mask
+ ldq_u t5, 0(a1) // get unaligned quad from source
+ extbl t5, a1, t5 // extract source byte
+ insbl t5, t0, t5 // t5 = source byte, in position
+ or t4, t5, t4 // merge in source byte
+ addq t0, 1, t0 // increment byte position
+ addq a1, 1, a1 // increment source pointer
+ subq a2, 1, a2 // decrement bytes to move
+ cmpeq t0, 8, t3 // t0 = 8? => quad finished
+ beq t3, 90b // if eq [false], more bytes to move
+95:
+ bic a0, 0x7, a3 // a3 = quadword base of destination
+retry3:
+ ldq_l t0, 0(a3) // load destination quadword
+ zap t0, t1, t0 // clear destination bytes
+ or t0, t4, t0 // merge in bytes from source
+ stq_c t0, 0(a3) // store merged quadword conditional
+ beq t0, retry3f // if eq, retry failed interlock
+
+ addq a0, 7, a0 // increment to next quad
+ bic a0, 7, a0 // align next quadword
+
+//
+// Check for 64-byte blocks to move
+//
+
+100:
+ srl a2, 6, t0 // t0 = number of blocks to move
+ beq t0, 120f // if eq no blocks to move
+ and a2, 64-1, a2 // a2 = residual bytes to move
+
+ ldq_u t1, 0(a1) // t1 = first unaligned quad
+110:
+ // get source data and merge it
+ // as we go
+ ldq_u t2, 7(a1) // t2 = second unaligned quad
+ extql t1, a1, t1 // extract applicable bytes from t1
+ extqh t2, a1, v0 // extract applicable bytes from t2
+ bis t1, v0, t1 // t1 = quad #1
+ ldq_u t3, 15(a1) // t3 = third unaligned quad
+ extql t2, a1, t2 // extract applicable bytes from t2
+ extqh t3, a1, v0 // extract applicable bytes from t3
+ stq t1, 0(a0) // store quad #1
+ bis t2, v0, t2 // t2 = quad #2
+ ldq_u t4, 23(a1) // t4 = fourth unaligned quad
+ extql t3, a1, t3 // extract applicable bytes from t3
+ extqh t4, a1, v0 // extract applicable bytes from t4
+ stq t2, 8(a0) // store quad #2
+ bis t3, v0, t3 // t3 = quad #3
+ ldq_u t5, 31(a1) // t5 = fifth unaligned quad
+ extql t4, a1, t4 // extract applicable bytes from t4
+ extqh t5, a1, v0 // extract applicable bytes from t5
+ stq t3, 16(a0) // store quad #3
+ bis t4, v0, t4 // t4 = quad #4
+ ldq_u a3, 39(a1) // a3 = sixth unaligned quad
+ extql t5, a1, t5 // extract applicable bytes from t5
+ extqh a3, a1, v0 // extract applicable bytes from a3
+ stq t4, 24(a0) // store quad #4
+ bis t5, v0, t5 // t5 = quad #5
+ ldq_u a4, 47(a1) // a4 = seventh unaligned quad
+ extql a3, a1, a3 // extract applicable bytes from a3
+ extqh a4, a1, v0 // extract applicable bytes from a4
+ stq t5, 32(a0) // store quad #5
+ bis a3, v0, a3 // a3 = quad #6
+ ldq_u a5, 55(a1) // a5 = eighth unaligned quad
+ extql a4, a1, a4 // extract applicable bytes from a4
+ extqh a5, a1, v0 // extract applicable bytes from a5
+ stq a3, 40(a0) // store quad #6
+ bis a4, v0, a4 // a4 = quad #7
+ ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next
+ extql a5, a1, a5 // extract applicable bytes from a5
+ extqh t1, a1, v0 // extract applicable bytes from t1
+ stq a4, 48(a0) // store quad #7
+ bis a5, v0, a5 // a5 = quad #8
+ addq a1, 64, a1 // increment source pointer
+ stq a5, 56(a0) // store quad #8
+ addq a0, 64, a0 // increment destination pointer
+ subq t0, 1, t0 // decrement number of blocks
+ bne t0, 110b // if ne, more blocks to move
+
+//
+// Move unaligned source quads to aligned destination quads
+//
+
+120:
+ srl a2, 3, t0 // t0 = number of quads to move
+ beq t0, 140f // if eq no quads to move
+ and a2, 8-1, a2 // a2 = residual bytes
+
+
+ ldq_u t1, 0(a1) // t1 = first unaligned quad
+130:
+ ldq_u t2, 7(a1) // t2 = second unaligned quad
+ addq a0, 8, a0 // increment destination pointer
+ extql t1, a1, t1 // extract applicable bytes from t1
+ extqh t2, a1, v0 // extract applicable bytes from t2
+ bis t1, v0, t1 // t1 = quadword of data
+ stq t1, -8(a0) // store data to destination
+ addq a1, 8, a1 // increment source pointer
+ subq t0, 1, t0 // decrement quads to move
+ bis t2, zero, t1 // t1 = first of next unaligned pair
+ bne t0, 130b // if ne, more quads to move
+
+//
+// Move remaining bytes to final quadword
+//
+
+140:
+ beq a2, 160f // if eq no more bytes to move
+
+ mov zero, t3 // t3 = position for next insertion
+ mov zero, t4 // assemble destination bytes here
+ mov a2, t0 // t0 = number of bytes to move
+ mov -1, t1 // t1 = bit mask
+ sll t0, 3, t0 // # of bytes to # of bits
+ srl t1, t0, t1 // clear t0 bits
+ sll t1, t0, t0 // move it back
+ not t0, t0 // complement for destination clear mask
+150:
+ ldq_u t1, 0(a1) // get unaligned source quad
+ extbl t1, a1, t1 // t1 = source byte
+ insbl t1, t3, t1 // t1 = source byte, in position
+ bis t4, t1, t4 // merge in source byte
+ addq a1, 1, a1 // increment source pointer
+ subq a2, 1, a2 // decrement bytes to move
+ addq t3, 1, t3 // increment destination position
+ bne a2, 150b // more bytes to move
+retry4:
+ ldq_l t2, 0(a0) // get last destination quadword locked
+ bic t2, t0, t2 // clear bytes to be copied
+ bis t2, t4, t2 // move bytes from source
+ stq_c t2, 0(a0) // store merged quadword conditional
+ beq t2, retry4f // if eq, retry failed interlock
+
+//
+// Finish unaligned MoveForward
+//
+
+160:
+ ret zero, (ra) // return
+
+//
+// Out of line branches for failed store conditional.
+// Don't need to restore anything, just try again.
+//
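+// Added note: in the common (successful) case the beq after each stq_c falls
+// through, so no branch is taken on the main path, and each ldq_l/stq_c pair
+// stays free of intervening loads and taken branches, matching the
+// 14-Feb-1996 revision note above.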
+
+retry1f:
+ br retry1
+retry2f:
+ br retry2
+retry3f:
+ br retry3
+retry4f:
+ br retry4
+
+ .end RtlCopyBytes
+
+ SBTTL("Zero Bytes")
+//++
+//
+// VOID
+// RtlZeroBytes (
+// IN PVOID Destination,
+// IN ULONG Length
+// )
+//
+// Routine Description:
+//
+// This function zeros memory by first aligning the destination address to
+// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
+// blocks, followed by any remaining bytes. Unlike RtlZeroMemory the zeroing
+// is done such that byte granularity is assured for all platforms.
+//
+// Arguments:
+//
+// Destination (a0) - Supplies a pointer to the memory to zero.
+//
+// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
+//
+// Return Value:
+//
+// None.
+//
+//--
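+
+// Added note: as with RtlCopyBytes above, any partial quadword is written with
+// a zap-mask merge under ldq_l/stq_c (the retry5/retry6 sequences below), so a
+// concurrent write to a neighbouring byte by another processor is not lost to
+// the read-modify-write.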
+
+ LEAF_ENTRY(RtlZeroBytes)
+
+ bis zero, zero, a2 // set fill pattern
+ br zero, RtlpFillBytes //
+
+
+ SBTTL("Fill Bytes")
+//++
+//
+// VOID
+// RtlFillBytes (
+// IN PVOID Destination,
+// IN ULONG Length,
+// IN UCHAR Fill
+// )
+//
+// Routine Description:
+//
+// This function fills memory by first aligning the destination address to
+// a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
+// blocks, followed by any remaining bytes. Unlike RtlFillMemory the fill is
+// done such that byte granularity is assured for all platforms.
+//
+// Arguments:
+//
+// Destination (a0) - Supplies a pointer to the memory to fill.
+//
+// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
+//
+// Fill (a2) - Supplies the fill byte.
+//
+// N.B. The alternate entry memset expects the length and fill arguments
+// to be reversed. It also returns the Destination pointer
+//
+// Return Value:
+//
+// None.
+//
+//--
+
+ ALTERNATE_ENTRY(RtlFillBytes)
+
+ and a2, 0xff, a2 // clear excess bits
+ sll a2, 8, t0 // duplicate fill byte
+ bis a2, t0, a2 // generate fill word
+ sll a2, 16, t0 // duplicate fill word
+ bis a2, t0, a2 // generate fill longword
+ sll a2, 32, t0 // duplicate fill longword
+ bis a2, t0, a2 // generate fill quadword
+
+.align 3 // ensure quadword aligned target
+//
+// Fill memory with the pattern specified in register a2.
+//
+
+RtlpFillBytes: //
+
+//
+// Align destination to quadword
+//
+
+ beq a1, 80f // anything to fill? (paranoia)
+ and a0, 8-1, t0 // t0 = unaligned bits
+ bne t0, 5f // if ne, then not quad aligned
+ br zero, 20f // if eq, then quad aligned
+
+5:
+        bis     zero, zero, t1          // t1 = destination byte zap mask
+ bis zero, 1, t2
+ sll t2, t0, t2 // t2 = next bit to set in zap mask
+10:
+ beq a1, 15f // if eq, all bits set
+ bis t1, t2, t1 // set bit in zap mask
+ sll t2, 1, t2 // set next higher bit for zap mask
+ subq a1, 1, a1 // decrement bytes to fill
+ addq t0, 1, t0 // increment byte within quad
+ cmpeq t0, 8, t3 // finished the quadword?
+ beq t3, 10b // if eq [false], do next byte
+15:
+ zapnot a2, t1, t2 // clear fill bytes
+ bic a0, 7, a3 // a3 = quadword base of destination
+retry5:
+ ldq_l t0, 0(a3) // load destination quadword
+ zap t0, t1, t0 // clear destination bytes
+ or t0, t2, t0 // merge in fill bytes
+ stq_c t0, 0(a3) // store merged quadword conditional
+ beq t0, retry5f // if eq, retry failed interlock
+
+ addq a0, 7, a0 // move a0 to next quadword
+ bic a0, 7, a0 // align a0 to quadword
+
+//
+// Check for 64-byte blocks
+//
+
+20:
+ srl a1, 6, t0 // t0 = number of 64 byte blocks
+ beq t0, 40f // if eq then no 64 byte blocks
+ and a1, 64-1, a1 // a1 = residual bytes to fill
+
+30:
+ stq a2, 0(a0) // store 64 bytes
+ stq a2, 8(a0) //
+ stq a2, 16(a0) //
+ stq a2, 24(a0) //
+ stq a2, 32(a0) //
+ stq a2, 40(a0) //
+ stq a2, 48(a0) //
+ stq a2, 56(a0) //
+
+ subq t0, 1, t0 // decrement blocks remaining
+ addq a0, 64, a0 // increment destination pointer
+ bne t0, 30b // more blocks to write
+
+
+
+//
+// Fill aligned quadwords
+//
+
+40:
+ srl a1, 3, t0 // t0 = number of quadwords
+ bne t0, 55f // if ne quadwords left to fill
+ br zero, 60f // if eq no quadwords left
+
+55:
+ and a1, 8-1, a1 // a1 = residual bytes to fill
+
+50:
+ stq a2, 0(a0) // store quadword
+ subq t0, 1, t0 // decrement quadwords remaining
+ addq a0, 8, a0 // next quadword
+ bne t0, 50b // more quadwords to write
+
+//
+// Fill bytes for last quadword
+//
+
+60:
+ beq a1, 80f // if eq no more bytes to fill
+
+ mov a1, t0 // t0 = number of bytes to move
+ mov -1, t1 // t1 = bit mask
+ sll t0, 3, t0 // # of bytes to # of bits
+ srl t1, t0, t1 // clear t0 bits
+ sll t1, t0, t0 // move it back
+ bic a2, t0, t1 // clear fill bytes not copied
+ not t0, t0 // complement to clear destination
+retry6:
+ ldq_l t2, 0(a0) // get last destination quadword locked
+ bic t2, t0, t2 // clear bytes to be copied
+ bis t2, t1, t2 // move bytes from source
+ stq_c t2, 0(a0) // store merged quadword conditional
+ beq t2, retry6f // if eq, retry failed interlock
+
+//
+// Finish up
+//
+
+80:
+ ret zero, (ra) // return
+
+//
+// Out of line branches for failed store conditional.
+// Don't need to restore anything, just try again.
+//
+
+retry5f:
+ br retry5
+retry6f:
+ br retry6
+
+ .end RtlZeroBytes