// TITLE("Compare, Move, Zero, and Fill Memory Support")
//++
//
// Copyright (c) 1992 Digital Equipment Corporation
//
// Module Name:
//
// mvmem.s
//
// Abstract:
//
// This module implements functions to compare, move, zero, and fill
// blocks of memory. If the memory is aligned, then these functions
// are very efficient.
//
// N.B. These routines MUST preserve all floating state since they are
// frequently called from interrupt service routines that normally
// do not save or restore floating state.
//
// Author:
//
// Joe Notarangelo 21-May-1992
//
// Environment:
//
// User or Kernel mode.
//
// Revision History:
//
// Monty VanderBilt 14-Feb-1996 Avoid memory loads and taken branches between
// load-locked and store-conditional instructions
// to conform with Alpha architecture rules.
// Monty VanderBilt 27-Feb-1996 Added RtlZeroBytes and RtlFillBytes to support
// byte granularity access when necessary.
//--
#include "ksalpha.h"
SBTTL("Compare Memory")
//++
//
// ULONG
// RtlCompareMemory (
// IN PVOID Source1,
// IN PVOID Source2,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function compares two blocks of memory and returns the number
// of bytes that compared equal.
//
// Arguments:
//
// Source1 (a0) - Supplies a pointer to the first block of memory to
// compare.
//
// Source2 (a1) - Supplies a pointer to the second block of memory to
// compare.
//
// Length (a2) - Supplies the length, in bytes, of the memory to be
// compared.
//
// Return Value:
//
// The number of bytes that compared equal is returned as the function
// value. If all bytes compared equal, then the length of the original
// block of memory is returned.
//
//--
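//
// For reference, a minimal C sketch of the semantics (the name
// RtlCompareMemorySketch is hypothetical; the assembly below is the real
// implementation and is much faster on aligned buffers):
//
//     ULONG
//     RtlCompareMemorySketch (PVOID Source1, PVOID Source2, ULONG Length)
//     {
//         PUCHAR p1 = (PUCHAR)Source1;
//         PUCHAR p2 = (PUCHAR)Source2;
//         ULONG i;
//
//         for (i = 0; i < Length; i += 1) {
//             if (p1[i] != p2[i]) {
//                 break;                          // first differing byte
//             }
//         }
//
//         return i;                               // bytes that compared equal
//     }
//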
LEAF_ENTRY(RtlCompareMemory)
bis a2, zero, v0 // save length of comparison
beq a2, 90f // if eq, nothing to compare
xor a0, a1, t0 // check for compatible alignment
and t0, 0x7, t0 // low bits only
bne t0, CompareUnaligned // if ne, incompatible alignment
//
// Compare memory aligned
//
CompareAligned: //
//
// compare memory until sources are aligned
//
and a0, 0x7, t0 // get low bits
bne t0, 10f // if ne, sources not aligned yet
br zero, 30f // already aligned, predicted
10:
ldq_u t1, 0(a0) // get unaligned quad at source 1
ldq_u t2, 0(a1) // get unaligned quad at source 2
20:
extbl t1, t0, t4 // byte at t0 in source 1 quad
extbl t2, t0, t5 // byte at t0 in source 2 quad
xor t4, t5, t3 // t4 = t5 ?
bne t3, 110f // not equal, miscompare
subq a2, 1, a2 // decrement bytes to compare
beq a2, 90f // if eq, compare success
addq t0, 1, t0 // increment pointer within quad
cmpeq t0, 8, t3 // t0 = 8?, if so first quadword done
beq t3, 20b // continue while t0 < 8
addq a0, 8, a0 // increment to next quadword
addq a1, 8, a1 // increment source 2 to next also
bic a0, 7, a0 // align source 1 quadword
bic a1, 7, a1 // align source 2 quadword
//
// aligned block compare, compare blocks of 64 bytes
//
30:
srl a2, 6, t0 // t0 = number of 64 byte blocks
beq t0, 50f // if eq, no 64 byte blocks
//
// N.B. The loads from the two sources are kept apart in case the
// blocks are contending for the cache.
//
.set noat
40:
ldq t1, 0(a0) // t1 = source 1, quad 0
ldq t2, 8(a0) // t2 = source 1, quad 1
ldq t3, 16(a0) // t3 = source 1, quad 2
addq a1, 64, a1 // increment source 2 pointer
ldq t4, 24(a0) // t4 = source 1, quad 3
ldq t5, -64(a1) // t5 = source 2, quad 0
ldq a4, -56(a1) // a4 = source 2, quad 1
ldq a5, -48(a1) // a5 = source 2, quad 2
xor t1, t5, $at // quad 0 match?
bne $at, 200f // if ne[false], miscompare
ldq t5, -40(a1) // t5 = source 2, quad 3
ldq t1, 32(a0) // t1 = source 1, quad 4
xor t2, a4, $at // quad 1 match?
bne $at, 122f // if ne[false], miscompare
ldq t2, 40(a0) // t2 = source 1, quad 5
xor t3, a5, $at // quad 2 match?
bne $at, 124f // if ne[false], miscompare
ldq t3, 48(a0) // t3 = source 1, quad 6
xor t4, t5, $at // quad 3 match?
bne $at, 126f // if ne[false], miscompare
ldq t4, 56(a0) // t4 = source 1, quad 7
ldq t5, -32(a1) // t5 = source 2, quad 4
addq a0, 64, a0 // increment source 1 pointer
ldq a4, -24(a1) // a4 = source 2, quad 5
subq t0, 1, t0 // decrement blocks to compare
ldq a5, -16(a1) // a5 = source 2, quad 6
xor t1, t5, $at // quad 4 match?
bne $at, 130f // if ne[false], miscompare
ldq t5, -8(a1) // t5 = source 2, quad 7
xor t2, a4, $at // quad 5 match?
bne $at, 132f // if ne[false], miscompare
xor t3, a5, $at // quad 6 match?
bne $at, 134f // if ne[false], miscompare
xor t4, t5, $at // quad 7 match?
bne $at, 136f // if ne[false], miscompare
subq a2, 64, a2 // decrement bytes to compare
bne t0, 40b // if ne, more blocks to compare
.set at
//
// Compare quadwords
//
50:
srl a2, 3, t0 // t0 = number of quadwords to compare
beq t0, 70f // if eq, no quadwords to compare
.set noat
60:
ldq t1, 0(a0) // t1 = quad from source 1
lda a0, 8(a0) // increment source 1 pointer
ldq t2, 0(a1) // t2 = quad from source 2
lda a1, 8(a1) // increment source 2 pointer
xor t1, t2, $at // are quadwords equal?
bne $at, 200f // if ne, miscompare
subq t0, 1, t0 // decrement quads to compare
subq a2, 8, a2 // decrement bytes to compare
bne t0, 60b // if ne, more quads to compare
.set at
//
// Compare bytes in last quadword
//
// a2 = number of bytes to compare, less than 8, possibly zero
// a0, a1 - quad-aligned pointers to the last quadword
beq a2, 80f // if eq, all bytes compared
.set noat
70:
ldq t1, 0(a0) // t1 = quad at source 1
ldq t2, 0(a1) // t2 = quad at source 2
bis zero, 0xff, t0 // zap mask
sll t0, a2, t0 // mask selects bytes not compared
zap t1, t0, t1 // zero bytes not compared
zap t2, t0, t2 // same for source 2
xor t1, t2, $at // compare quadwords
bne $at, 200f // if ne, miscompare
.set at
//
// Successful compare
// v0 already contains full length
//
80:
ret zero, (ra) // return
//
// Sources have incompatible alignment
//
CompareUnaligned:
//
// Compare until source 1 (a0) is aligned
//
and a0, 0x7, t0 // get byte position of pointer
beq t0, 30f // if eq, already aligned
ldq_u t1, 0(a0) // get unaligned quad at a0
10:
ldq_u t2, 0(a1) // get unaligned quad at a1
extbl t1, t0, t4 // get byte to compare from source 1
extbl t2, a1, t2 // get byte to compare from source 2
xor t4, t2, t3 // do bytes match?
bne t3, 110f // if ne, miscompare
subq a2, 1, a2 // decrement bytes to compare
beq a2, 90f // if eq, nothing left to compare
addq t0, 1, t0 // increment byte within source 1
addq a1, 1, a1 // increment source 2 pointer
cmpeq t0, 8, t3 // finished with source 1 quad?
beq t3, 10b // if eq[false], more to compare
addq a0, 7, a0 // point to next source 1 quad
bic a0, 7, a0 // align to quadword
//
// Compare 64-byte blocks
//
30:
srl a2, 6, t0 // t0 = number of blocks to compare
beq t0, 50f // if eq, no blocks to compare
ldq_u t1, 0(a1) // get source 2 unaligned quad 1
.set noat
40:
ldq_u t2, 7(a1) // get source 2 unaligned quad 2
addq a0, 64, a0 // increment source 1 pointer
ldq_u t3, 15(a1) // get source 2 unaligned quad 3
extql t1, a1, t1 // bytes from unaligned quad 1
extqh t2, a1, $at // bytes from unaligned quad 2
ldq_u t4, 23(a1) // get source 2 unaligned quad 4
bis t1, $at, t1 // t1 = quadword 1 (source 2)
ldq_u t5, 31(a1) // get source 2 unaligned quad 5
extql t2, a1, t2 // bytes from unaligned quad 2
extqh t3, a1, $at // bytes from unaligned quad 3
ldq a3, -64(a0) // a3 = quadword 1 (source 1)
bis t2, $at, t2 // t2 = quadword 2 (source 2)
ldq a4, -56(a0) // a4 = quadword 2 (source 1)
extql t3, a1, t3 // bytes from unaligned quad 3
extqh t4, a1, $at // bytes from unaligned quad 4
ldq a5, -48(a0) // a5 = quadword 3 (source 1)
bis t3, $at, t3 // t3 = quadword 3 (source 2)
extql t4, a1, t4 // bytes from unaligned quad 4
extqh t5, a1, $at // bytes from unaligned quad 5
subq t0, 1, t0 // decrement blocks to compare
bis t4, $at, t4 // t4 = quadword 4 (source 2)
xor t1, a3, $at // match on quadword 1?
ldq a3, -40(a0) // a3 = quadword 4 (source 1)
bne $at, 200f // if ne, miscompare quad 1
xor t2, a4, $at // match on quadword 2?
ldq_u t2, 39(a1) // get source 2 unaligned quad 6
bne $at, 122f // if ne, miscompare quad 2
xor t3, a5, $at // match on quadword 3?
ldq_u t3, 47(a1) // get source 2 unaligned quad 7
bne $at, 124f // if ne, miscompare quad 3
xor t4, a3, $at // match on quadword 4?
ldq_u t4, 55(a1) // get source 2 unaligned quad 8
bne $at, 126f // if ne, miscompare quad 4
ldq_u t1, 63(a1) // get source 2 unaligned quad 9
ldq a3, -32(a0) // a3 = quadword 5 (source 1)
extql t5, a1, t5 // bytes from unaligned quad 5
extqh t2, a1, $at // bytes from unaligned quad 6
ldq a4, -24(a0) // a4 = quadword 6 (source 1)
ldq a5, -16(a0) // a5 = quadword 7 (source 1)
bis t5, $at, t5 // t5 = quadword 5 (source 2)
xor t5, a3, $at // match on quadword 5?
ldq a3, -8(a0) // a3 = quadword 8 (source 1)
bne $at, 130f // if ne, miscompare quad 5
extql t2, a1, t2 // bytes from unaligned quad 6
extqh t3, a1, $at // bytes from unaligned quad 7
extql t3, a1, t3 // bytes from unaligned quad 7
bis t2, $at, t2 // t2 = quadword 6 (source 2)
xor t2, a4, $at // match on quadword 6?
bne $at, 132f // if ne, miscompare quad 6
extqh t4, a1, $at // bytes from unaligned quad 8
extql t4, a1, t4 // bytes from unaligned quad 8
bis t3, $at, t3 // t3 = quadword 7 (source 2)
xor t3, a5, $at // match on quadword 7?
bne $at, 134f // if ne, miscompare quad 7
extqh t1, a1, $at // bytes from unaligned quad 9
addq a1, 64, a1 // increment source 2 pointer
bis t4, $at, t4 // t4 = quadword 8 (source 2)
xor t4, a3, $at // match on quadword 8?
bne $at, 136f // if ne, miscompare quad 8
subq a2, 64, a2 // decrement number of bytes to compare
bne t0, 40b // if ne, more blocks to compare
.set at
//
// Compare quadwords
//
50:
srl a2, 3, t0 // t0 = number of quads to compare
beq t0, 70f // if eq, no quads to compare
ldq_u t1, 0(a1) // get unaligned quad 1 (source 2)
.set noat
60:
ldq_u t2, 7(a1) // get unaligned quad 2 (source 2)
ldq t3, 0(a0) // t3 = quadword 1 (source 1)
extql t1, a1, t1 // get bytes from unaligned quad 1
extqh t2, a1, $at // get bytes from unaligned quad 2
addq a1, 8, a1 // increment source 2 pointer
bis t1, $at, t1 // t1 = quadword 1 (source 2)
xor t1, t3, $at // match on quadword?
bne $at, 200f // if ne, miscompare
subq t0, 1, t0 // decrement quadwords to compare
addq a0, 8, a0 // increment source 1 pointer
subq a2, 8, a2 // decrement bytes to compare
bis t2, zero, t1 // save low quadword for next loop
bne t0, 60b // if ne, more quads to compare
.set at
//
// Compare bytes for final quadword
//
70:
beq a2, 90f // if eq, comparison complete
ldq t1, 0(a0) // get quadword from source 1
bis zero, zero, t0 // t0 = byte position to compare
.set noat
80:
ldq_u t2, 0(a1) // get unaligned quad from source 2
extbl t1, t0, t3 // t3 = byte from source 1
extbl t2, a1, t2 // t2 = byte from source 2
xor t3, t2, $at // match on byte?
bne $at, 100f // if ne, miscompare on byte
addq t0, 1, t0 // increment byte position
addq a1, 1, a1 // increment source 2 pointer
subq a2, 1, a2 // decrement bytes to compare
bne a2, 80b // if ne, more bytes to compare
.set at
//
// Successful full comparison
//
90:
ret zero, (ra) // return, v0 already set
//
// Miscompare on last quadword
//
100:
subq v0, a2, v0 // subtract bytes not compared
ret zero, (ra) // return
//
// Miscompare during the initial byte-by-byte compare
//
// v0 = total bytes to compare
// a2 = bytes remaining to compare
//
110:
subq v0, a2, v0 // bytes compared successfully
ret zero, (ra) // return
//
// Miscompare on 64-byte block compare
//
122:
subq a2, 8, a2 // miscompare on quad 2
br zero, 200f // finish in common code
124:
subq a2, 16, a2 // miscompare on quad 3
br zero, 200f // finish in common code
126:
subq a2, 24, a2 // miscompare on quad 4
br zero, 200f // finish in common code
130:
subq a2, 32, a2 // miscompare on quad 5
br zero, 200f // finish in common code
132:
subq a2, 40, a2 // miscompare on quad 6
br zero, 200f // finish in common code
134:
subq a2, 48, a2 // miscompare on quad 7
br zero, 200f // finish in common code
136:
subq a2, 56, a2 // miscompare on quad 8
br zero, 200f // finish in common code
//
// Miscompare, determine number of bytes that successfully compared
// $at = xor of relevant quads from sources, must be non-zero
// a2 = number of bytes left to compare
//
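// In C terms, a sketch of the count below (assuming 'mask' holds the
// CMPBGE result, where bit i is set iff byte i of the xor was zero):
//
//     int matched = 0;
//     while (mask & 1) {                          // count low-order equal bytes
//         mask >>= 1;
//         matched += 1;
//     }
//     // v0 = (total - remaining) + matched
//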
.set noat
200:
cmpbge zero, $at, $at // $at = mask of zero (equal) bytes
//
// look for the first clear bit in $at; its position is the
// number of the first byte that differed
//
bis zero, zero, t0 // bit position to look for clear
210:
blbc $at, 220f // if low clear, found difference
srl $at, 1, $at // check next bit
addq t0, 1, t0 // count bit position checked
br zero, 210b
220:
subq v0, a2, v0 // subtract bytes yet to compare
addq v0, t0, v0 // add bytes that matched on last quad
ret zero, (ra)
.set at
.end RtlCompareMemory
SBTTL("Move Memory")
//++
//
// VOID
// RtlMoveMemory (
// IN PVOID Destination,
// IN PVOID Source,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function moves memory either forward or backward, aligned or
// unaligned, in 64-byte blocks, followed by 8-byte blocks, followed
// by any remaining bytes.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the destination address of
// the move operation.
//
// Source (a1) - Supplies a pointer to the source address of the move
// operation.
//
// Length (a2) - Supplies the length, in bytes, of the memory to be moved.
//
// Return Value:
//
// None.
//
//--
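//
// For reference, a minimal C sketch of the overlap handling (the name
// RtlMoveMemorySketch is hypothetical; byte loops stand in for the
// block moves below):
//
//     VOID
//     RtlMoveMemorySketch (PVOID Destination, PVOID Source, ULONG Length)
//     {
//         PUCHAR d = (PUCHAR)Destination;
//         PUCHAR s = (PUCHAR)Source;
//
//         if ((s < d) && ((s + Length) > d)) {    // destructive overlap
//             while (Length != 0) {               // copy backward
//                 Length -= 1;
//                 d[Length] = s[Length];
//             }
//         } else {
//             ULONG i;
//             for (i = 0; i < Length; i += 1) {   // copy forward
//                 d[i] = s[i];
//             }
//         }
//     }
//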
LEAF_ENTRY(RtlMoveMemory)
beq a2, 80f // if eq, no bytes to move
//
// If the source address is less than the destination address and source
// address plus the length of the move is greater than the destination
// address, then the source and destination overlap such that the move
// must be performed backwards.
//
cmpult a0, a1, t0 // is destination less than source
bne t0, MoveForward // if ne [true], no overlap possible
addq a1, a2, t0 // compute source ending address
cmpult t0, a0, t1 // is source end less than dest.
beq t1, MoveBackward // if eq [false], overlap
//
// Move memory forward aligned and unaligned.
//
MoveForward: //
xor a0, a1, t0 // compare alignment bits
and t0, 0x7, t0 // isolate alignment comparison
bne t0, MoveForwardUnaligned // if ne, incompatible alignment
//
// Move memory forward aligned.
//
MoveForwardAligned: //
//
// Move bytes until source and destination are quadword aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
bne t0, 5f // if ne, not quad aligned
br zero, 20f // predicted taken
5:
ldq_u t2, 0(a0) // get unaligned quad from dest.
ldq_u t1, 0(a1) // get unaligned quadword from source
10:
beq a2, 15f // if eq, all bytes moved
extbl t1, t0, t3 // t3 = byte from source
insbl t3, t0, t3 // t3 = byte from source, in position
mskbl t2, t0, t2 // clear position in dest. quad
bis t2, t3, t2 // merge in byte from source
subq a2, 1, a2 // decrement bytes to move
addq t0, 1, t0 // increment byte within quad
cmpeq t0, 8, t3 // finished the quadword?
beq t3, 10b // if eq [false], do next byte
15:
stq_u t2, 0(a0) // store merged destination bytes
addq a0, 7, a0 // move to next quadword
bic a0, 7, a0 // aligned quadword
addq a1, 7, a1 // move to next quadword
bic a1, 7, a1 // aligned quadword
//
// Check for 64-byte block moves
//
20:
srl a2, 6, t0 // t0 = number of 64 byte blocks
beq t0, 40f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes
30:
ldq t1, 0(a1) // load 64 bytes from source
addq a0, 64, a0 // increment destination pointer
ldq v0, 56(a1) //
ldq a3, 32(a1) //
stq t1, -64(a0) // write to destination
ldq t2, 8(a1) // into volatile registers
ldq t3, 16(a1) //
ldq t4, 24(a1) //
subq t0, 1, t0 // decrement number of blocks
stq t2, -56(a0) //
ldq a4, 40(a1) //
stq t3, -48(a0) //
ldq a5, 48(a1) //
stq t4, -40(a0) //
addq a1, 64, a1 // increment source pointer
stq a3, -32(a0) //
stq a4, -24(a0) //
stq a5, -16(a0) //
stq v0, -8(a0) //
bne t0, 30b // if ne, more blocks to copy
//
// Copy quadwords
//
40:
srl a2, 3, t0 // t0 = number of quadwords to move
beq t0, 60f // if eq no quadwords to move
and a2, 8-1, a2 // a2 = residual bytes
50:
ldq t1, 0(a1) // load quadword from source
addq a1, 8, a1 // increment source pointer
stq t1, 0(a0) // store quadword to destination
addq a0, 8, a0 // increment destination pointer
subq t0, 1, t0 // decrement number of quadwords
bne t0, 50b // if ne, more quadwords to move
//
// Move final residual bytes
//
60:
beq a2, 80f // if eq, no more bytes to move
ldq t1, 0(a1) // get last source quadword
ldq t2, 0(a0) // get last dest. quadword
bis zero, zero, t0 // t0 = next byte number to move
70:
extbl t1, t0, t3 // extract byte from source
insbl t3, t0, t3 // t3 = source byte, in position
mskbl t2, t0, t2 // clear byte position for dest.
bis t2, t3, t2 // merge in source byte
addq t0, 1, t0 // increment byte position
subq a2, 1, a2 // decrement bytes to move
bne a2, 70b // if ne => more bytes to move
stq t2, 0(a0) // store merged data
//
// Finish aligned MoveForward
//
80:
ret zero, (ra) // return
//
// Move memory forward unaligned.
//
MoveForwardUnaligned: //
//
// Move bytes until the destination is aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
beq t0, 100f // if eq, destination quad aligned
ldq_u t2, 0(a0) // get unaligned quad from dest
90:
beq a2, 95f // if eq no more bytes to move
ldq_u t1, 0(a1) // get unaligned quad from source
extbl t1, a1, t1 // extract source byte
insbl t1, t0, t1 // t1 = source byte, in position
mskbl t2, t0, t2 // clear byte position in dest.
bis t2, t1, t2 // merge in source byte
addq t0, 1, t0 // increment byte position
addq a1, 1, a1 // increment source pointer
subq a2, 1, a2 // decrement bytes to move
cmpeq t0, 8, t3 // t0 = 8? => quad finished
beq t3, 90b // if eq [false], more bytes to move
95:
stq_u t2, 0(a0) // store merged quadword
addq a0, 7, a0 // increment to next quad
bic a0, 7, a0 // align next quadword
//
// Check for 64-byte blocks to move
//
100:
srl a2, 6, t0 // t0 = number of blocks to move
beq t0, 120f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes to move
ldq_u t1, 0(a1) // t1 = first unaligned quad
110:
// get source data and merge it
// as we go
ldq_u t2, 7(a1) // t2 = second unaligned quad
extql t1, a1, t1 // extract applicable bytes from t1
extqh t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quad #1
ldq_u t3, 15(a1) // t3 = third unaligned quad
extql t2, a1, t2 // extract applicable bytes from t2
extqh t3, a1, v0 // extract applicable bytes from t3
stq t1, 0(a0) // store quad #1
bis t2, v0, t2 // t2 = quad #2
ldq_u t4, 23(a1) // t4 = fourth unaligned quad
extql t3, a1, t3 // extract applicable bytes from t3
extqh t4, a1, v0 // extract applicable bytes from t4
stq t2, 8(a0) // store quad #2
bis t3, v0, t3 // t3 = quad #3
ldq_u t5, 31(a1) // t5 = fifth unaligned quad
extql t4, a1, t4 // extract applicable bytes from t4
extqh t5, a1, v0 // extract applicable bytes from t5
stq t3, 16(a0) // store quad #3
bis t4, v0, t4 // t4 = quad #4
ldq_u a3, 39(a1) // a3 = sixth unaligned quad
extql t5, a1, t5 // extract applicable bytes from t5
extqh a3, a1, v0 // extract applicable bytes from a3
stq t4, 24(a0) // store quad #4
bis t5, v0, t5 // t5 = quad #5
ldq_u a4, 47(a1) // a4 = seventh unaligned quad
extql a3, a1, a3 // extract applicable bytes from a3
extqh a4, a1, v0 // extract applicable bytes from a4
stq t5, 32(a0) // store quad #5
bis a3, v0, a3 // a3 = quad #6
ldq_u a5, 55(a1) // a5 = eighth unaligned quad
extql a4, a1, a4 // extract applicable bytes from a4
extqh a5, a1, v0 // extract applicable bytes from a5
stq a3, 40(a0) // store quad #6
bis a4, v0, a4 // a4 = quad #7
ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next
extql a5, a1, a5 // extract applicable bytes from a5
extqh t1, a1, v0 // extract applicable bytes from t1
stq a4, 48(a0) // store quad #7
bis a5, v0, a5 // a5 = quad #8
addq a1, 64, a1 // increment source pointer
stq a5, 56(a0) // store quad #8
addq a0, 64, a0 // increment destination pointer
subq t0, 1, t0 // decrement number of blocks
bne t0, 110b // if ne, more blocks to move
//
// Move unaligned source quads to aligned destination quads
//
120:
srl a2, 3, t0 // t0 = number of quads to move
beq t0, 140f // if eq no quads to move
and a2, 8-1, a2 // a2 = residual bytes
ldq_u t1, 0(a1) // t1 = first unaligned quad
130:
ldq_u t2, 7(a1) // t2 = second unaligned quad
addq a0, 8, a0 // increment destination pointer
extql t1, a1, t1 // extract applicable bytes from t1
extqh t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quadword of data
stq t1, -8(a0) // store data to destination
addq a1, 8, a1 // increment source pointer
subq t0, 1, t0 // decrement quads to move
bis t2, zero, t1 // t1 = first of next unaligned pair
bne t0, 130b // if ne, more quads to move
//
// Move remaining bytes to final quadword
//
140:
beq a2, 160f // if eq no more bytes to move
ldq t2, 0(a0) // t2 = destination quadword
bis zero, zero, t3 // t3 = position for next insertion
150:
ldq_u t1, 0(a1) // get unaligned source quad
extbl t1, a1, t1 // t1 = source byte
insbl t1, t3, t1 // t1 = source byte, in position
mskbl t2, t3, t2 // clear byte in destination
bis t2, t1, t2 // merge in source byte
addq a1, 1, a1 // increment source pointer
subq a2, 1, a2 // decrement bytes to move
addq t3, 1, t3 // increment destination position
bne a2, 150b // more bytes to move
stq t2, 0(a0) // store merged data
//
// Finish unaligned MoveForward
//
160:
ret zero, (ra) // return
//
// Move memory backward.
//
MoveBackward: //
addq a0, a2, a0 // compute ending destination address
addq a1, a2, a1 // compute ending source address
subq a0, 1, a0 // point to last destination byte
subq a1, 1, a1 // point to last source byte
xor a0, a1, t0 // compare alignment bits
and t0, 0x7, t0 // isolate alignment comparison
bne t0, MoveBackwardUnaligned // if ne, incompatible alignment
//
// Move memory backward aligned.
//
MoveBackwardAligned: //
//
// Move bytes until source and destination are quadword aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
cmpeq t0, 7, t1 // last byte position 7?
beq t1, 5f // if eq [false], not quad aligned
subq a0, 7, a0 // point to beginning of last quad
subq a1, 7, a1 // point to beginning of last quad
br zero, 30f // predicted taken
5:
ldq_u t1, 0(a0) // get unaligned quad from dest.
ldq_u t2, 0(a1) // get unaligned quad from source
10:
beq a2, 20f // if eq, all bytes moved
extbl t2, t0, t3 // t3 = byte from source
insbl t3, t0, t3 // t3 = byte from source, in position
mskbl t1, t0, t1 // clear position in destination
bis t1, t3, t1 // merge in byte from source
subq a2, 1, a2 // decrement bytes to move
subq t0, 1, t0 // decrement byte within quadword
cmplt t0, zero, t3 // finished the quadword?
beq t3, 10b // if eq [false], do next byte
20:
stq_u t1, 0(a0) // store merged destination bytes
subq a0, 8, a0 // move to previous quadword
bic a0, 7, a0 // aligned quadword
subq a1, 8, a1 // move to previous quadword
bic a1, 7, a1 // aligned quadword
//
// Check for 64-byte block moves
//
30:
srl a2, 6, t0 // t0 = number of 64 byte blocks
beq t0, 50f // if eq, no blocks to move
and a2, 64-1, a2 // a2 = residual bytes
40:
ldq t1, 0(a1) // load 64 bytes from source
subq a0, 64, a0 // decrement destination pointer
ldq v0, -56(a1) //
ldq a3, -32(a1) //
stq t1, 64(a0) // write to destination
ldq t2, -8(a1) // into volatile registers
ldq a5, -48(a1) //
ldq a4, -40(a1) //
stq t2, 56(a0) //
ldq t3, -16(a1) //
ldq t4, -24(a1) //
subq a1, 64, a1 // decrement source pointer
stq t3, 48(a0) //
stq t4, 40(a0) //
stq a3, 32(a0) //
subq t0, 1, t0 // decrement number of blocks
stq a4, 24(a0) //
stq a5, 16(a0) //
stq v0, 8(a0) //
bne t0, 40b // if ne, more blocks to copy
//
// Copy quadwords
//
50:
srl a2, 3, t0 // t0 = number of quadwords to move
beq t0, 70f // if eq no quadwords to move
and a2, 8-1, a2 // a2 = residual bytes
60:
ldq t1, 0(a1) // load quadword from source
subq a1, 8, a1 // decrement source pointer
stq t1, 0(a0) // store quadword to destination
subq a0, 8, a0 // decrement destination pointer
subq t0, 1, t0 // decrement quadwords to move
bne t0, 60b // if ne, more quadwords to move
//
// Move final residual bytes
//
70:
beq a2, 90f // if eq, no more bytes to move
ldq t1, 0(a1) // get last source quadword
ldq t2, 0(a0) // get last destination quadword
bis zero, 7, t0 // t0 = next byte number to move
80:
extbl t1, t0, t3 // extract byte from source
insbl t3, t0, t3 // t3 = source byte, in position
mskbl t2, t0, t2 // clear byte position for dest.
bis t2, t3, t2 // merge in source byte
subq t0, 1, t0 // decrement byte position
subq a2, 1, a2 // decrement bytes to move
bne a2, 80b // if ne, more bytes to move
stq t2, 0(a0) // write destination data
//
// Finish aligned MoveBackward
//
90:
ret zero, (ra) // return
//
// Move memory backward unaligned.
//
MoveBackwardUnaligned: //
//
// Move bytes until the destination is aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
cmpeq t0, 7, t1 // last byte of a quadword
beq t1, 95f // if eq[false], not aligned
subq a0, 7, a0 // align pointer to beginning of quad
br zero, 120f //
95:
ldq_u t2, 0(a0) // get unaligned quad from dest.
100:
beq a2, 110f // if eq, no more bytes to move
ldq_u t1, 0(a1) // get unaligned quad from source
extbl t1, a1, t1 // extract source byte
insbl t1, t0, t1 // t1 = source byte in position
mskbl t2, t0, t2 // clear byte position in dest.
bis t2, t1, t2 // merge source byte
subq t0, 1, t0 // decrement byte position
subq a1, 1, a1 // decrement source pointer
subq a2, 1, a2 // decrement number of bytes to move
cmplt t0, zero, t3 // t0 < 0? => quad finished
beq t3, 100b // if eq [false], more bytes to move
110:
stq_u t2, 0(a0) // store merged quadword
subq a0, 8, a0 // decrement dest. to previous quad
bic a0, 7, a0 // align previous quadword
//
// Check for 64-byte blocks to move
//
120:
srl a2, 6, t0 // t0 = number of blocks to move
subq a1, 7, a1 // point to beginning of last quad
beq t0, 140f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes to move
ldq_u t1, 7(a1) // t1 = first unaligned quad
130:
// get source data and merge it
// as we go
ldq_u t2, 0(a1) // t2 = second unaligned quad
extqh t1, a1, t1 // extract applicable bytes from t1
extql t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quad #1
ldq_u t3, -8(a1) // t3 = third unaligned quad
extqh t2, a1, t2 // extract applicable bytes from t2
extql t3, a1, v0 // extract applicable bytes from t3
stq t1, 0(a0) // store quad #1
bis t2, v0, t2 // t2 = quad #2
ldq_u t4, -16(a1) // t4 = fourth unaligned quad
extqh t3, a1, t3 // extract applicable bytes from t3
extql t4, a1, v0 // extract applicable bytes from t4
stq t2, -8(a0) // store quad #2
bis t3, v0, t3 // t3 = quad #3
ldq_u t5, -24(a1) // t5 = fifth unaligned quad
extqh t4, a1, t4 // extract applicable bytes from t4
extql t5, a1, v0 // extract applicable bytes from t5
stq t3, -16(a0) // store quad #3
bis t4, v0, t4 // t4 = quad #4
ldq_u a3, -32(a1) // a3 = sixth unaligned quad
extqh t5, a1, t5 // extract applicable bytes from t5
extql a3, a1, v0 // extract applicable bytes from a3
stq t4, -24(a0) // store quad #4
bis t5, v0, t5 // t5 = quad #5
ldq_u a4, -40(a1) // a4 = seventh unaligned quad
extqh a3, a1, a3 // extract applicable bytes from a3
extql a4, a1, v0 // extract applicable bytes from a4
stq t5, -32(a0) // store quad #5
bis a3, v0, a3 // a3 = quad #6
ldq_u a5, -48(a1) // a5 = eighth unaligned quad
extqh a4, a1, a4 // extract applicable bytes from a4
extql a5, a1, v0 // extract applicable bytes from a5
stq a3, -40(a0) // store quad #6
bis a4, v0, a4 // a4 = quad #7
ldq_u t1, -56(a1) // t1 = ninth unaligned = 1st of next
extqh a5, a1, a5 // extract applicable bytes from a5
extql t1, a1, v0 // extract applicable bytes from t1
stq a4, -48(a0) // store quad #7
bis a5, v0, a5 // a5 = quad #8
subq a1, 64, a1 // decrement source pointer
stq a5, -56(a0) // store quad #8
subq a0, 64, a0 // decrement destination pointer
subq t0, 1, t0 // decrement number of blocks
bne t0, 130b // if ne, more blocks to move
//
// Move unaligned source quads to aligned destination quads
//
140:
srl a2, 3, t0 // t0 = number of quads to move
beq t0, 160f // if eq no quads to move
and a2, 8-1, a2 // a2 = residual bytes
ldq_u t1, 7(a1) // t1 = first unaligned quad
150:
ldq_u t2, 0(a1) // t2 = second unaligned quad
subq a0, 8, a0 // decrement destination pointer
extqh t1, a1, t1 // extract applicable bytes from t1
extql t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quadword of data
stq t1, 8(a0) // store data to destination
subq a1, 8, a1 // decrement source pointer
subq t0, 1, t0 // decrement quads to move
bis t2, zero, t1 // t1 = first of next unaligned pair
bne t0, 150b // if ne, more quads to move
//
// Move remaining bytes to final quadword
//
160:
beq a2, 180f // if eq, no more bytes to move
ldq t2, 0(a0) // t2 = destination quadword
bis zero, 7, t0 // t0 = position for next insertion
170:
subq a1, 1, a1 // decrement source pointer
ldq_u t1, 8(a1) // get unaligned source quad
extbl t1, a1, t1 // t1 = source byte
insbl t1, t0, t1 // t1 = source byte, in position
mskbl t2, t0, t2 // clear byte position
bis t2, t1, t2 // merge in source byte
subq t0, 1, t0 // decrement byte position for dest.
subq a2, 1, a2 // decrement bytes to move
bne a2, 170b // if ne, more bytes to move
stq t2, 0(a0) //
//
// Finish unaligned MoveBackward
//
180:
ret zero, (ra) // return
.end RtlMoveMemory
SBTTL("Zero Memory")
//++
//
// VOID
// RtlZeroMemory (
// IN PVOID Destination,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function zeros memory by first aligning the destination address to
// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
// blocks, followed by any remaining bytes.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to zero.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
//
// Return Value:
//
// None.
//
//--
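//
// In effect (a sketch, in terms of the C prototypes in this module):
//
//     RtlZeroMemory(Destination, Length);
//     // behaves like:
//     RtlFillMemory(Destination, Length, 0);
//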
LEAF_ENTRY(RtlZeroMemory)
bis zero, zero, a2 // set fill pattern
br zero, RtlpFillMemory //
SBTTL("Fill Memory")
//++
//
// VOID
// RtlFillMemory (
// IN PVOID Destination,
// IN ULONG Length,
// IN UCHAR Fill
// )
//
// Routine Description:
//
// This function fills memory by first aligning the destination address to
// a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
// blocks, followed by any remaining bytes.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to fill.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
// Fill (a2) - Supplies the fill byte.
//
// N.B. The alternate entry memset expects the length and fill arguments
// to be reversed. It also returns the Destination pointer.
//
// Return Value:
//
// None.
//
//--
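//
// The fill byte is replicated into a quadword pattern by successive
// doubling. A C sketch of the replication (assuming a 64-bit ULONGLONG):
//
//     ULONGLONG Pattern = Fill & 0xff;
//     Pattern |= Pattern << 8;                    // fill word
//     Pattern |= Pattern << 16;                   // fill longword
//     Pattern |= Pattern << 32;                   // fill quadword
//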
ALTERNATE_ENTRY(memset)
bis a0, zero, v0 // set return value
bis a1, zero, a3 // swap length and fill arguments
bis a2, zero, a1 //
bis a3, zero, a2 //
ALTERNATE_ENTRY(RtlFillMemory)
and a2, 0xff, a2 // clear excess bits
sll a2, 8, t0 // duplicate fill byte
bis a2, t0, a2 // generate fill word
sll a2, 16, t0 // duplicate fill word
bis a2, t0, a2 // generate fill longword
sll a2, 32, t0 // duplicate fill longword
bis a2, t0, a2 // generate fill quadword
.align 3 // ensure quadword aligned target
//
// Fill memory with the pattern specified in register a2.
//
RtlpFillMemory: //
//
// Align destination to quadword
//
beq a1, 80f // anything to fill? (paranoia)
and a0, 8-1, t0 // t0 = unaligned bits
bne t0, 5f // if ne, then not quad aligned
br zero, 20f // if eq, then quad aligned
5:
ldq_u t1, 0(a0) // get unaligned quadword
// for first group of bytes
10:
beq a1, 15f // if eq no more bytes to fill
insbl a2, t0, t2 // get fill byte into position
mskbl t1, t0, t1 // clear byte for fill
bis t1, t2, t1 // put in fill byte
addq t0, 1, t0 // increment to next byte position
subq a1, 1, a1 // decrement bytes to fill
cmpeq t0, 8, t2 // t0 = 8?
beq t2, 10b // if eq [false] more bytes to do
15:
stq_u t1, 0(a0) // store modified bytes
addq a0, 7, a0 // move a0 to next quadword
bic a0, 7, a0 // align a0 to quadword
//
// Check for 64-byte blocks
//
20:
srl a1, 6, t0 // t0 = number of 64 byte blocks
beq t0, 40f // if eq then no 64 byte blocks
and a1, 64-1, a1 // a1 = residual bytes to fill
30:
stq a2, 0(a0) // store 64 bytes
stq a2, 8(a0) //
stq a2, 16(a0) //
stq a2, 24(a0) //
stq a2, 32(a0) //
stq a2, 40(a0) //
stq a2, 48(a0) //
stq a2, 56(a0) //
subq t0, 1, t0 // decrement blocks remaining
addq a0, 64, a0 // increment destination pointer
bne t0, 30b // more blocks to write
//
// Fill aligned quadwords
//
40:
srl a1, 3, t0 // t0 = number of quadwords
bne t0, 55f // if ne quadwords left to fill
br zero, 60f // if eq no quadwords left
55:
and a1, 8-1, a1 // a1 = residual bytes to fill
50:
stq a2, 0(a0) // store quadword
subq t0, 1, t0 // decrement quadwords remaining
addq a0, 8, a0 // next quadword
bne t0, 50b // more quadwords to write
//
// Fill bytes for last quadword
//
60:
bne a1, 65f // if ne bytes remain to be filled
br zero, 80f // if eq no more bytes to fill
65:
ldq t1, 0(a0) // get last quadword
bis zero, zero, t0 // t0 = byte position to start fill
70:
beq a1, 75f // if eq, no more bytes to fill
insbl a2, t0, t2 // get fill byte into position
mskbl t1, t0, t1 // clear fill byte position
bis t1, t2, t1 // insert fill byte
addq t0, 1, t0 // increment byte within quad
subq a1, 1, a1 // decrement bytes to fill
cmpeq t0, 8, t3 // t0 = 8? => finished quad
beq t3, 70b // if eq [false] more bytes to fill
75:
stq t1, 0(a0) // write merged quadword
//
// Finish up
//
80:
ret zero, (ra) // return
.end RtlZeroMemory
SBTTL("Fill Memory Ulong")
//++
//
// VOID
// RtlFillMemoryUlong (
// IN PVOID Destination,
// IN ULONG Length,
// IN ULONG Pattern
// )
//
// Routine Description:
//
// This function fills memory with the specified longword pattern by
// filling 64-byte blocks followed by 8-byte blocks and finally
// 4-byte blocks.
//
// N.B. This routine assumes that the destination address is aligned
// on a longword boundary and that the length is an even multiple
// of longwords.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to fill.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
// Pattern (a2) - Supplies the fill pattern.
//
// Return Value:
//
// None.
//
//--
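//
// A C sketch of the pattern replication below (assuming a 64-bit
// ULONGLONG; the longword pattern is doubled into a quadword):
//
//     ULONGLONG Quad = (ULONG)Pattern;            // discard any upper bits
//     Quad |= Quad << 32;                         // pattern in both halves
//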
LEAF_ENTRY(RtlFillMemoryUlong)
bic a1, 3, a1 // make sure length is an even number
// of longwords
sll a2, 32, a3 // a3 = long pattern in upper 32 bits
srl a3, 32, t0 // clear upper bits, pattern in lower 32
bis a3, t0, a3 // a3 = quad version of fill pattern
//
// Make destination address quad-aligned
//
and a0, 4, t0 // is a0 quad aligned?
beq t0, 10f // if eq, then a0 quad aligned
stl a2, 0(a0) // fill first longword
addq a0, 4, a0 // quad align a0
subq a1, 4, a1 // bytes remaining to store
//
// Check for 64-byte blocks to fill
//
10:
srl a1, 6, t0 // t0 = # 64-byte blocks to fill
beq t0, 30f // if eq no 64 byte blocks
and a1, 64-1, a1 // a1 = residual bytes
20:
stq a3, 0(a0) // store 64 bytes
stq a3, 8(a0) //
stq a3, 16(a0) //
stq a3, 24(a0) //
stq a3, 32(a0) //
stq a3, 40(a0) //
stq a3, 48(a0) //
stq a3, 56(a0) //
subq t0, 1, t0 // t0 = blocks remaining
addq a0, 64, a0 // increment address pointer
bne t0, 20b // if ne more blocks to fill
//
// Fill 8 bytes at a time while we can, a1 = bytes remaining
//
30:
srl a1, 3, t0 // t0 = # quadwords to fill
beq t0, 50f // if eq no quadwords left
and a1, 8-1, a1 // a1 = residual bytes
40:
stq a3, 0(a0) // store quadword
subq t0, 1, t0 // t0 = quadwords remaining
addq a0, 8, a0 // increment address pointer
bne t0, 40b // if ne more quadwords to fill
//
// Fill last 4 bytes
//
50:
beq a1, 60f // if eq no longwords remain
stl a2, 0(a0) // fill last longword
//
// Finish up
//
60:
ret zero, (ra) // return to caller
.end RtlFillMemoryUlong
SBTTL("Copy Memory With Byte Granularity")
//++
//
// VOID
// RtlCopyBytes (
// IN PVOID Destination,
// IN PVOID Source,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function copies non-overlapping memory, aligned or unaligned, in
// 64-byte blocks, followed by 8-byte blocks, followed by any remaining
// bytes. Unlike RtlCopyMemory or RtlMoveMemory, the copy is done such
// that byte granularity is assured for all platforms.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the destination address of
// the move operation.
//
// Source (a1) - Supplies a pointer to the source address of the move
// operation.
//
// Length (a2) - Supplies the length, in bytes, of the memory to be moved.
//
// Return Value:
//
// None.
//
//--
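//
// Byte granularity means no store ever touches bytes outside the
// requested range: partial quadwords are merged under a byte mask inside
// a load-locked/store-conditional sequence. A C sketch of the merge step
// (MergeUnderMask is a hypothetical helper; LDQ_L/STQ_C themselves have
// no direct C equivalent):
//
//     ULONGLONG
//     MergeUnderMask (ULONGLONG Dest, ULONGLONG Source, ULONGLONG ByteMask)
//     {
//         return (Dest & ~ByteMask) | (Source & ByteMask);
//     }
//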
LEAF_ENTRY(RtlCopyBytes)
//
// Move memory forward aligned and unaligned.
//
xor a0, a1, t0 // compare alignment bits
and t0, 0x7, t0 // isolate alignment comparison
bne t0, CopyForwardUnaligned // if ne, incompatible alignment
//
// Source and Destination buffers have the same alignment. Move
// bytes until done or source and destination are quadword aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
bne t0, 5f // if ne, not quad aligned
br zero, 20f // predicted taken
5:
bis zero, zero, t1 // t1 = destination byte zap mask
bis zero, 1, t2
sll t2, t0, t2 // t2 = next bit to set in zap mask
10:
beq a2, 15f // if eq, all bits set
bis t1, t2, t1 // set bit in zap mask
sll t2, 1, t2 // set next higher bit for zap mask
subq a2, 1, a2 // decrement bytes to move
addq t0, 1, t0 // increment byte within quad
cmpeq t0, 8, t3 // finished the quadword?
beq t3, 10b // if eq [false], do next byte
15:
ldq_u t2, 0(a1) // get unaligned quadword from source
zapnot t2, t1, t2 // keep only the source bytes to copy
bic a0, 7, a3 // a3 = quadword base of destination
retry1:
ldq_l t0, 0(a3) // load destination quadword
zap t0, t1, t0 // clear destination bytes
or t0, t2, t0 // merge in bytes from source
stq_c t0, 0(a3) // store merged quadword conditional
beq t0, retry1f // if eq, retry failed interlock
addq a0, 7, a0 // move to next quadword
bic a0, 7, a0 // aligned quadword
addq a1, 7, a1 // move to next quadword
bic a1, 7, a1 // aligned quadword
//
// Check for 64-byte block moves
//
20:
srl a2, 6, t0 // t0 = number of 64 byte blocks
beq t0, 40f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes
30:
ldq t1, 0(a1) // load 64 bytes from source
addq a0, 64, a0 // increment destination pointer
ldq v0, 56(a1) //
ldq a3, 32(a1) //
stq t1, -64(a0) // write to destination
ldq t2, 8(a1) // into volatile registers
ldq t3, 16(a1) //
ldq t4, 24(a1) //
subq t0, 1, t0 // decrement number of blocks
stq t2, -56(a0) //
ldq a4, 40(a1) //
stq t3, -48(a0) //
ldq a5, 48(a1) //
stq t4, -40(a0) //
addq a1, 64, a1 // increment source pointer
stq a3, -32(a0) //
stq a4, -24(a0) //
stq a5, -16(a0) //
stq v0, -8(a0) //
bne t0, 30b // if ne, more blocks to copy
//
// Copy quadwords
//
40:
srl a2, 3, t0 // t0 = number of quadwords to move
beq t0, 60f // if eq no quadwords to move
and a2, 8-1, a2 // a2 = residual bytes
50:
ldq t1, 0(a1) // load quadword from source
addq a1, 8, a1 // increment source pointer
stq t1, 0(a0) // store quadword to destination
addq a0, 8, a0 // increment destination pointer
subq t0, 1, t0 // decrement number of quadwords
bne t0, 50b // if ne, more quadwords to move
//
// Move final residual bytes
//
60:
beq a2, 80f // if eq, no more bytes to move
mov a2, t0 // t0 = number of bytes to move
mov -1, t1 // t1 = bit mask
sll t0, 3, t0 // # of bytes to # of bits
srl t1, t0, t1 // clear t0 bits
sll t1, t0, t0 // move it back
ldq t1, 0(a1) // get last source quadword
bic t1, t0, t1 // clear bytes not copied
not t0, t0 // complement to clear destination
retry2:
ldq_l t2, 0(a0) // get last destination quadword locked
bic t2, t0, t2 // clear bytes to be copied
bis t2, t1, t2 // move bytes from source
stq_c t2, 0(a0) // store merged quadword conditional
beq t2, retry2f // if eq, retry failed interlock
//
// Finish aligned MoveForward
//
80:
ret zero, (ra) // return
//
// Move memory forward unaligned.
//
CopyForwardUnaligned: //
//
// Move bytes until the destination is aligned
//
and a0, 0x7, t0 // t0 = unaligned bits
beq t0, 100f // if eq, destination quad aligned
bis zero, zero, t1 // t1 = destination byte zap mask
bis zero, 1, t2
sll t2, t0, t2 // t2 = next bit to set in zap mask
mov zero, t4 // assemble destination bytes here
90:
beq a2, 95f // if eq no more bytes to move
bis t1, t2, t1 // set bit in zap mask
sll t2, 1, t2 // set next higher bit for zap mask
ldq_u t5, 0(a1) // get unaligned quad from source
extbl t5, a1, t5 // extract source byte
insbl t5, t0, t5 // t5 = source byte, in position
or t4, t5, t4 // merge in source byte
addq t0, 1, t0 // increment byte position
addq a1, 1, a1 // increment source pointer
subq a2, 1, a2 // decrement bytes to move
cmpeq t0, 8, t3 // t0 = 8? => quad finished
beq t3, 90b // if eq [false], more bytes to move
95:
bic a0, 0x7, a3 // a3 = quadword base of destination
retry3:
ldq_l t0, 0(a3) // load destination quadword
zap t0, t1, t0 // clear destination bytes
or t0, t4, t0 // merge in bytes from source
stq_c t0, 0(a3) // store merged quadword conditional
beq t0, retry3f // if eq, retry failed interlock
addq a0, 7, a0 // increment to next quad
bic a0, 7, a0 // align next quadword
//
// Check for 64-byte blocks to move
//
100:
srl a2, 6, t0 // t0 = number of blocks to move
beq t0, 120f // if eq no blocks to move
and a2, 64-1, a2 // a2 = residual bytes to move
ldq_u t1, 0(a1) // t1 = first unaligned quad
110:
// get source data and merge it
// as we go
ldq_u t2, 7(a1) // t2 = second unaligned quad
extql t1, a1, t1 // extract applicable bytes from t1
extqh t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quad #1
ldq_u t3, 15(a1) // t3 = third unaligned quad
extql t2, a1, t2 // extract applicable bytes from t2
extqh t3, a1, v0 // extract applicable bytes from t3
stq t1, 0(a0) // store quad #1
bis t2, v0, t2 // t2 = quad #2
ldq_u t4, 23(a1) // t4 = fourth unaligned quad
extql t3, a1, t3 // extract applicable bytes from t3
extqh t4, a1, v0 // extract applicable bytes from t4
stq t2, 8(a0) // store quad #2
bis t3, v0, t3 // t3 = quad #3
ldq_u t5, 31(a1) // t5 = fifth unaligned quad
extql t4, a1, t4 // extract applicable bytes from t4
extqh t5, a1, v0 // extract applicable bytes from t5
stq t3, 16(a0) // store quad #3
bis t4, v0, t4 // t4 = quad #4
ldq_u a3, 39(a1) // a3 = sixth unaligned quad
extql t5, a1, t5 // extract applicable bytes from t5
extqh a3, a1, v0 // extract applicable bytes from a3
stq t4, 24(a0) // store quad #4
bis t5, v0, t5 // t5 = quad #5
ldq_u a4, 47(a1) // a4 = seventh unaligned quad
extql a3, a1, a3 // extract applicable bytes from a3
extqh a4, a1, v0 // extract applicable bytes from a4
stq t5, 32(a0) // store quad #5
bis a3, v0, a3 // a3 = quad #6
ldq_u a5, 55(a1) // a5 = eighth unaligned quad
extql a4, a1, a4 // extract applicable bytes from a4
extqh a5, a1, v0 // extract applicable bytes from a5
stq a3, 40(a0) // store quad #6
bis a4, v0, a4 // a4 = quad #7
ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next
extql a5, a1, a5 // extract applicable bytes from a5
extqh t1, a1, v0 // extract applicable bytes from t1
stq a4, 48(a0) // store quad #7
bis a5, v0, a5 // a5 = quad #8
addq a1, 64, a1 // increment source pointer
stq a5, 56(a0) // store quad #8
addq a0, 64, a0 // increment destination pointer
subq t0, 1, t0 // decrement number of blocks
bne t0, 110b // if ne, more blocks to move
//
// Move unaligned source quads to aligned destination quads
//
120:
srl a2, 3, t0 // t0 = number of quads to move
beq t0, 140f // if eq no quads to move
and a2, 8-1, a2 // a2 = residual bytes
ldq_u t1, 0(a1) // t1 = first unaligned quad
130:
ldq_u t2, 7(a1) // t2 = second unaligned quad
addq a0, 8, a0 // increment destination pointer
extql t1, a1, t1 // extract applicable bytes from t1
extqh t2, a1, v0 // extract applicable bytes from t2
bis t1, v0, t1 // t1 = quadword of data
stq t1, -8(a0) // store data to destination
addq a1, 8, a1 // increment source pointer
subq t0, 1, t0 // decrement quads to move
bis t2, zero, t1 // t1 = first of next unaligned pair
bne t0, 130b // if ne, more quads to move
//
// Move remaining bytes to final quadword
//
140:
beq a2, 160f // if eq no more bytes to move
mov zero, t3 // t3 = position for next insertion
mov zero, t4 // assemble destination bytes here
mov a2, t0 // t0 = number of bytes to move
mov -1, t1 // t1 = bit mask
sll t0, 3, t0 // # of bytes to # of bits
srl t1, t0, t1 // clear t0 bits
sll t1, t0, t0 // move it back
not t0, t0 // complement for destination clear mask
150:
ldq_u t1, 0(a1) // get unaligned source quad
extbl t1, a1, t1 // t1 = source byte
insbl t1, t3, t1 // t1 = source byte, in position
bis t4, t1, t4 // merge in source byte
addq a1, 1, a1 // increment source pointer
subq a2, 1, a2 // decrement bytes to move
addq t3, 1, t3 // increment destination position
bne a2, 150b // more bytes to move
retry4:
ldq_l t2, 0(a0) // get last destination quadword locked
bic t2, t0, t2 // clear bytes to be copied
bis t2, t4, t2 // move bytes from source
stq_c t2, 0(a0) // store merged quadword conditional
beq t2, retry4f // if eq, retry failed interlock
//
// Finish unaligned MoveForward
//
160:
ret zero, (ra) // return
//
// Out of line branches for failed store conditional.
// Don't need to restore anything, just try again.
//
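// The retry shape, as a C11 atomics analogue (an approximation only;
// MergeBytesAtomic and its parameters are hypothetical, and LDQ_L/STQ_C
// are not compare-and-swap, but the failure/retry structure is the same,
// and a weak CAS may fail spuriously just as STQ_C may):
//
//     #include <stdatomic.h>
//
//     void
//     MergeBytesAtomic (_Atomic unsigned long long *Quad,
//                       unsigned long long Mask,
//                       unsigned long long Bytes)
//     {
//         unsigned long long Old = atomic_load(Quad);
//         unsigned long long New;
//
//         do {
//             New = (Old & ~Mask) | (Bytes & Mask);   // merge under mask
//         } while (!atomic_compare_exchange_weak(Quad, &Old, New));
//     }
//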
retry1f:
br retry1
retry2f:
br retry2
retry3f:
br retry3
retry4f:
br retry4
.end RtlCopyBytes
SBTTL("Zero Bytes")
//++
//
// VOID
// RtlZeroBytes (
// IN PVOID Destination,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function zeros memory by first aligning the destination address to
// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
// blocks, followed by any remaining bytes. Unlike RtlZeroMemory, the zeroing
// is done such that byte granularity is assured for all platforms.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to zero.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
//
// Return Value:
//
// None.
//
//--
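//
// In effect (a sketch, in terms of the C prototypes in this module):
//
//     RtlZeroBytes(Destination, Length);
//     // behaves like:
//     RtlFillBytes(Destination, Length, 0);
//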
LEAF_ENTRY(RtlZeroBytes)
bis zero, zero, a2 // set fill pattern
br zero, RtlpFillBytes //
SBTTL("Fill Bytes")
//++
//
// VOID
// RtlFillBytes (
// IN PVOID Destination,
// IN ULONG Length,
// IN UCHAR Fill
// )
//
// Routine Description:
//
// This function fills memory by first aligning the destination address to
// a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
// blocks, followed by any remaining bytes. Unlike RtlFillMemory, the fill
// is done such that byte granularity is assured for all platforms.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to fill.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
// Fill (a2) - Supplies the fill byte.
//
// Return Value:
//
// None.
//
//--
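//
// A usage sketch (the buffer is hypothetical; byte granularity guarantees
// that the bytes adjacent to the filled range are never rewritten, even
// transiently):
//
//     UCHAR Shared[16];
//
//     RtlFillBytes(&Shared[4], 8, 0x5a);          // touches only Shared[4..11]
//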
ALTERNATE_ENTRY(RtlFillBytes)
and a2, 0xff, a2 // clear excess bits
sll a2, 8, t0 // duplicate fill byte
bis a2, t0, a2 // generate fill word
sll a2, 16, t0 // duplicate fill word
bis a2, t0, a2 // generate fill longword
sll a2, 32, t0 // duplicate fill longword
bis a2, t0, a2 // generate fill quadword
.align 3 // ensure quadword aligned target
//
// Fill memory with the pattern specified in register a2.
//
RtlpFillBytes: //
//
// Align destination to quadword
//
beq a1, 80f // anything to fill? (paranoia)
and a0, 8-1, t0 // t0 = unaligned bits
bne t0, 5f // if ne, then not quad aligned
br zero, 20f // if eq, then quad aligned
5:
bis zero, zero, t1 // t1 = destination byte zap mask
bis zero, 1, t2
sll t2, t0, t2 // t2 = next bit to set in zap mask
10:
beq a1, 15f // if eq, all bits set
bis t1, t2, t1 // set bit in zap mask
sll t2, 1, t2 // set next higher bit for zap mask
subq a1, 1, a1 // decrement bytes to fill
addq t0, 1, t0 // increment byte within quad
cmpeq t0, 8, t3 // finished the quadword?
beq t3, 10b // if eq [false], do next byte
15:
zapnot a2, t1, t2 // keep only the fill bytes to store
bic a0, 7, a3 // a3 = quadword base of destination
retry5:
ldq_l t0, 0(a3) // load destination quadword
zap t0, t1, t0 // clear destination bytes
or t0, t2, t0 // merge in fill bytes
stq_c t0, 0(a3) // store merged quadword conditional
beq t0, retry5f // if eq, retry failed interlock
addq a0, 7, a0 // move a0 to next quadword
bic a0, 7, a0 // align a0 to quadword
//
// Check for 64-byte blocks
//
20:
srl a1, 6, t0 // t0 = number of 64 byte blocks
beq t0, 40f // if eq then no 64 byte blocks
and a1, 64-1, a1 // a1 = residual bytes to fill
30:
stq a2, 0(a0) // store 64 bytes
stq a2, 8(a0) //
stq a2, 16(a0) //
stq a2, 24(a0) //
stq a2, 32(a0) //
stq a2, 40(a0) //
stq a2, 48(a0) //
stq a2, 56(a0) //
subq t0, 1, t0 // decrement blocks remaining
addq a0, 64, a0 // increment destination pointer
bne t0, 30b // more blocks to write
//
// Fill aligned quadwords
//
40:
srl a1, 3, t0 // t0 = number of quadwords
bne t0, 55f // if ne quadwords left to fill
br zero, 60f // if eq no quadwords left
55:
and a1, 8-1, a1 // a1 = residual bytes to fill
50:
stq a2, 0(a0) // store quadword
subq t0, 1, t0 // decrement quadwords remaining
addq a0, 8, a0 // next quadword
bne t0, 50b // more quadwords to write
//
// Fill bytes for last quadword
//
60:
beq a1, 80f // if eq no more bytes to fill
mov a1, t0 // t0 = number of bytes to fill
mov -1, t1 // t1 = bit mask
sll t0, 3, t0 // # of bytes to # of bits
srl t1, t0, t1 // clear t0 bits
sll t1, t0, t0 // move it back
bic a2, t0, t1 // clear fill bytes beyond the range
not t0, t0 // complement to clear destination
retry6:
ldq_l t2, 0(a0) // get last destination quadword locked
bic t2, t0, t2 // clear bytes to be filled
bis t2, t1, t2 // merge in fill bytes
stq_c t2, 0(a0) // store merged quadword conditional
beq t2, retry6f // if eq, retry failed interlock
//
// Finish up
//
80:
ret zero, (ra) // return
//
// Out of line branches for failed store conditional.
// Don't need to restore anything, just try again.
//
retry5f:
br retry5
retry6f:
br retry6
.end RtlZeroBytes