author      Adam <you@example.com>  2020-05-17 05:51:50 +0200
committer   Adam <you@example.com>  2020-05-17 05:51:50 +0200
commit      e611b132f9b8abe35b362e5870b74bce94a1e58e (patch)
tree        a5781d2ec0e085eeca33cf350cf878f2efea6fe5 /private/ntos/rtl/alpha/mvmem.s
Diffstat (limited to 'private/ntos/rtl/alpha/mvmem.s')
-rw-r--r--  private/ntos/rtl/alpha/mvmem.s  1920
1 file changed, 1920 insertions, 0 deletions
diff --git a/private/ntos/rtl/alpha/mvmem.s b/private/ntos/rtl/alpha/mvmem.s
new file mode 100644
index 000000000..c5ccc9a81
--- /dev/null
+++ b/private/ntos/rtl/alpha/mvmem.s
@@ -0,0 +1,1920 @@
+//      TITLE("Compare, Move, Zero, and Fill Memory Support")
+//++
+//
+// Copyright (c) 1992  Digital Equipment Corporation
+//
+// Module Name:
+//
+//    mvmem.s
+//
+// Abstract:
+//
+//    This module implements functions to compare, move, zero, and fill
+//    blocks of memory. If the memory is aligned, then these functions
+//    are very efficient.
+//
+//    N.B. These routines MUST preserve all floating state since they are
+//         frequently called from interrupt service routines that normally
+//         do not save or restore floating state.
+//
+// Author:
+//
+//    Joe Notarangelo 21-May-1992
+//
+// Environment:
+//
+//    User or Kernel mode.
+//
+// Revision History:
+//
+//    Monty VanderBilt 14-Feb-1996 Avoid memory loads and branch takens between
+//                                 load lock and store conditional instructions
+//                                 to conform with all alpha architecture rules.
+//    Monty VanderBilt 27-Feb-1996 Added RtlZeroBytes and RtlFillBytes to support
+//                                 byte granularity access when necessary.
+//--
+
+#include "ksalpha.h"
+
+        SBTTL("Compare Memory")
+//++
+//
+// ULONG
+// RtlCompareMemory (
+//    IN PVOID Source1,
+//    IN PVOID Source2,
+//    IN ULONG Length
+//    )
+//
+// Routine Description:
+//
+//    This function compares two blocks of memory and returns the number
+//    of bytes that compared equal.
+//
+// Arguments:
+//
+//    Source1 (a0) - Supplies a pointer to the first block of memory to
+//       compare.
+//
+//    Source2 (a1) - Supplies a pointer to the second block of memory to
+//       compare.
+//
+//    Length (a2) - Supplies the length, in bytes, of the memory to be
+//       compared.
+//
+// Return Value:
+//
+//    The number of bytes that compared equal is returned as the function
+//    value. If all bytes compared equal, then the length of the original
+//    block of memory is returned.
+//
+//--
+
+        LEAF_ENTRY(RtlCompareMemory)
+
+        bis     a2, zero, v0            // save length of comparison
+        beq     a2, 90f                 // quit if nothing to compare
+        xor     a0, a1, t0              // check for compatible alignment
+        and     t0, 0x7, t0             // low bits only
+        bne     t0, CompareUnaligned    // if ne, incompatible alignment
+
+//
+// Compare memory aligned
+//
+
+CompareAligned:                         //
+
+//
+// compare memory until sources are aligned
+//
+        and     a0, 0x7, t0             // get low bits
+        bne     t0, 10f                 // if ne, sources not aligned yet
+        br      zero, 30f               // already aligned, predicted
+
+10:
+        ldq_u   t1, 0(a0)               // get unaligned quad at source 1
+        ldq_u   t2, 0(a1)               // get unaligned quad at source 2
+
+20:
+        extbl   t1, t0, t4              // byte at t0 in source 1 quad
+        extbl   t2, t0, t5              // byte at t0 in source 2 quad
+        xor     t4, t5, t3              // t1 = t2 ?
+        bne     t3, 110f                // not equal, miscompare
+        subq    a2, 1, a2               // decrement bytes to compare
+        beq     a2, 90f                 // if eq, compare success
+        addq    t0, 1, t0               // increment pointer within quad
+        cmpeq   t0, 8, t3               // t0 = 8?, if so first quadword done
+        beq     t3, 20b                 // continue while t0 < 8
+
+        addq    a0, 8, a0               // increment to next quadword
+        addq    a1, 8, a1               // increment source 2 to next also
+        bic     a0, 7, a0               // align source 1 quadword
+        bic     a1, 7, a1               // align source 2 quadword
+
+//
+// aligned block compare, compare blocks of 64 bytes
+//
+
+30:
+        srl     a2, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 50f                 // if eq, no 64 byte blocks
+
+//
+// N.B. Loads from each of the sources were separated in case these
+//      blocks are fighting for the cache
+//
+        .set    noat
+40:
+        ldq     t1, 0(a0)               // t1 = source 1, quad 0
+        ldq     t2, 8(a0)               // t2 = source 1, quad 1
+        ldq     t3, 16(a0)              // t3 = source 1, quad 2
+        addq    a1, 64, a1              // increment source 2 pointer
+        ldq     t4, 24(a0)              // t4 = source 1, quad 3
+
+        ldq     t5, -64(a1)             // t5 = source 2, quad 0
+        ldq     a4, -56(a1)             // a4 = source 2, quad 1
+        ldq     a5, -48(a1)             // a5 = source 2, quad 2
+        xor     t1, t5, $at             // quad 0 match?
+        bne     $at, 200f               // if ne[false], miscompare
+        ldq     t5, -40(a1)             // t5 = source 2, quad 3
+        ldq     t1, 32(a0)              // t1 = source 1, quad 4
+        xor     t2, a4, $at             // quad 1 match?
+        bne     $at, 122f               // if ne[false], miscompare
+        ldq     t2, 40(a0)              // t2 = source 1, quad 5
+        xor     t3, a5, $at             // quad 2 match?
+        bne     $at, 124f               // if ne[false], miscompare
+        ldq     t3, 48(a0)              // t3 = source 1, quad 6
+        xor     t4, t5, $at             // quad 3 match?
+        bne     $at, 126f               // if ne[false], miscompare
+        ldq     t4, 56(a0)              // t4 = source 1, quad 7
+
+        ldq     t5, -32(a1)             // t5 = source 2, quad 4
+        addq    a0, 64, a0              // increment source 1 pointer
+        ldq     a4, -24(a1)             // a4 = source 2, quad 5
+        subq    t0, 1, t0               // decrement blocks to compare
+        ldq     a5, -16(a1)             // a5 = source 2, quad 6
+        xor     t1, t5, $at             // quad 4 match?
+        bne     $at, 130f               // if ne[false], miscompare
+        ldq     t5, -8(a1)              // t5 = source 2, quad 7
+        xor     t2, a4, $at             // quad 5 match?
+        bne     $at, 132f               // if ne[false], miscompare
+        xor     t3, a5, $at             // quad 6 match?
+        bne     $at, 134f               // if ne[false], miscompare
+        xor     t4, t5, $at             // quad 7 match?
+        bne     $at, 136f               // if ne[false], miscompare
+        subq    a2, 64, a2              // decrement bytes to compare
+        bne     t0, 40b                 // if ne, more blocks to compare
+        .set    at
+
+//
+// Compare quadwords
+//
+
+50:
+        srl     a2, 3, t0               // t0 = number of quadwords to compare
+        beq     t0, 70f                 // if eq, no quadwords to compare
+
+        .set    noat
+60:
+        ldq     t1, 0(a0)               // t1 = quad from source 1
+        lda     a0, 8(a0)               // increment source 1 pointer
+        ldq     t2, 0(a1)               // t2 = quad from source 2
+        lda     a1, 8(a1)               // increment source 2 pointer
+        xor     t1, t2, $at             // are quadwords equal?
+        bne     $at, 200f               // if ne, miscompare
+        subq    t0, 1, t0               // decrement quads to compare
+        subq    a2, 8, a2               // decrement bytes to compare
+        bne     t0, 60b                 // if ne, more quads to compare
+
+        .set    at
+
+//
+// Compare bytes in last quadword
+//
+
+// a2 = number of bytes to compare, less than 8, greater than zero
+// a0, a1, quad-aligned to last quadword
+
+        beq     a2, 80f                 // if eq, all bytes compared
+
+        .set    noat
+70:
+        ldq     t1, 0(a0)               // t1 = quad at source 1
+        ldq     t2, 0(a1)               // t2 = quad at source 2
+        bis     zero, 0xff, t0          // zap mask
+        sll     t0, a2, t0              //
+        zap     t1, t0, t1              // zero bytes not compared
+        zap     t2, t0, t2              // same for source 2
+        xor     t1, t2, $at             // compare quadwords
+        bne     $at, 200f               // if ne, miscompare
+
+        .set    at
+//
+// Successful compare
+// v0 already contains full length
+//
+
+80:
+        ret     zero, (ra)              // return
+
+//
+// Sources have incompatible alignment
+//
+CompareUnaligned:
+
+//
+// Compare until source 1 (a0) is aligned
+//
+
+        and     a0, 0x7, t0             // get byte position of pointer
+        beq     t0, 30f                 // if eq, already aligned
+
+        ldq_u   t1, 0(a0)               // get unaligned quad at a0
+
+10:
+        ldq_u   t2, 0(a1)               // get unaligned quad at a1
+        extbl   t1, t0, t4              // get byte to compare from source 1
+        extbl   t2, a1, t2              // get byte to compare from source 2
+        xor     t4, t2, t3              // do bytes match?
+        bne     t3, 110f                // if ne, miscompare
+        subq    a2, 1, a2               // decrement bytes to compare
+        beq     a2, 90f                 // quit if nothing left to compare
+        addq    t0, 1, t0               // increment byte within source 1
+        addq    a1, 1, a1               // increment source 2 pointer
+        cmpeq   t0, 8, t3               // finished with source 1 quad?
+        beq     t3, 10b                 // if eq[false], more to compare
+
+        addq    a0, 7, a0               // point to next source 1 quad
+        bic     a0, 7, a0               // align to quadword
+
+//
+// Compare 64-byte blocks
+//
+
+30:
+        srl     a2, 6, t0               // t0 = number of blocks to compare
+        beq     t0, 50f                 // if eq, no blocks to move
+
+        ldq_u   t1, 0(a1)               // get source 2 unaligned quad 1
+
+        .set    noat
+40:
+        ldq_u   t2, 7(a1)               // get source 2 unaligned quad 2
+        addq    a0, 64, a0              // increment source 1 pointer
+        ldq_u   t3, 15(a1)              // get source 2 unaligned quad 3
+        extql   t1, a1, t1              // bytes from unaligned quad 1
+        extqh   t2, a1, $at             // bytes from unaligned quad 2
+        ldq_u   t4, 23(a1)              // get source 2 unaligned quad 4
+        bis     t1, $at, t1             // t1 = quadword 1 (source 2)
+        ldq_u   t5, 31(a1)              // get source 2 unaligned quad 5
+        extql   t2, a1, t2              // bytes from unaligned quad 2
+        extqh   t3, a1, $at             // bytes from unaligned quad 3
+        ldq     a3, -64(a0)             // a3 = quadword 1 (source 1)
+        bis     t2, $at, t2             // t2 = quadword 2 (source 2)
+        ldq     a4, -56(a0)             // a4 = quadword 2 (source 1)
+        extql   t3, a1, t3              // bytes from unaligned quad 3
+        extqh   t4, a1, $at             // bytes from unaligned quad 4
+        ldq     a5, -48(a0)             // a5 = quadword 3 (source 1)
+        bis     t3, $at, t3             // t3 = quadword 3 (source 2)
+        extql   t4, a1, t4              // bytes from unaligned quad 4
+        extqh   t5, a1, $at             // bytes from unaligned quad 5
+        subq    t0, 1, t0               // decrement blocks to compare
+        bis     t4, $at, t4             // t4 = quadword 4 (source 2)
+
+        xor     t1, a3, $at             // match on quadword 1?
+        ldq     a3, -40(a0)             // a3 = quadword 4 (source 1)
+        bne     $at, 200f               // if ne, miscompare quad 1
+        xor     t2, a4, $at             // match on quadword 2?
+        ldq_u   t2, 39(a1)              // get source 2 unaligned quad 6
+        bne     $at, 122f               // if ne, miscompare quad 2
+        xor     t3, a5, $at             // match on quadword 3?
+        ldq_u   t3, 47(a1)              // get source 2 unaligned quad 7
+        bne     $at, 124f               // if ne, miscompare quad 3
+        xor     t4, a3, $at             // match on quadword 4?
+        ldq_u   t4, 55(a1)              // get source 2 unaligned quad 8
+        bne     $at, 126f               // if ne, miscompare quad 4
+        ldq_u   t1, 63(a1)              // get source 2 unaligned quad 9
+
+        ldq     a3, -32(a0)             // a3 = quadword 5 (source 1)
+        extql   t5, a1, t5              // bytes from unaligned quad 5
+        extqh   t2, a1, $at             // bytes from unaligned quad 6
+        ldq     a4, -24(a0)             // a4 = quadword 6 (source 1)
+        ldq     a5, -16(a0)             // a5 = quadword 7 (source 1)
+        bis     t5, $at, t5             // t5 = quadword 5 (source 2)
+
+        xor     t5, a3, $at             // match on quadword 5?
+        ldq     a3, -8(a0)              // a3 = quadword 8 (source 1)
+        bne     $at, 130f               // if ne, miscompare quad 5
+        extql   t2, a1, t2              // bytes from unaligned quad 6
+        extqh   t3, a1, $at             // bytes from unaligned quad 7
+        extql   t3, a1, t3              // bytes from unaligned quad 7
+        bis     t2, $at, t2             // t2 = quadword 6 (source 2)
+        xor     t2, a4, $at             // match on quadword 6?
+        bne     $at, 132f               // if ne, miscompare quad 6
+        extqh   t4, a1, $at             // bytes from unaligned quad 8
+        extql   t4, a1, t4              // bytes from unaligned quad 8
+        bis     t3, $at, t3             // t3 = quadword 7 (source 2)
+        xor     t3, a5, $at             // match on quadword 7?
+        bne     $at, 134f               // if ne, miscompare quad 7
+        extqh   t1, a1, $at             // bytes from unaligned quad 9
+        addq    a1, 64, a1              // increment source 2 pointer
+        bis     t4, $at, t4             // t4 = quadword 8 (source 2)
+        xor     t4, a3, $at             // match on quadword 8?
+        bne     $at, 136f               // if ne, miscompare quad 8
+        subq    a2, 64, a2              // decrement number of bytes to compare
+        bne     t0, 40b                 // if ne, more blocks to compare
+
+        .set    at
+
+//
+// Compare quadwords
+//
+
+50:
+        srl     a2, 3, t0               // t0 = number of quads to compare
+        beq     t0, 70f                 // if eq, no quads to compare
+        ldq_u   t1, 0(a1)               // get unaligned quad 1 (source 2)
+
+        .set    noat
+60:
+        ldq_u   t2, 7(a1)               // get unaligned quad 2 (source 2)
+        ldq     t3, 0(a0)               // t3 = quadword 1 (source 1)
+        extql   t1, a1, t1              // get bytes from unaligned quad 1
+        extqh   t2, a1, $at             // get bytes from unaligned quad 2
+        addq    a1, 8, a1               // increment source 2 pointer
+        bis     t1, $at, t1             // t1 = quadword 1 (source 2)
+        xor     t1, t3, $at             // match on quadword?
+        bne     $at, 200f               // if ne, miscompare
+        subq    t0, 1, t0               // decrement quadwords to compare
+        addq    a0, 8, a0               // increment source 1 pointer
+        subq    a2, 8, a2               // decrement bytes to compare
+        bis     t2, zero, t1            // save low quadword for next loop
+        bne     t0, 60b                 // if ne, more quads to compare
+
+        .set    at
+
+//
+// Compare bytes for final quadword
+//
+
+70:
+        beq     a2, 90f                 // if eq, comparison complete
+
+        ldq     t1, 0(a0)               // get quadword from source 1
+        bis     zero, zero, t0          // t0 = byte position to compare
+
+        .set    noat
+80:
+        ldq_u   t2, 0(a1)               // get unaligned quad from source 2
+        extbl   t1, t0, t3              // t3 = byte from source 1
+        extbl   t2, a1, t2              // t2 = byte from source 2
+        xor     t3, t2, $at             // match on byte?
+        bne     $at, 100f               // if ne, miscompare on byte
+        addq    t0, 1, t0               // increment byte position
+        addq    a1, 1, a1               // increment source 2 pointer
+        subq    a2, 1, a2               // decrement bytes to compare
+        bne     a2, 80b                 // if ne, more bytes to compare
+
+        .set    at
+//
+// Successful full comparison
+//
+
+90:
+        ret     zero, (ra)              // return, v0 already set
+
+//
+// Miscompare on last quadword
+//
+
+100:
+        subq    v0, a2, v0              // subtract bytes not compared
+        ret     zero, (ra)              // return
+
+//
+// Miscompare on first quadword, unaligned case
+//
+// v0 = total bytes to compare
+// a2 = bytes remaining to compare
+//
+
+110:
+        subq    v0, a2, v0              // bytes compared successfully
+        ret     zero, (ra)              // return
+
+//
+// Miscompare on 64-byte block compare
+//
+
+122:
+        subq    a2, 8, a2               // miscompare on quad 2
+        br      zero, 200f              // finish in common code
+
+124:
+        subq    a2, 16, a2              // miscompare on quad 3
+        br      zero, 200f              // finish in common code
+
+126:
+        subq    a2, 24, a2              // miscompare on quad 4
+        br      zero, 200f              // finish in common code
+
+130:
+        subq    a2, 32, a2              // miscompare on quad 5
+        br      zero, 200f              // finish in common code
+
+132:
+        subq    a2, 40, a2              // miscompare on quad 6
+        br      zero, 200f              // finish in common code
+
+134:
+        subq    a2, 48, a2              // miscompare on quad 7
+        br      zero, 200f              // finish in common code
+
+136:
+        subq    a2, 56, a2               // miscompare on quad 8
+        br      zero, 200f              // finish in common code
+
+//
+// Miscompare, determine number of bytes that successfully compared
+// $at = xor of relevant quads from sources, must be non-zero
+// a2 = number of bytes left to compare
+//
+        .set    noat
+200:
+        cmpbge  zero, $at, $at          // $at = mask of zero (equal) bytes
+
+        //
+        // look for the first bit cleared in $at, this is the
+        // number of the first byte which differed
+        //
+        bis     zero, zero, t0          // bit position to look for clear
+
+210:
+        blbc    $at, 220f               // if low clear, found difference
+        srl     $at, 1, $at             // check next bit
+        addq    t0, 1, t0               // count bit position checked
+        br      zero, 210b
+
+220:
+        subq    v0, a2, v0              // subtract bytes yet to compare
+        addq    v0, t0, v0              // add bytes that matched on last quad
+
+        ret     zero, (ra)
+
+        .set    at
+
+        .end    RtlCompareMemory
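The portable semantics of the routine above fit in a few lines of C. This is a reference model only (names are invented, not the shipped implementation); the assembly earns its speed by XOR-comparing whole quadwords and using CMPBGE to locate the first differing byte instead of looping byte by byte:

    #include <stddef.h>

    /* Model of RtlCompareMemory: count leading bytes that compare equal;
       returns length when the two blocks match completely. */
    static size_t compare_memory_model(const void *s1, const void *s2, size_t length)
    {
        const unsigned char *p = (const unsigned char *)s1;
        const unsigned char *q = (const unsigned char *)s2;
        size_t i;

        for (i = 0; i < length; i += 1) {
            if (p[i] != q[i])
                break;                  /* first mismatch ends the count */
        }
        return i;
    }

    /* Model of the CMPBGE trick at label 200: given x = quad1 ^ quad2 with
       x != 0, byte i of x is zero exactly when the quads agree in byte i,
       so the lowest nonzero byte of x marks the first difference. */
    static unsigned first_difference(unsigned long long x)
    {
        unsigned i = 0;

        while (((x >> (8 * i)) & 0xff) == 0)
            i += 1;                     /* mirrors the blbc/srl scan above */
        return i;
    }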
+        SBTTL("Move Memory")
+//++
+//
+// VOID
+// RtlMoveMemory (
+//    IN PVOID Destination,
+//    IN PVOID Source,
+//    IN ULONG Length
+//    )
+//
+// Routine Description:
+//
+//    This function moves memory either forward or backward, aligned or
+//    unaligned, in 64-byte blocks, followed by 8-byte blocks, followed
+//    by any remaining bytes.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the destination address of
+//       the move operation.
+//
+//    Source (a1) - Supplies a pointer to the source address of the move
+//       operation.
+//
+//    Length (a2) - Supplies the length, in bytes, of the memory to be moved.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        LEAF_ENTRY(RtlMoveMemory)
+
+        beq     a2, 80f                 // if eq, no bytes to move
+
+//
+// If the source address is less than the destination address and source
+// address plus the length of the move is greater than the destination
+// address, then the source and destination overlap such that the move
+// must be performed backwards.
+//
+
+        cmpult  a0, a1, t0              // is destination less than source
+        bne     t0, MoveForward         // if ne [true], no overlap possible
+        addq    a1, a2, t0              // compute source ending address
+        cmpult  t0, a0, t1              // is source end less than dest.
+        beq     t1, MoveBackward        // if eq [false], overlap
+
+//
+// Move memory forward aligned and unaligned.
+//
+
+MoveForward:                            //
+        xor     a0, a1, t0              // compare alignment bits
+        and     t0, 0x7, t0             // isolate alignment comparison
+        bne     t0, MoveForwardUnaligned // if ne, incompatible alignment
+
+//
+// Move memory forward aligned.
+//
+
+MoveForwardAligned:                     //
+
+//
+// Move bytes until source and destination are quadword aligned
+//
+
+        and     a0, 0x7, t0             // t0 = unaligned bits
+        bne     t0, 5f                  // if ne, not quad aligned
+        br      zero, 20f               // predicted taken
+
+5:
+        ldq_u   t2, 0(a0)               // get unaligned quad from dest.
+        ldq_u   t1, 0(a1)               // get unaligned quadword from source
+10:
+        beq     a2, 15f                 // if eq, all bytes moved
+        extbl   t1, t0, t3              // t3 = byte from source
+        insbl   t3, t0, t3              // t3 = byte from source, in position
+        mskbl   t2, t0, t2              // clear position in dest. quad
+        bis     t2, t3, t2              // merge in byte from source
+        subq    a2, 1, a2               // decrement bytes to move
+        addq    t0, 1, t0               // increment byte within quad
+        cmpeq   t0, 8, t3               // finished the quadword?
+        beq     t3, 10b                 // if eq [false], do next byte
+15:
+        stq_u   t2, 0(a0)               // store merged destination bytes
+
+        addq    a0, 7, a0               // move to next quadword
+        bic     a0, 7, a0               // aligned quadword
+
+        addq    a1, 7, a1               // move to next quadword
+        bic     a1, 7, a1               // aligned quadword
+
+//
+// Check for 64-byte block moves
+//
+
+20:
+        srl     a2, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 40f                 // if eq no blocks to move
+        and     a2, 64-1, a2            // a2 = residual bytes
+
+30:
+        ldq     t1, 0(a1)               // load 64 bytes from source
+        addq    a0, 64, a0              // increment destination pointer
+        ldq     v0, 56(a1)              //
+        ldq     a3, 32(a1)              //
+        stq     t1, -64(a0)             // write to destination
+        ldq     t2, 8(a1)               // into volatile registers
+        ldq     t3, 16(a1)              //
+        ldq     t4, 24(a1)              //
+        subq    t0, 1, t0               // decrement number of blocks
+        stq     t2, -56(a0)             //
+        ldq     a4, 40(a1)              //
+        stq     t3, -48(a0)             //
+        ldq     a5, 48(a1)              //
+        stq     t4, -40(a0)             //
+        addq    a1, 64, a1              // increment source pointer
+        stq     a3, -32(a0)             //
+        stq     a4, -24(a0)             //
+        stq     a5, -16(a0)             //
+        stq     v0, -8(a0)              //
+        bne     t0, 30b                 // if ne, more blocks to copy
+
+//
+// Copy quadwords
+//
+
+40:
+        srl     a2, 3, t0               // t0 = number of quadwords to move
+        beq     t0, 60f                 // if eq no quadwords to move
+        and     a2, 8-1, a2             // a2 = residual bytes
+
+50:
+        ldq     t1, 0(a1)               // load quadword from source
+        addq    a1, 8, a1               // increment source pointer
+        stq     t1, 0(a0)               // store quadword to destination
+        addq    a0, 8, a0               // increment destination pointer
+        subq    t0, 1, t0               // decrement number of quadwords
+        bne     t0, 50b                 // if ne, more quadwords to move
+
+//
+// Move final residual bytes
+//
+
+60:
+        beq     a2, 80f                 // if eq, no more bytes to move
+        ldq     t1, 0(a1)               // get last source quadword
+        ldq     t2, 0(a0)               // get last dest. quadword
+        bis     zero, zero, t0          // t0 = next byte number to move
+
+70:
+        extbl   t1, t0, t3              // extract byte from source
+        insbl   t3, t0, t3              // t3 = source byte, in position
+        mskbl   t2, t0, t2              // clear byte position for dest.
+        bis     t2, t3, t2              // merge in source byte
+        addq    t0, 1, t0               // increment byte position
+        subq    a2, 1, a2               // decrement bytes to move
+        bne     a2, 70b                 // if ne => more bytes to move
+
+        stq     t2, 0(a0)               // store merged data
+
+//
+// Finish aligned MoveForward
+//
+
+80:
+        ret     zero, (ra)              // return
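The unaligned paths that follow rest on one idiom: a misaligned quadword spans at most two aligned quadwords, and the EXTQL/EXTQH pair extracts the bytes each aligned load contributes. A little-endian C model of that idiom (a sketch with an invented helper name; the real LDQ_U/EXTQL/EXTQH sequence needs no shift-count guard):

    #include <stdint.h>

    /* Compose the quadword at unaligned address p from the two aligned
       quadwords that straddle it, as ldq_u + extql/extqh do. */
    static uint64_t load_unaligned_quad(const unsigned char *p)
    {
        uintptr_t addr = (uintptr_t)p;
        const uint64_t *lo = (const uint64_t *)(addr & ~(uintptr_t)7);       /* ldq_u 0(p) */
        const uint64_t *hi = (const uint64_t *)((addr + 7) & ~(uintptr_t)7); /* ldq_u 7(p) */
        unsigned shift = (unsigned)(addr & 7) * 8;

        if (shift == 0)
            return *lo;                   /* already aligned: one load suffices */
        return (*lo >> shift)             /* extql: bytes from the low quad  */
             | (*hi << (64 - shift));     /* extqh: bytes from the high quad */
    }

Note how the block loops below reuse the high quadword of one step as the low quadword of the next (the "bis t2, zero, t1" at each loop bottom), so every additional output quadword costs only a single new load.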
+//
+// Move memory forward unaligned.
+//
+
+MoveForwardUnaligned:                   //
+
+//
+// Move bytes until the destination is aligned
+//
+
+        and     a0, 0x7, t0             // t0 = unaligned bits
+        beq     t0, 100f                // if eq, destination quad aligned
+
+        ldq_u   t2, 0(a0)               // get unaligned quad from dest
+
+90:
+        beq     a2, 95f                 // if eq no more bytes to move
+        ldq_u   t1, 0(a1)               // get unaligned quad from source
+        extbl   t1, a1, t1              // extract source byte
+        insbl   t1, t0, t1              // t1 = source byte, in position
+        mskbl   t2, t0, t2              // clear byte position in dest.
+        bis     t2, t1, t2              // merge in source byte
+        addq    t0, 1, t0               // increment byte position
+        addq    a1, 1, a1               // increment source pointer
+        subq    a2, 1, a2               // decrement bytes to move
+        cmpeq   t0, 8, t3               // t0 = 8? => quad finished
+        beq     t3, 90b                 // if eq [false], more bytes to move
+95:
+        stq_u   t2, 0(a0)               // store merged quadword
+        addq    a0, 7, a0               // increment to next quad
+        bic     a0, 7, a0               // align next quadword
+
+//
+// Check for 64-byte blocks to move
+//
+
+100:
+        srl     a2, 6, t0               // t0 = number of blocks to move
+        beq     t0, 120f                // if eq no blocks to move
+        and     a2, 64-1, a2            // a2 = residual bytes to move
+
+        ldq_u   t1, 0(a1)               // t1 = first unaligned quad
+
+110:
+                                        // get source data and merge it
+                                        // as we go
+        ldq_u   t2, 7(a1)               // t2 = second unaligned quad
+        extql   t1, a1, t1              // extract applicable bytes from t1
+        extqh   t2, a1, v0              // extract applicable bytes from t2
+        bis     t1, v0, t1              // t1 = quad #1
+        ldq_u   t3, 15(a1)              // t3 = third unaligned quad
+        extql   t2, a1, t2              // extract applicable bytes from t2
+        extqh   t3, a1, v0              // extract applicable bytes from t3
+        stq     t1, 0(a0)               // store quad #1
+        bis     t2, v0, t2              // t2 = quad #2
+        ldq_u   t4, 23(a1)              // t4 = fourth unaligned quad
+        extql   t3, a1, t3              // extract applicable bytes from t3
+        extqh   t4, a1, v0              // extract applicable bytes from t4
+        stq     t2, 8(a0)               // store quad #2
+        bis     t3, v0, t3              // t3 = quad #3
+        ldq_u   t5, 31(a1)              // t5 = fifth unaligned quad
+        extql   t4, a1, t4              // extract applicable bytes from t4
+        extqh   t5, a1, v0              // extract applicable bytes from t5
+        stq     t3, 16(a0)              // store quad #3
+        bis     t4, v0, t4              // t4 = quad #4
+        ldq_u   a3, 39(a1)              // a3 = sixth unaligned quad
+        extql   t5, a1, t5              // extract applicable bytes from t5
+        extqh   a3, a1, v0              // extract applicable bytes from a3
+        stq     t4, 24(a0)              // store quad #4
+        bis     t5, v0, t5              // t5 = quad #5
+        ldq_u   a4, 47(a1)              // a4 = seventh unaligned quad
+        extql   a3, a1, a3              // extract applicable bytes from a3
+        extqh   a4, a1, v0              // extract applicable bytes from a4
+        stq     t5, 32(a0)              // store quad #5
+        bis     a3, v0, a3              // a3 = quad #6
+        ldq_u   a5, 55(a1)              // a5 = eighth unaligned quad
+        extql   a4, a1, a4              // extract applicable bytes from a4
+        extqh   a5, a1, v0              // extract applicable bytes from a5
+        stq     a3, 40(a0)              // store quad #6
+        bis     a4, v0, a4              // a4 = quad #7
+        ldq_u   t1, 63(a1)              // t1 = ninth unaligned = 1st of next
+        extql   a5, a1, a5              // extract applicable bytes from a5
+        extqh   t1, a1, v0              // extract applicable bytes from t1
+        stq     a4, 48(a0)              // store quad #7
+        bis     a5, v0, a5              // a5 = quad #8
+        addq    a1, 64, a1              // increment source pointer
+        stq     a5, 56(a0)              // store quad #8
+        addq    a0, 64, a0              // increment destination pointer
+        subq    t0, 1, t0               // decrement number of blocks
+        bne     t0, 110b                // if ne, more blocks to move
+
+//
+// Move unaligned source quads to aligned destination quads
+//
+
+120:
+        srl     a2, 3, t0               // t0 = number of quads to move
+        beq     t0, 140f                // if eq no quads to move
+        and     a2, 8-1, a2             // a2 = residual bytes
+
+        ldq_u   t1, 0(a1)               // t1 = first unaligned quad
+130:
+        ldq_u   t2, 7(a1)               // t2 = second unaligned quad
+        addq    a0, 8, a0               // increment destination pointer
+        extql   t1, a1, t1              // extract applicable bytes from t1
+        extqh   t2, a1, v0              // extract applicable bytes from t2
+        bis     t1, v0, t1              // t1 = quadword of data
+        stq     t1, -8(a0)              // store data to destination
+        addq    a1, 8, a1               // increment source pointer
+        subq    t0, 1, t0               // decrement quads to move
+        bis     t2, zero, t1            // t1 = first of next unaligned pair
+        bne     t0, 130b                // if ne, more quads to move
+
+//
+// Move remaining bytes to final quadword
+//
+
+140:
+        beq     a2, 160f                // if eq no more bytes to move
+        ldq     t2, 0(a0)               // t2 = destination quadword
+        bis     zero, zero, t3          // t3 = position for next insertion
+
+150:
+        ldq_u   t1, 0(a1)               // get unaligned source quad
+        extbl   t1, a1, t1              // t1 = source byte
+        insbl   t1, t3, t1              // t1 = source byte, in position
+        mskbl   t2, t3, t2              // clear byte in destination
+        bis     t2, t1, t2              // merge in source byte
+        addq    a1, 1, a1               // increment source pointer
+        subq    a2, 1, a2               // decrement bytes to move
+        addq    t3, 1, t3               // increment destination position
+        bne     a2, 150b                // more bytes to move
+
+        stq     t2, 0(a0)               // store merged data
+
+//
+// Finish unaligned MoveForward
+//
+
+160:
+        ret     zero, (ra)              // return
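Before the backward path below, it helps to restate the dispatch at the top of RtlMoveMemory: a backward copy is required only when the source lies below the destination and the two regions overlap; in every other case the forward loops above are safe. A C sketch of that decision (a model of the entry test, not the shipped code):

    #include <stddef.h>
    #include <stdint.h>

    /* Direction dispatch used at RtlMoveMemory entry (model). */
    static void move_memory_model(void *dst, const void *src, size_t n)
    {
        unsigned char *d = (unsigned char *)dst;
        const unsigned char *s = (const unsigned char *)src;

        if ((uintptr_t)s < (uintptr_t)d && (uintptr_t)s + n > (uintptr_t)d) {
            /* Overlap with source below destination: copy the highest byte
               first so no source byte is overwritten before it is moved. */
            for (size_t i = n; i > 0; i -= 1)
                d[i - 1] = s[i - 1];
        } else {
            /* No hazardous overlap: plain forward copy. */
            for (size_t i = 0; i < n; i += 1)
                d[i] = s[i];
        }
    }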
+//
+// Move memory backward.
+//
+
+MoveBackward:                           //
+
+        addq    a0, a2, a0              // compute ending destination address
+        addq    a1, a2, a1              // compute ending source address
+        subq    a0, 1, a0               // point to last destination byte
+        subq    a1, 1, a1               // point to last source byte
+        xor     a0, a1, t0              // compare alignment bits
+        and     t0, 0x7, t0             // isolate alignment comparison
+        bne     t0, MoveBackwardUnaligned // if ne, incompatible alignment
+
+//
+// Move memory backward aligned.
+//
+
+MoveBackwardAligned:                    //
+
+//
+// Move bytes until source and destination are quadword aligned
+//
+
+        and     a0, 0x7, t0             // t0 = unaligned bits
+        cmpeq   t0, 7, t1               // last byte position 7?
+        beq     t1, 5f                  // if eq [false], not quad aligned
+        subq    a0, 7, a0               // point to beginning of last quad
+        subq    a1, 7, a1               // point to beginning of last quad
+        br      zero, 30f               // predicted taken
+
+5:
+        ldq_u   t1, 0(a0)               // get unaligned quad from dest.
+        ldq_u   t2, 0(a1)               // get unaligned quad from source
+
+10:
+        beq     a2, 20f                 // if eq, all bytes moved
+        extbl   t2, t0, t3              // t3 = byte from source
+        insbl   t3, t0, t3              // t3 = byte from source, in position
+        mskbl   t1, t0, t1              // clear position in destination
+        bis     t1, t3, t1              // merge in byte from source
+        subq    a2, 1, a2               // decrement bytes to move
+        subq    t0, 1, t0               // decrement byte within quadword
+        cmplt   t0, zero, t3            // finished the quadword?
+        beq     t3, 10b                 // if eq [false], do next byte
+
+20:
+        stq_u   t1, 0(a0)               // store merged destination bytes
+
+        subq    a0, 8, a0               // move to previous quadword
+        bic     a0, 7, a0               // aligned quadword
+
+        subq    a1, 8, a1               // move to previous quadword
+        bic     a1, 7, a1               // aligned quadword
+
+//
+// Check for 64-byte block moves
+//
+
+30:
+        srl     a2, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 50f                 // if eq, no blocks to move
+        and     a2, 64-1, a2            // a2 = residual bytes
+
+40:
+        ldq     t1, 0(a1)               // load 64 bytes from source into
+        subq    a0, 64, a0              // decrement destination pointer
+        ldq     v0, -56(a1)             //
+        ldq     a3, -32(a1)             //
+        stq     t1, 64(a0)              // write to destination
+        ldq     t2, -8(a1)              // into volatile registers
+        ldq     a5, -48(a1)             //
+        ldq     a4, -40(a1)             //
+        stq     t2, 56(a0)              //
+        ldq     t3, -16(a1)             //
+        ldq     t4, -24(a1)             //
+        subq    a1, 64, a1              // decrement source pointer
+        stq     t3, 48(a0)              //
+        stq     t4, 40(a0)              //
+        stq     a3, 32(a0)              //
+        subq    t0, 1, t0               // decrement number of blocks
+        stq     a4, 24(a0)              //
+        stq     a5, 16(a0)              //
+        stq     v0, 8(a0)               //
+        bne     t0, 40b                 // if ne, more blocks to copy
+
+//
+// Copy quadwords
+//
+
+50:
+        srl     a2, 3, t0               // t0 = number of quadwords to move
+        beq     t0, 70f                 // if eq no quadwords to move
+        and     a2, 8-1, a2             // a2 = residual bytes
+
+60:
+        ldq     t1, 0(a1)               // load quadword from source
+        subq    a1, 8, a1               // decrement source pointer
+        stq     t1, 0(a0)               // store quadword to destination
+        subq    a0, 8, a0               // decrement destination pointer
+        subq    t0, 1, t0               // decrement quadwords to move
+        bne     t0, 60b                 // if ne, more quadwords to move
+
+//
+// Move final residual bytes
+//
+
+70:
+        beq     a2, 90f                 // if eq, no more bytes to move
+        ldq     t1, 0(a1)               // get last source quadword
+        ldq     t2, 0(a0)               // get last destination quadword
+        bis     zero, 7, t0             // t0 = next byte number to move
+
+80:
+        extbl   t1, t0, t3              // extract byte from source
+        insbl   t3, t0, t3              // t3 = source byte, in position
+        mskbl   t2, t0, t2              // clear byte position for dest.
+        bis     t2, t3, t2              // merge in source byte
+        subq    t0, 1, t0               // decrement byte position
+        subq    a2, 1, a2               // decrement bytes to move
+        bne     a2, 80b                 // if ne, more bytes to move
+
+        stq     t2, 0(a0)               // write destination data
+//
+// Finish aligned MoveBackward
+//
+
+90:
+        ret     zero, (ra)              // return
+
+//
+// Move memory backward unaligned.
+//
+
+MoveBackwardUnaligned:                  //
+
+//
+// Move bytes until the destination is aligned
+//
+
+        and     a0, 0x7, t0             // t0 = unaligned bits
+        cmpeq   t0, 7, t1               // last byte of a quadword?
+        beq     t1, 95f                 // if eq[false], not aligned
+        subq    a0, 7, a0               // align pointer to beginning of quad
+        br      zero, 120f              //
+
+95:
+        ldq_u   t2, 0(a0)               // get unaligned quad from dest.
+
+100:
+        beq     a2, 110f                // if eq, no more bytes to move
+        ldq_u   t1, 0(a1)               // get unaligned quad from source
+        extbl   t1, a1, t1              // extract source byte
+        insbl   t1, t0, t1              // t1 = source byte in position
+        mskbl   t2, t0, t2              // clear byte position in dest.
+        bis     t2, t1, t2              // merge source byte
+        subq    t0, 1, t0               // decrement byte position
+        subq    a1, 1, a1               // decrement source pointer
+        subq    a2, 1, a2               // decrement number of bytes to move
+        cmplt   t0, zero, t3            // t0 < 0? => quad finished
+        beq     t3, 100b                // if eq [false], more bytes to move
+
+110:
+        stq_u   t2, 0(a0)               // store merged quadword
+
+        subq    a0, 8, a0               // decrement dest. to previous quad
+        bic     a0, 7, a0               // align previous quadword
+
+//
+// Check for 64-byte blocks to move
+//
+
+120:
+        srl     a2, 6, t0               // t0 = number of blocks to move
+        subq    a1, 7, a1               // point to beginning of last quad
+        beq     t0, 140f                // if eq no blocks to move
+        and     a2, 64-1, a2            // a2 = residual bytes to move
+
+        ldq_u   t1, 7(a1)               // t1 = first unaligned quad
+
+130:
+                                        // get source data and merge it
+                                        // as we go
+        ldq_u   t2, 0(a1)               // t2 = second unaligned quad
+        extqh   t1, a1, t1              // extract applicable bytes from t1
+        extql   t2, a1, v0              // extract applicable bytes from t2
+        bis     t1, v0, t1              // t1 = quad #1
+        ldq_u   t3, -8(a1)              // t3 = third unaligned quad
+        extqh   t2, a1, t2              // extract applicable bytes from t2
+        extql   t3, a1, v0              // extract applicable bytes from t3
+        stq     t1, 0(a0)               // store quad #1
+        bis     t2, v0, t2              // t2 = quad #2
+        ldq_u   t4, -16(a1)             // t4 = fourth unaligned quad
+        extqh   t3, a1, t3              // extract applicable bytes from t3
+        extql   t4, a1, v0              // extract applicable bytes from t4
+        stq     t2, -8(a0)              // store quad #2
+        bis     t3, v0, t3              // t3 = quad #3
+        ldq_u   t5, -24(a1)             // t5 = fifth unaligned quad
+        extqh   t4, a1, t4              // extract applicable bytes from t4
+        extql   t5, a1, v0              // extract applicable bytes from t5
+        stq     t3, -16(a0)             // store quad #3
+        bis     t4, v0, t4              // t4 = quad #4
+        ldq_u   a3, -32(a1)             // a3 = sixth unaligned quad
+        extqh   t5, a1, t5              // extract applicable bytes from t5
+        extql   a3, a1, v0              // extract applicable bytes from a3
+        stq     t4, -24(a0)             // store quad #4
+        bis     t5, v0, t5              // t5 = quad #5
+        ldq_u   a4, -40(a1)             // a4 = seventh unaligned quad
+        extqh   a3, a1, a3              // extract applicable bytes from a3
+        extql   a4, a1, v0              // extract applicable bytes from a4
+        stq     t5, -32(a0)             // store quad #5
+        bis     a3, v0, a3              // a3 = quad #6
+        ldq_u   a5, -48(a1)             // a5 = eighth unaligned quad
+        extqh   a4, a1, a4              // extract applicable bytes from a4
+        extql   a5, a1, v0              // extract applicable bytes from a5
+        stq     a3, -40(a0)             // store quad #6
+        bis     a4, v0, a4              // a4 = quad #7
+        ldq_u   t1, -56(a1)             // t1 = ninth unaligned = 1st of next
+        extqh   a5, a1, a5              // extract applicable bytes from a5
+        extql   t1, a1, v0              // extract applicable bytes from t1
+        stq     a4, -48(a0)             // store quad #7
+        bis     a5, v0, a5              // a5 = quad #8
+        subq    a1, 64, a1              // decrement source pointer
+        stq     a5, -56(a0)             // store quad #8
+        subq    a0, 64, a0              // decrement destination pointer
+        subq    t0, 1, t0               // decrement number of blocks
+        bne     t0, 130b                // if ne, more blocks to move
+
+//
+// Move unaligned source quads to aligned destination quads
+//
+
+140:
+        srl     a2, 3, t0               // t0 = number of quads to move
+        beq     t0, 160f                // if eq no quads to move
+        and     a2, 8-1, a2             // a2 = residual bytes
+
+        ldq_u   t1, 7(a1)               // t1 = first unaligned quad
+
+150:
+        ldq_u   t2, 0(a1)               // t2 = second unaligned quad
+        subq    a0, 8, a0               // decrement destination pointer
+        extqh   t1, a1, t1              // extract applicable bytes from t1
+        extql   t2, a1, v0              // extract applicable bytes from t2
+        bis     t1, v0, t1              // t1 = quadword of data
+        stq     t1, 8(a0)               // store data to destination
+        subq    a1, 8, a1               // decrement source pointer
+        subq    t0, 1, t0               // decrement quads to move
+        bis     t2, zero, t1            // t1 = first of next unaligned pair
+        bne     t0, 150b                // if ne, more quads to move
+
+//
+// Move remaining bytes to final quadword
+//
+
+160:
+        beq     a2, 180f                // if eq, no more bytes to move
+        ldq     t2, 0(a0)               // t2 = destination quadword
+        bis     zero, 7, t0             // t0 = position for next insertion
+
+170:
+        subq    a1, 1, a1               // decrement source pointer
+        ldq_u   t1, 8(a1)               // get unaligned source quad
+        extbl   t1, a1, t1              // t1 = source byte
+        insbl   t1, t0, t1              // t1 = source byte, in position
+        mskbl   t2, t0, t2              // clear byte position
+        bis     t2, t1, t2              // merge in source byte
+        subq    t0, 1, t0               // decrement byte position for dest.
+        subq    a2, 1, a2               // decrement bytes to move
+        bne     a2, 170b                // if ne, more bytes to move
+
+        stq     t2, 0(a0)               //
+
+//
+// Finish unaligned MoveBackward
+//
+
+180:
+        ret     zero, (ra)              // return
+
+        .end    RtlMoveMemory
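The zero and fill routines below widen the fill byte into a full quadword (RtlFillMemoryUlong widens a longword the same way) so the inner loops can store eight bytes per instruction. The replication step in C (a sketch of the shift/or doubling done at the RtlFillMemory entry):

    #include <stdint.h>

    /* Widen a fill byte to a 64-bit pattern in three doublings, as the
       sll/bis sequence at the RtlFillMemory entry does. */
    static uint64_t replicate_byte(uint8_t fill)
    {
        uint64_t pattern = fill;

        pattern |= pattern << 8;        /* generate fill word     */
        pattern |= pattern << 16;       /* generate fill longword */
        pattern |= pattern << 32;       /* generate fill quadword */
        return pattern;                 /* 0xAB -> 0xABABABABABABABAB */
    }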
+        SBTTL("Zero Memory")
+//++
+//
+// VOID
+// RtlZeroMemory (
+//    IN PVOID Destination,
+//    IN ULONG Length
+//    )
+//
+// Routine Description:
+//
+//    This function zeros memory by first aligning the destination address to
+//    a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
+//    blocks, followed by any remaining bytes.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to zero.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        LEAF_ENTRY(RtlZeroMemory)
+
+        bis     zero, zero, a2          // set fill pattern
+        br      zero, RtlpFillMemory    //
+
+        SBTTL("Fill Memory")
+//++
+//
+// VOID
+// RtlFillMemory (
+//    IN PVOID Destination,
+//    IN ULONG Length,
+//    IN UCHAR Fill
+//    )
+//
+// Routine Description:
+//
+//    This function fills memory by first aligning the destination address to
+//    a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
+//    blocks, followed by any remaining bytes.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to fill.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
+//
+//    Fill (a2) - Supplies the fill byte.
+//
+//    N.B. The alternate entry memset expects the length and fill arguments
+//         to be reversed. It also returns the Destination pointer.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        ALTERNATE_ENTRY(memset)
+
+        bis     a0, zero, v0            // set return value
+        bis     a1, zero, a3            // swap length and fill arguments
+        bis     a2, zero, a1            //
+        bis     a3, zero, a2            //
+
+        ALTERNATE_ENTRY(RtlFillMemory)
+
+        and     a2, 0xff, a2            // clear excess bits
+        sll     a2, 8, t0               // duplicate fill byte
+        bis     a2, t0, a2              // generate fill word
+        sll     a2, 16, t0              // duplicate fill word
+        bis     a2, t0, a2              // generate fill longword
+        sll     a2, 32, t0              // duplicate fill longword
+        bis     a2, t0, a2              // generate fill quadword
+
+.align 3                                // ensure quadword aligned target
+//
+// Fill memory with the pattern specified in register a2.
+//
+
+RtlpFillMemory:                         //
+
+//
+// Align destination to quadword
+//
+
+        beq     a1, 80f                 // anything to fill? (paranoia)
+        and     a0, 8-1, t0             // t0 = unaligned bits
+        bne     t0, 5f                  // if ne, then not quad aligned
+        br      zero, 20f               // if eq, then quad aligned
+
+5:
+        ldq_u   t1, 0(a0)               // get unaligned quadword
+                                        // for first group of bytes
+10:
+        beq     a1, 15f                 // if eq no more bytes to fill
+        insbl   a2, t0, t2              // get fill byte into position
+        mskbl   t1, t0, t1              // clear byte for fill
+        bis     t1, t2, t1              // put in fill byte
+        addq    t0, 1, t0               // increment to next byte position
+        subq    a1, 1, a1               // decrement bytes to fill
+        cmpeq   t0, 8, t2               // t0 = 8?
+        beq     t2, 10b                 // if eq [false] more bytes to do
+
+15:
+        stq_u   t1, 0(a0)               // store modified bytes
+        addq    a0, 7, a0               // move a0 to next quadword
+        bic     a0, 7, a0               // align a0 to quadword
+
+//
+// Check for 64-byte blocks
+//
+
+20:
+        srl     a1, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 40f                 // if eq then no 64 byte blocks
+        and     a1, 64-1, a1            // a1 = residual bytes to fill
+
+30:
+        stq     a2, 0(a0)               // store 64 bytes
+        stq     a2, 8(a0)               //
+        stq     a2, 16(a0)              //
+        stq     a2, 24(a0)              //
+        stq     a2, 32(a0)              //
+        stq     a2, 40(a0)              //
+        stq     a2, 48(a0)              //
+        stq     a2, 56(a0)              //
+
+        subq    t0, 1, t0               // decrement blocks remaining
+        addq    a0, 64, a0              // increment destination pointer
+        bne     t0, 30b                 // more blocks to write
+
+//
+// Fill aligned quadwords
+//
+
+40:
+        srl     a1, 3, t0               // t0 = number of quadwords
+        bne     t0, 55f                 // if ne quadwords left to fill
+        br      zero, 60f               // if eq no quadwords left
+
+55:
+        and     a1, 8-1, a1             // a1 = residual bytes to fill
+
+50:
+        stq     a2, 0(a0)               // store quadword
+        subq    t0, 1, t0               // decrement quadwords remaining
+        addq    a0, 8, a0               // next quadword
+        bne     t0, 50b                 // more quadwords to write
+
+//
+// Fill bytes for last quadword
+//
+
+60:
+        bne     a1, 65f                 // if ne bytes remain to be filled
+        br      zero, 80f               // if eq no more bytes to fill
+
+65:
+        ldq     t1, 0(a0)               // get last quadword
+        bis     zero, zero, t0          // t0 = byte position to start fill
+
+70:
+        beq     a1, 75f                 // if eq, no more bytes to fill
+        insbl   a2, t0, t2              // get fill byte into position
+        mskbl   t1, t0, t1              // clear fill byte position
+        bis     t1, t2, t1              // insert fill byte
+        addq    t0, 1, t0               // increment byte within quad
+        subq    a1, 1, a1               // decrement bytes to fill
+        cmpeq   t0, 8, t3               // t0 = 8? => finished quad
+        beq     t3, 70b                 // if eq [false] more bytes to fill
+
+75:
+        stq     t1, 0(a0)               // write merged quadword
+
+//
+// Finish up
+//
+
+80:
+        ret     zero, (ra)              // return
+
+        .end    RtlZeroMemory
+
+        SBTTL("Fill Memory Ulong")
+//++
+//
+// VOID
+// RtlFillMemoryUlong (
+//    IN PVOID Destination,
+//    IN ULONG Length,
+//    IN ULONG Pattern
+//    )
+//
+// Routine Description:
+//
+//    This function fills memory with the specified longword pattern by
+//    filling 64-byte blocks followed by 8-byte blocks and finally
+//    4-byte blocks.
+//
+//    N.B. This routine assumes that the destination address is aligned
+//         on a longword boundary and that the length is an even multiple
+//         of longwords.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to fill.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
+//
+//    Pattern (a2) - Supplies the fill pattern.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        LEAF_ENTRY(RtlFillMemoryUlong)
+
+        bic     a1, 3, a1               // make sure length is an even number
+                                        // of longwords
+        sll     a2, 32, a3              // a3 = long pattern in upper 32 bits
+        srl     a3, 32, t0              // clear upper bits, pattern in lower 32
+        bis     a3, t0, a3              // a3 = quad version of fill pattern
+
+//
+// Make destination address quad-aligned
+//
+
+        and     a0, 4, t0               // is a0 quad aligned?
+        beq     t0, 10f                 // if eq, then a0 quad aligned
+        stl     a2, 0(a0)               // fill first longword
+        addq    a0, 4, a0               // quad align a0
+        subq    a1, 4, a1               // bytes remaining to store
+
+//
+// Check for 64-byte blocks to fill
+//
+
+10:
+        srl     a1, 6, t0               // t0 = # 64-byte blocks to fill
+        beq     t0, 30f                 // if eq no 64 byte blocks
+        and     a1, 64-1, a1            // a1 = residual bytes
+
+20:
+        stq     a3, 0(a0)               // store 64 bytes
+        stq     a3, 8(a0)               //
+        stq     a3, 16(a0)              //
+        stq     a3, 24(a0)              //
+        stq     a3, 32(a0)              //
+        stq     a3, 40(a0)              //
+        stq     a3, 48(a0)              //
+        stq     a3, 56(a0)              //
+        subq    t0, 1, t0               // t0 = blocks remaining
+        addq    a0, 64, a0              // increment address pointer
+        bne     t0, 20b                 // if ne more blocks to fill
+
+//
+// Fill 8 bytes at a time while we can, a1 = bytes remaining
+//
+
+30:
+        srl     a1, 3, t0               // t0 = # quadwords to fill
+        beq     t0, 50f                 // if eq no quadwords left
+        and     a1, 8-1, a1             // a1 = residual bytes
+40:
+        stq     a3, 0(a0)               // store quadword
+        subq    t0, 1, t0               // t0 = quadwords remaining
+        addq    a0, 8, a0               // increment address pointer
+        bne     t0, 40b                 // if ne more quadwords to fill
+
+//
+// Fill last 4 bytes
+//
+
+50:
+        beq     a1, 60f                 // if eq no longwords remain
+        stl     a2, 0(a0)               // fill last longword
+
+//
+// Finish up
+//
+
+60:
+        ret     zero, (ra)              // return to caller
+
+        .end    RtlFillMemoryUlong
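RtlCopyBytes, next, differs from RtlMoveMemory only at the ragged head and tail of the buffer: instead of a plain read-modify-write of the bordering destination quadwords, it builds a byte mask and merges under LDQ_L/STQ_C so that neighboring bytes owned by other code are never transiently overwritten. The mask arithmetic in C (a model; the zap/zapnot instructions apply such byte masks in hardware):

    #include <stdint.h>

    /* Mask selecting the low n bytes of a quadword, 1 <= n <= 7, built the
       way RtlCopyBytes does at its tail: shift all-ones down, then back up. */
    static uint64_t low_bytes_mask(unsigned n)
    {
        uint64_t high = ~(uint64_t)0;   /* mov -1, t1    */
        unsigned bits = n * 8;          /* sll t0, 3, t0 */

        high = (high >> bits) << bits;  /* srl then sll: low n bytes cleared */
        return ~high;                   /* complement: low n bytes selected  */
    }

    /* The merge step: take the masked bytes from src, the rest from dst. */
    static uint64_t merge_bytes(uint64_t dst, uint64_t src, uint64_t mask)
    {
        return (dst & ~mask) | (src & mask);
    }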
Move +// bytes until done or source and destination are quadword aligned +// + + and a0, 0x7, t0 // t0 = unaligned bits + bne t0, 5f // if ne, not quad aligned + br zero, 20f // predicted taken +5: + bis zero, zero, t1 // t4 = destination byte zap mask + bis zero, 1, t2 + sll t2, t0, t2 // t2 = next bit to set in zap mask +10: + beq a2, 15f // if eq, all bits set + bis t1, t2, t1 // set bit in zap mask + sll t2, 1, t2 // set next higher bit for zap mask + subq a2, 1, a2 // decrement bytes to move + addq t0, 1, t0 // increment byte within quad + cmpeq t0, 8, t3 // finished the quadword? + beq t3, 10b // if eq [false], do next byte +15: + ldq_u t2, 0(a1) // get unaligned quadword from source + zapnot t2, t1, t2 // clear source bytes + bic a0, 7, a3 // a3 = quadword base of destination +retry1: + ldq_l t0, 0(a3) // load destination quadword + zap t0, t1, t0 // clear destination bytes + or t0, t2, t0 // merge in bytes from source + stq_c t0, 0(a3) // store merged quadword conditional + beq t0, retry1f // if eq, retry failed interlock + + addq a0, 7, a0 // move to next quadword + bic a0, 7, a0 // aligned quadword + + addq a1, 7, a1 // move to next quadword + bic a1, 7, a1 // aligned quadword + +// +// Check for 64-byte block moves +// + +20: + srl a2, 6, t0 // t0 = number of 64 byte blocks + beq t0, 40f // if eq no blocks to move + and a2, 64-1, a2 // a2 = residual bytes + +30: + ldq t1, 0(a1) // load 64 bytes from source + addq a0, 64, a0 // increment destination pointer + ldq v0, 56(a1) // + ldq a3, 32(a1) // + stq t1, -64(a0) // write to destination + ldq t2, 8(a1) // into volatile registers + ldq t3, 16(a1) // + ldq t4, 24(a1) // + subq t0, 1, t0 // decrement number of blocks + stq t2, -56(a0) // + ldq a4, 40(a1) // + stq t3, -48(a0) // + ldq a5, 48(a1) // + stq t4, -40(a0) // + addq a1, 64, a1 // increment source pointer + stq a3, -32(a0) // + stq a4, -24(a0) // + stq a5, -16(a0) // + stq v0, -8(a0) // + bne t0, 30b // if ne, more blocks to copy + +// +// Copy quadwords +// + +40: + srl a2, 3, t0 // t0 = number of quadwords to move + beq t0, 60f // if eq no quadwords to move + and a2, 8-1, a2 // a2 = residual bytes + +50: + ldq t1, 0(a1) // load quadword from source + addq a1, 8, a1 // increment source pointer + stq t1, 0(a0) // store quadword to destination + addq a0, 8, a0 // increment destination pointer + subq t0, 1, t0 // decrement number of quadwords + bne t0, 50b // if ne, more quadwords to move + +// +// Move final residual bytes +// + +60: + beq a2, 80f // if eq, no more bytes to move + mov a2, t0 // t0 = number of bytes to move + mov -1, t1 // t1 = bit mask + sll t0, 3, t0 // # of bytes to # of bits + srl t1, t0, t1 // clear t0 bits + sll t1, t0, t0 // move it back + ldq t1, 0(a1) // get last source quadword + bic t1, t0, t1 // clear bytes not copied + not t0, t0 // complement to clear destination +retry2: + ldq_l t2, 0(a0) // get last destination quadword locked + bic t2, t0, t2 // clear bytes to be copied + bis t2, t1, t2 // move bytes from source + stq_c t2, 0(a0) // store merged quadword conditional + beq t2, retry2f // if eq, retry failed interlock + +// +// Finish aligned MoveForward +// + +80: + ret zero, (ra) // return + +// +// Move memory forward unaligned. 
+// + +CopyForwardUnaligned: // + +// +// Move bytes until the destination is aligned +// + + and a0, 0x7, t0 // t0 = unaligned bits + beq t0, 100f // if eq, destination quad aligned + bis zero, zero, t1 // t4 = destination byte zap mask + bis zero, 1, t2 + sll t2, t0, t2 // t2 = next bit to set in zap mask + mov zero, t4 // assemble destination bytes here +90: + beq a2, 95f // if eq no more bytes to move + bis t1, t2, t1 // set bit in zap mask + sll t2, 1, t2 // set next higher bit for zap mask + ldq_u t5, 0(a1) // get unaligned quad from source + extbl t5, a1, t5 // extract source byte + insbl t5, t0, t5 // t5 = source byte, in position + or t4, t5, t4 // merge in source byte + addq t0, 1, t0 // increment byte position + addq a1, 1, a1 // increment source pointer + subq a2, 1, a2 // decrement bytes to move + cmpeq t0, 8, t3 // t0 = 8? => quad finished + beq t3, 90b // if eq [false], more bytes to move +95: + bic a0, 0x7, a3 // a3 = quadword base of destination +retry3: + ldq_l t0, 0(a3) // load destination quadword + zap t0, t1, t0 // clear destination bytes + or t0, t4, t0 // merge in bytes from source + stq_c t0, 0(a3) // store merged quadword conditional + beq t0, retry3f // if eq, retry failed interlock + + addq a0, 7, a0 // increment to next quad + bic a0, 7, a0 // align next quadword + +// +// Check for 64-byte blocks to move +// + +100: + srl a2, 6, t0 // t0 = number of blocks to move + beq t0, 120f // if eq no blocks to move + and a2, 64-1, a2 // a2 = residual bytes to move + + ldq_u t1, 0(a1) // t1 = first unaligned quad +110: + // get source data and merge it + // as we go + ldq_u t2, 7(a1) // t2 = second unaligned quad + extql t1, a1, t1 // extract applicable bytes from t1 + extqh t2, a1, v0 // extract applicable bytes from t2 + bis t1, v0, t1 // t1 = quad #1 + ldq_u t3, 15(a1) // t3 = third unaligned quad + extql t2, a1, t2 // extract applicable bytes from t2 + extqh t3, a1, v0 // extract applicable bytes from t3 + stq t1, 0(a0) // store quad #1 + bis t2, v0, t2 // t2 = quad #2 + ldq_u t4, 23(a1) // t4 = fourth unaligned quad + extql t3, a1, t3 // extract applicable bytes from t3 + extqh t4, a1, v0 // extract applicable bytes from t4 + stq t2, 8(a0) // store quad #2 + bis t3, v0, t3 // t3 = quad #3 + ldq_u t5, 31(a1) // t5 = fifth unaligned quad + extql t4, a1, t4 // extract applicable bytes from t4 + extqh t5, a1, v0 // extract applicable bytes from t5 + stq t3, 16(a0) // store quad #3 + bis t4, v0, t4 // t4 = quad #4 + ldq_u a3, 39(a1) // a3 = sixth unaligned quad + extql t5, a1, t5 // extract applicable bytes from t5 + extqh a3, a1, v0 // extract applicable bytes from a3 + stq t4, 24(a0) // store quad #4 + bis t5, v0, t5 // t5 = quad #5 + ldq_u a4, 47(a1) // a4 = seventh unaligned quad + extql a3, a1, a3 // extract applicable bytes from a3 + extqh a4, a1, v0 // extract applicable bytes from a4 + stq t5, 32(a0) // store quad #5 + bis a3, v0, a3 // a3 = quad #6 + ldq_u a5, 55(a1) // a5 = eighth unaligned quad + extql a4, a1, a4 // extract applicable bytes from a4 + extqh a5, a1, v0 // extract applicable bytes from a5 + stq a3, 40(a0) // store quad #6 + bis a4, v0, a4 // a4 = quad #7 + ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next + extql a5, a1, a5 // extract applicable bytes from a5 + extqh t1, a1, v0 // extract applicable bytes from t1 + stq a4, 48(a0) // store quad #7 + bis a5, v0, a5 // a5 = quad #8 + addq a1, 64, a1 // increment source pointer + stq a5, 56(a0) // store quad #8 + addq a0, 64, a0 // increment destination pointer + subq t0, 1, t0 // decrement 
number of blocks + bne t0, 110b // if ne, more blocks to move + +// +// Move unaligned source quads to aligned destination quads +// + +120: + srl a2, 3, t0 // t0 = number of quads to move + beq t0, 140f // if eq no quads to move + and a2, 8-1, a2 // a2 = residual bytes + + + ldq_u t1, 0(a1) // t1 = first unaligned quad +130: + ldq_u t2, 7(a1) // t2 = second unaligned quad + addq a0, 8, a0 // increment destination pointer + extql t1, a1, t1 // extract applicable bytes from t1 + extqh t2, a1, v0 // extract applicable bytes from t2 + bis t1, v0, t1 // t1 = quadword of data + stq t1, -8(a0) // store data to destination + addq a1, 8, a1 // increment source pointer + subq t0, 1, t0 // decrement quads to move + bis t2, zero, t1 // t1 = first of next unaligned pair + bne t0, 130b // if ne, more quads to move + +// +// Move remaining bytes to final quadword +// + +140: + beq a2, 160f // if eq no more bytes to move + + mov zero, t3 // t3 = position for next insertion + mov zero, t4 // assemble destination bytes here + mov a2, t0 // t0 = number of bytes to move + mov -1, t1 // t1 = bit mask + sll t0, 3, t0 // # of bytes to # of bits + srl t1, t0, t1 // clear t0 bits + sll t1, t0, t0 // move it back + not t0, t0 // complement for destination clear mask +150: + ldq_u t1, 0(a1) // get unaligned source quad + extbl t1, a1, t1 // t1 = source byte + insbl t1, t3, t1 // t1 = source byte, in position + bis t4, t1, t4 // merge in source byte + addq a1, 1, a1 // increment source pointer + subq a2, 1, a2 // decrement bytes to move + addq t3, 1, t3 // increment destination position + bne a2, 150b // more bytes to move +retry4: + ldq_l t2, 0(a0) // get last destination quadword locked + bic t2, t0, t2 // clear bytes to be copied + bis t2, t4, t2 // move bytes from source + stq_c t2, 0(a0) // store merged quadword conditional + beq t2, retry4f // if eq, retry failed interlock + +// +// Finish unaligned MoveForward +// + +160: + ret zero, (ra) // return + +// +// Out of line branches for failed store conditional. +// Don't need to restore anything, just try again. +// + +retry1f: + br retry1 +retry2f: + br retry2 +retry3f: + br retry3 +retry4f: + br retry4 + + .end RtlCopyBytes + + SBTTL("Zero Bytes") +//++ +// +// VOID +// RtlZeroBytes ( +// IN PVOID Destination, +// IN ULONG Length +// ) +// +// Routine Description: +// +// This function zeros memory by first aligning the destination address to +// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte +// blocks, followed by any remaining bytes. Unlike RtlZeroMemory the copy is +// done such that byte granularity is assured for all platforms. +// +// Arguments: +// +// Destination (a0) - Supplies a pointer to the memory to zero. +// +// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed. +// +// Return Value: +// +// None. +// +//-- + + LEAF_ENTRY(RtlZeroBytes) + + bis zero, zero, a2 // set fill pattern + br zero, RtlpFillBytes // + + + SBTTL("Fill Bytes") +//++ +// +// VOID +// RtlFillBytes ( +// IN PVOID Destination, +// IN ULONG Length, +// IN UCHAR Fill +// ) +// +// Routine Description: +// +// This function fills memory by first aligning the destination address to +// a longword boundary, and then filling 32-byte blocks, followed by 4-byte +// blocks, followed by any remaining bytes. Unlike RtlFillMemory the copy is +// done such that byte granularity is assured for all platforms. +// +// Arguments: +// +// Destination (a0) - Supplies a pointer to the memory to fill. 
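The retry1..retry4 stubs above (and retry5/retry6 below) are the Alpha load-locked/store-conditional pattern: if any other agent writes the quadword between LDQ_L and STQ_C, the conditional store fails and the merge is simply redone, which is what makes the byte-granularity guarantee hold on multiprocessors. A C11 sketch of the same guarantee expressed with compare-and-swap (an analogy to, not a transliteration of, the Alpha sequence):

    #include <stdatomic.h>
    #include <stdint.h>

    /* Store only the bytes selected by mask into a shared quadword, redoing
       the merge if the word changed underneath us (ldq_l/stq_c analogue). */
    static void store_bytes_atomic(_Atomic uint64_t *quad, uint64_t src, uint64_t mask)
    {
        uint64_t old = atomic_load(quad);               /* ldq_l            */
        uint64_t merged;

        do {
            merged = (old & ~mask) | (src & mask);      /* zap + or merge   */
        } while (!atomic_compare_exchange_weak(quad, &old, merged));
                                                        /* stq_c; beq retry */
    }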
+        SBTTL("Zero Bytes")
+//++
+//
+// VOID
+// RtlZeroBytes (
+//    IN PVOID Destination,
+//    IN ULONG Length
+//    )
+//
+// Routine Description:
+//
+//    This function zeros memory by first aligning the destination address to
+//    a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
+//    blocks, followed by any remaining bytes. Unlike RtlZeroMemory the copy is
+//    done such that byte granularity is assured for all platforms.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to zero.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        LEAF_ENTRY(RtlZeroBytes)
+
+        bis     zero, zero, a2          // set fill pattern
+        br      zero, RtlpFillBytes     //
+
+        SBTTL("Fill Bytes")
+//++
+//
+// VOID
+// RtlFillBytes (
+//    IN PVOID Destination,
+//    IN ULONG Length,
+//    IN UCHAR Fill
+//    )
+//
+// Routine Description:
+//
+//    This function fills memory by first aligning the destination address to
+//    a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
+//    blocks, followed by any remaining bytes. Unlike RtlFillMemory the copy is
+//    done such that byte granularity is assured for all platforms.
+//
+// Arguments:
+//
+//    Destination (a0) - Supplies a pointer to the memory to fill.
+//
+//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
+//
+//    Fill (a2) - Supplies the fill byte.
+//
+//    N.B. The alternate entry memset expects the length and fill arguments
+//         to be reversed. It also returns the Destination pointer.
+//
+// Return Value:
+//
+//    None.
+//
+//--
+
+        ALTERNATE_ENTRY(RtlFillBytes)
+
+        and     a2, 0xff, a2            // clear excess bits
+        sll     a2, 8, t0               // duplicate fill byte
+        bis     a2, t0, a2              // generate fill word
+        sll     a2, 16, t0              // duplicate fill word
+        bis     a2, t0, a2              // generate fill longword
+        sll     a2, 32, t0              // duplicate fill longword
+        bis     a2, t0, a2              // generate fill quadword
+
+.align 3                                // ensure quadword aligned target
+//
+// Fill memory with the pattern specified in register a2.
+//
+
+RtlpFillBytes:                          //
+
+//
+// Align destination to quadword
+//
+
+        beq     a1, 80f                 // anything to fill? (paranoia)
+        and     a0, 8-1, t0             // t0 = unaligned bits
+        bne     t0, 5f                  // if ne, then not quad aligned
+        br      zero, 20f               // if eq, then quad aligned
+
+5:
+        bis     zero, zero, t1          // t1 = destination byte zap mask
+        bis     zero, 1, t2             //
+        sll     t2, t0, t2              // t2 = next bit to set in zap mask
+10:
+        beq     a1, 15f                 // if eq, all bits set
+        bis     t1, t2, t1              // set bit in zap mask
+        sll     t2, 1, t2               // set next higher bit for zap mask
+        subq    a1, 1, a1               // decrement bytes to fill
+        addq    t0, 1, t0               // increment byte within quad
+        cmpeq   t0, 8, t3               // finished the quadword?
+        beq     t3, 10b                 // if eq [false], do next byte
+15:
+        zapnot  a2, t1, t2              // clear fill bytes
+        bic     a0, 7, a3               // a3 = quadword base of destination
+retry5:
+        ldq_l   t0, 0(a3)               // load destination quadword
+        zap     t0, t1, t0              // clear destination bytes
+        or      t0, t2, t0              // merge in fill bytes
+        stq_c   t0, 0(a3)               // store merged quadword conditional
+        beq     t0, retry5f             // if eq, retry failed interlock
+
+        addq    a0, 7, a0               // move a0 to next quadword
+        bic     a0, 7, a0               // align a0 to quadword
+
+//
+// Check for 64-byte blocks
+//
+
+20:
+        srl     a1, 6, t0               // t0 = number of 64 byte blocks
+        beq     t0, 40f                 // if eq then no 64 byte blocks
+        and     a1, 64-1, a1            // a1 = residual bytes to fill
+
+30:
+        stq     a2, 0(a0)               // store 64 bytes
+        stq     a2, 8(a0)               //
+        stq     a2, 16(a0)              //
+        stq     a2, 24(a0)              //
+        stq     a2, 32(a0)              //
+        stq     a2, 40(a0)              //
+        stq     a2, 48(a0)              //
+        stq     a2, 56(a0)              //
+
+        subq    t0, 1, t0               // decrement blocks remaining
+        addq    a0, 64, a0              // increment destination pointer
+        bne     t0, 30b                 // more blocks to write
+
+//
+// Fill aligned quadwords
+//
+
+40:
+        srl     a1, 3, t0               // t0 = number of quadwords
+        bne     t0, 55f                 // if ne quadwords left to fill
+        br      zero, 60f               // if eq no quadwords left
+
+55:
+        and     a1, 8-1, a1             // a1 = residual bytes to fill
+
+50:
+        stq     a2, 0(a0)               // store quadword
+        subq    t0, 1, t0               // decrement quadwords remaining
+        addq    a0, 8, a0               // next quadword
+        bne     t0, 50b                 // more quadwords to write
+
+//
+// Fill bytes for last quadword
+//
+
+60:
+        beq     a1, 80f                 // if eq no more bytes to fill
+
+        mov     a1, t0                  // t0 = number of bytes to move
+        mov     -1, t1                  // t1 = bit mask
+        sll     t0, 3, t0               // # of bytes to # of bits
+        srl     t1, t0, t1              // clear t0 bits
+        sll     t1, t0, t0              // move it back
+        bic     a2, t0, t1              // clear fill bytes not copied
+        not     t0, t0                  // complement to clear destination
+retry6:
+        ldq_l   t2, 0(a0)               // get last destination quadword locked
+        bic     t2, t0, t2              // clear bytes to be copied
+        bis     t2, t1, t2              // move bytes from source
+        stq_c   t2, 0(a0)               // store merged quadword conditional
+        beq     t2, retry6f             // if eq, retry failed interlock
+
+//
+// Finish up
+//
+
+80:
+        ret     zero, (ra)              // return
+
+//
+// Out of line branches for failed store conditional.
+// Don't need to restore anything, just try again.
+//
+
+retry5f:
+        br      retry5
+retry6f:
+        br      retry6
+
+        .end    RtlZeroBytes