path: root/private/ntos/rtl/ppc/largeint.s



//      TITLE("Large Integer Arithmetic")
//++
//
// Copyright (c) 1993  IBM Corporation
//
// Module Name:
//
//    largeint.s
//
// Abstract:
//
//    This module implements routines for performing extended integer
//    arithmetic.
//
// Author:
//
//    David N. Cutler (davec) 18-Apr-1990
//    Converted to PowerPC by Walt Daniels and Norman Cohen Aug 93
//       (from MIPS based code)
//
// References:
//    See PowerPC Architecture book Appendix E.2 for 64-bit shifts
//    See "Hacker's Delight", Hank Warren, Nov. 91 for fancy divides
//
// Environment:
//
//    Any mode.
//
// Revision History:
//    Fixed RtlExtendedLargeIntegerDivide       (Steve Johns) 18-Feb-94
//       - if divisor >= 2^16 && dividend >= 2^32, quotient incorrect
//       - also, removed 6 unnecessary occurrences of CMPI
//
//--

#include "ksppc.h"


//++
//
// LARGE_INTEGER
// RtlLargeIntegerAdd (
//    IN LARGE_INTEGER Addend1,
//    IN LARGE_INTEGER Addend2
//    )
//
// Routine Description:
//
//    This function adds a signed large integer to a signed large integer and
//    returns the signed large integer result.
//
// Arguments:
//
//    Addend1 (r.5, r.6) - Supplies the first addend value.
//
//    Addend2 (r.7, r.8) - Supplies the second addend value.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlLargeIntegerAdd)

// Form the 64-bit sum one word at a time.  The low-word add leaves its
// carry-out in XER[CA]; the store that follows does not disturb CA, so the
// high-word add still picks that carry up.
        addc    r.5,r.5,r.7             // low word of sum, carry-out to CA
        stw     r.5,0(r.3)              // result low word -> 0(r.3)
        adde    r.6,r.6,r.8             // high word of sum plus CA
        stw     r.6,4(r.3)              // result high word -> 4(r.3)
        LEAF_EXIT(RtlLargeIntegerAdd)   // return


//++
//
// LARGE_INTEGER
// RtlConvertLongToLargeInteger (
//     IN LONG SignedInteger
//     )
//
// Routine Description:
//
//     This function converts a signed integer to a signed large integer
//     and returns the result.
//
// Arguments:
//
//     SignedInteger (r.4) - Supplies the value to convert.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlConvertLongToLargeInteger)

// The low word of the result is the argument unchanged; the high word is
// the argument's sign bit replicated through all 32 bit positions.
        stw     r.4,0(r.3)              // result low word <- SignedInteger
        srawi   r.5,r.4,31              // r.5 <- 0 or 0xFFFFFFFF (sign fill)
        stw     r.5,4(r.3)              // result high word <- sign extension
        LEAF_EXIT(RtlConvertLongToLargeInteger)  // return


//++
//
// LARGE_INTEGER
// RtlConvertUlongToLargeInteger (
//     IN ULONG UnsignedInteger
//     )
//
// Routine Description:
//
//     This function converts an unsigned integer to a signed large
//     integer and returns the result.
//
// Arguments:
//
//     UnsignedInteger (r.4) - Supplies the value to convert.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlConvertUlongToLargeInteger)

// Zero-extend: the low word of the result is the argument unchanged and
// the high word is always zero.
        stw     r.4,0(r.3)              // result low word <- UnsignedInteger
        li      r.5,0                   // r.5 <- 0
        stw     r.5,4(r.3)              // result high word <- 0
        LEAF_EXIT(RtlConvertUlongToLargeInteger) // return


//++
//
// LARGE_INTEGER
// RtlEnlargedIntegerMultiply (
//    IN LONG Multiplicand,
//    IN LONG Multiplier
//    )
//
// Routine Description:
//
//    This function multiplies a signed integer by a signed integer and
//    returns a signed large integer result.
//
// Arguments:
//
//    Multiplicand (r.4) - Supplies the multiplicand value.
//
//    Multiplier (r.5) - Supplies the multiplier value.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlEnlargedIntegerMultiply)

// Signed 32 x 32 -> 64 multiply.  The two halves of the product come from
// two independent multiply instructions on the same operands.
        mulhw   r.7,r.4,r.5             // r.7 <- upper word of signed product
        mullw   r.6,r.4,r.5             // r.6 <- lower word of product
        stw     r.6,0(r.3)              // result low word -> 0(r.3)
        stw     r.7,4(r.3)              // result high word -> 4(r.3)
        LEAF_EXIT(RtlEnlargedIntegerMultiply) // return


//++
//
// LARGE_INTEGER
// RtlEnlargedUnsignedMultiply (
//    IN ULONG Multiplicand,
//    IN ULONG Multiplier
//    )
//
// Routine Description:
//
//    This function multiplies an unsigned integer by an unsigned integer
//    and returns a signed large integer result.
//
// Arguments:
//
//    Multiplicand (r.4) - Supplies the multiplicand value.
//
//    Multiplier (r.5) - Supplies the multiplier value.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlEnlargedUnsignedMultiply)

// Unsigned 32 x 32 -> 64 multiply.  The low word is the same for signed and
// unsigned operands (mullw); only the high word needs the unsigned form.
        mulhwu  r.7,r.4,r.5             // r.7 <- upper word of unsigned product
        mullw   r.6,r.4,r.5             // r.6 <- lower word of product
        stw     r.6,0(r.3)              // result low word -> 0(r.3)
        stw     r.7,4(r.3)              // result high word -> 4(r.3)
        LEAF_EXIT(RtlEnlargedUnsignedMultiply) // return


//++
//
// ULONG
// RtlEnlargedUnsignedDivide (
//    IN ULARGE_INTEGER Dividend,
//    IN ULONG Divisor,
//    IN PULONG Remainder.
//    )
//
// Routine Description:
//
//    This function divides an unsigned large integer by an unsigned long
//    and returns the resultant quotient and optionally the remainder.
//
//    N.B. It is assumed that no overflow will occur.
//
// Arguments:
//
//    Dividend (r.3, r.4) - Supplies the dividend value.
//       (High-order bits in r.4, low-order bits in r.3)
//
//    Divisor (r.5) - Supplies the divisor value.
//
//    Remainder (r.6) - Supplies an optional pointer to a variable that
//        receives the remainder. Ptr is null if not needed.
//
// Return Value:
//
//    The unsigned long integer quotient is returned as the function value.
//
//--

        LEAF_ENTRY(RtlEnlargedUnsignedDivide)

// Overview: a 64-by-32 unsigned divide producing a 32-bit quotient as two
// 16-bit digits Q0 (high) and Q1 (low).  Each digit is first estimated with
// divwu against the top halfword of the normalized divisor and then
// corrected downward while the partial remainder is negative.
//
// Register usage:
//    r.3        dividend low word on entry; quotient on exit
//    r.4        dividend high word
//    r.5        divisor (normalized in place in the general case)
//    r.6        optional pointer to remainder (null if not wanted)
//    r.7        normalization shift count N
//    r.8        B0, the high halfword of the normalized divisor
//    r.9, r.10  scratch (product words, correction addend)
//    r.11       Q1
//    r.12       Q0
        cmplw   r.4,r.5
        bge     overflow                // catch overflow or division by 0
        cmplwi  r.4,0                   // test high part for 0
        beq     only_32_bits            // 32-bit division suffices
// Normalize:  Shift divisor and dividend left to get rid of leading zeroes
// in the divisor.  Since r.4 < r.5, only zeroes are shifted out of the
// dividend.
        cntlzw  r.7,r.5                // number of bits to shift (N)
        slw     r.5,r.5,r.7            // shift divisor
        slw     r.4,r.4,r.7            // shift upper part of dividend
        subfic  r.9,r.7,32             // 32-N
        srw     r.9,r.3,r.9            // leftmost N bits of r.3, slid right
        or      r.4,r.4,r.9            //   and inserted into low end of r.4
        slw     r.3,r.3,r.7            // shift lower part of dividend
// Estimate high-order halfword of quotient.  If the dividend is
// A0 A1 A2 A3 and the divisor is B0 B1  (where each Ai or Bi is a halfword),
// then the estimate is A0 A1 0000 divided by B0 0000, or A0 A1 divided by B0.
// (r.4 holds A0 A1, r.3 holds A2 A3, and r.5 holds B0 B1.)
// The estimate may be too high because it does not account for B1; in rare
// cases, the estimate will not even fit in a halfword.  High estimates are
// corrected for later.
        srwi    r.8,r.5,16             // r.8 <- B0
        divwu   r.12,r.4,r.8           // r.12 <- floor([A0 A1]/B0)
// Subtract partial quotient times divisor from dividend: If Q0 is the quotient
// computed above, this means that Q0 0000 times B0 B1 is subtracted from
// A0 A1 A2 A3.  We compute Q0 times B0 B1 and then shift the two-word
// product left 16 bits.
        mullw   r.9,r.12,r.5           // low word of Q0 times B0 B1
        mulhwu  r.10,r.12,r.5          // high word of Q0 times B0 B1
        slwi    r.10,r.10,16           // shift high word left 16 bits
        inslwi  r.10,r.9,16,16         // move 16 bits from left of low word to
                                       //   right of high word
        slwi    r.9,r.9,16             // shift low word left 16 bits
        subfc   r.3,r.9,r.3            // low word of difference
        subfe   r.4,r.10,r.4           // high word of difference
// If the estimate for Q0 was too high, the difference will be negative.
// While A0 A1 A2 A3 is negative, repeatedly add B0 B1 0000 to A0 A1 A2 A3
// and decrement Q0 by one to correct for the overestimate.
        cmpwi   r.4,0                  // A0 A1 A2 A3 is negative iff A0 A1 is
        bge     Q0_okay                // no correction needed
// Build the two-word addend B0 B1 0000.  Its high word must be exactly
// 0000 B0.  r.10 still holds the shifted product from above, so the whole
// register is rewritten with a shift; inserting only the low halfword would
// leave stale product bits in the upper halfword and corrupt the sum.
        srwi    r.10,r.5,16            // high word of B0 B1 0000 (= 0000 B0)
        slwi    r.9,r.5,16             // low word of B0 B1 0000 (= B1 0000)
adjust_Q0:
        addc   r.3,r.3,r.9             // add B0 B1 0000 to A0 A1 A2 A3 (low)
        adde   r.4,r.4,r.10            // add B0 B1 0000 to A0 A1 A2 A3 (high)
        cmpwi  r.4,0                   // Is A0 A1 A2 A3 now nonnegative?
        addi   r.12,r.12,-1            // decrement Q0 (does not alter cr.0)
        blt    adjust_Q0               // if A0 A1 A2 A3 still negative, repeat
Q0_okay:
// Estimate low-order halfword of quotient.  A0 is necessarily 0000 at this
// point, so if the remaining part of the dividend is A0 A1 A2 A3 then the
// estimate is A1 A2 0000 divided by B0 0000, or A1 A2 divided by B0.
// (r.4 holds A0 A1, r.3 holds A2 A3, and r.8 holds B0.)
        slwi    r.9,r.4,16             // r.9 <- A1 0000
        inslwi  r.9,r.3,16,16          // r.9 <- A1 A2
        divwu   r.11,r.9,r.8           // r.11 <- floor([A1 A2]/B0)
// Subtract partial quotient times divisor from remaining part of dividend:
// If Q1 is the quotient computed above, this means
// that Q1 times B0 B1 is subtracted from A0 A1 A2 A3.  We compute the
// two-word product Q1 times B0 B1; no shift is needed for this digit.
        mullw   r.9,r.11,r.5           // low word of Q1 times B0 B1
        mulhwu  r.10,r.11,r.5          // high word of Q1 times B0 B1
        subfc   r.3,r.9,r.3            // low word of difference
        subfe   r.4,r.10,r.4           // high word of difference
// If the estimate for Q1 was too high, the difference will be negative.
// While A0 A1 A2 A3 is negative, repeatedly add B0 B1 to A0 A1 A2 A3
// and decrement Q1 by one to correct for the overestimate.
        cmpwi   r.4,0                  // A0 A1 A2 A3 is negative iff A0 A1 is
        bge     Q1_okay                // no correction needed
adjust_Q1:
        addc   r.3,r.3,r.5             // add B0 B1 to A0 A1 A2 A3 (low)
        addze  r.4,r.4                 // add B0 B1 to A0 A1 A2 A3 (high)
        cmpwi  r.4,0                   // Is A0 A1 A2 A3 now nonnegative?
        addi   r.11,r.11,-1            // decrement Q1 (does not alter cr.0)
        blt    adjust_Q1               // if A0 A1 A2 A3 still negative, repeat
Q1_okay:
// Build the results.  The desired quotient is Q0 Q1.
// The desired remainder is obtained by shifting A2 A3 right by the number
// of bits by which the dividend and divisor were shifted left in the
// normalization step.  The number of bits shifted is still in r.7.
        cmplwi r.6,0                   // remainder needed?
        bne    rem1                    // if so, go compute it
        slwi   r.3,r.12,16             // r.3 <- Q0 0000
        or     r.3,r.3,r.11            // r.3 <- Q0 Q1
        blr
rem1:
        srw    r.8,r.3,r.7             // remainder <- [A2 A3] >> (r.7)
        slwi   r.3,r.12,16             // r.3 <- Q0 0000
        stw    r.8,0(r.6)              // store remainder
        or     r.3,r.3,r.11            // r.3 <- Q0 Q1
        blr
//
// End of normal case
//
// The case of a 32-bit dividend (high word zero, hence divisor nonzero):
only_32_bits:
        cmplwi  r.6,0                   // remainder needed?
        bne     rem2                    // if so, go compute quotient+remainder
        divwu   r.3,r.3,r.5             // result <- dividend/divisor
        blr
rem2:
        divwu   r.7,r.3,r.5             // quotient <- dividend / divisor
        mullw   r.8,r.7,r.5             // r.8 <- quotient * divisor
        subf    r.8,r.8,r.3             // remainder<-dividend-quotient*divisor
        mr      r.3,r.7                 // result <- quotient
        stw     r.8,0(r.6)              // store remainder
        blr
// The error cases (dividend high word >= divisor).  The first trap fires
// when the divisor is zero, the second when it is nonzero, so one of the
// two is always taken.
overflow:
        twi     6,r.5,0                 // trap if divide by zero
        twi     0x1b,r.5,0              // trap on overflow

        LEAF_EXIT(RtlEnlargedUnsignedDivide)


//++
//
// ULARGE_INTEGER
// RtlExtendedLargeIntegerDivide (
//    IN ULARGE_INTEGER Dividend,
//    IN ULONG Divisor,
//    IN PULONG Remainder.
//    )
//
// Routine Description:
//
//    This function divides an unsigned large integer by an unsigned long
//    and returns the resultant quotient and optionally the remainder.
//
// Arguments:
//
//    Dividend (r.5, r.6) - Supplies the dividend value.
//
//    Divisor (r.7) - Supplies the divisor value.
//
//    Remainder (r.8)- Supplies an optional pointer to a variable
//      that receives the remainder.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlExtendedLargeIntegerDivide)

// Three paths are taken depending on the operand sizes:
//    1. dividend high word is zero       -> a single 32-bit divwu
//    2. divisor fits in 16 bits          -> short division, four 16-bit digits
//    3. otherwise                        -> normalized long division producing
//                                           three 16-bit quotient digits
// In every path the quotient is stored through r.3 and the remainder is
// stored through r.8 only when r.8 is non-null.
        cmplwi  r.7,0                   // zero divisor?
        beq     div_zero_s              // if so, branch to error exit
        cmpwi   r.6,0                   // is dividend high word zero?
        bne     big_dividend            // if nonzero, 32-bit divide is not enough

// The high-order word of the dividend is zero, so 32-bit unsigned division
// can be used.
        li      r.12,0                  // upper word of quotient is zero
        stw     r.12,4(r.3)             // store upper word of quotient
        divwu   r.11,r.5,r.7            // compute lower word of quotient
        stw     r.11,0(r.3)             // store lower word of quotient
        cmplwi  r.8,0                   // remainder needed?
        beqlr                           // if not, return
        mullw   r.10,r.11,r.7           // quotient * divisor
        subf    r.9,r.10,r.5            // dividend - quotient * divisor
        stw     r.9,0(r.8)              // store remainder
        blr                             // return

big_dividend:
        srwi.   r.0,r.7,16              // upper 16 bits of divisor (sets cr.0)
        bne     long_division           // if nonzero, must use long division

// The divisor is only one 16-bit digit long, so use short division:
// each step divides [previous remainder : next 16-bit dividend digit] by the
// divisor.  Because every remainder is less than the divisor (< 2^16), each
// step's quotient fits in 16 bits.
        srwi    r.0,r.6,16              // first 16-bit digit of dividend
        divwu   r.4,r.0,r.7             // first 16-bit digit of quotient
        mullw   r.10,r.4,r.7            // amount to subtract for remainder
        subf    r.9,r.10,r.0            // remainder from first digit
        insrwi  r.6,r.9,16,0            // combine rmndr with 2nd digit of dvdnd
        divwu   r.12,r.6,r.7            // second digit of quotient
        insrwi  r.12,r.4,16,0           // high two quotient digits in one word
// N.B. r.12 now also carries the first quotient digit in its upper halfword,
// so the product below differs from (second digit * divisor) by a multiple
// of 2^16.  Only the low 16 bits of the resulting r.9 are consumed (by the
// insrwi into r.0), so the extra term is harmless.
        mullw   r.10,r.12,r.7           // amount to subtract for remainder
        subf    r.9,r.10,r.6            // remainder from second digit
        srwi    r.0,r.5,16              // third digit of dividend
        insrwi  r.0,r.9,16,0            // combine rmndr with 3rd digit of dvdnd
        divwu   r.4,r.0,r.7             // third digit of quotient
        mullw   r.10,r.4,r.7            // amount to subtract for remainder
        subf    r.9,r.10,r.0            // remainder from third digit
        insrwi  r.5,r.9,16,0            // combine rmndr with 4th digit of dvdnd
        divwu   r.11,r.5,r.7            // fourth digit of quotient
        mullw   r.10,r.11,r.7           // amount to subtract for remainder
        insrwi  r.11,r.4,16,0           // low two quotient digits in one word
        subf    r.9,r.10,r.5            // remainder from fourth digit
        b       store_results

long_division:
// Since the divisor is more than one 16-bit digit long, the quotient will
// be of the form 0x0000 Q2 Q3 Q4, where each of Q2, Q3, and Q4 is a 16-bit
// digit.
//
// Normalize the divisor and dividend so that the high-order bit of the
// divisor is 1.  This normalization must be undone after the division to
// compute the remainder.  Let U1 U2 U3 U4 be the 16-bit digits of the
// unnormalized dividend.  Each digit Ui consists of an S-bit high-order part
// UiH and a (16-S)-bit low-order part UiL, where S is the number of leading
// zeroes in the divisor.  Thus R6 holds U1H U1L U2H U2L and R5 holds
// U3H U3L U4H U4L.  Let N0 N1 N2 N3 N4 be the 16-bit digits of the  normalized
// dividend: N0 = U1H, N1 = U1L U2H, N2 = U2L U3H, N3 = U3L U4H, N4 = U4L 0...0.
// Let D1 D2 be the normalized divisor.
        cntlzw  r.0,r.7                 // number of bits to shift left (S)
        subfic  r.12,r.0,16             // 16-S
        subfic  r.11,r.0,32             // 32-S
        srw     r.9,r.6,r.12            // U1H U1L U2H = N0 N1
        srw     r.4,r.5,r.11            // U3H (zero when S = 0: shift of 32)
        slw     r.10,r.6,r.0            // U1L U2H U2L 0...0
        or      r.6,r.4,r.10            // U1L U2H U2L U3H = N1 N2
        slw     r.4,r.5,r.0             // U3L U4H U4L 0...0 = N3 N4
        slw     r.7,r.7,r.0             // normalized divisor (D1 D2)
        srwi    r.11,r.7,16             // D1
// Set Q2 = [N0 N1 N2] / [D1 D2].  Start by guessing Q2 = [N0 N1] / D1, then
// adjust if necessary.  This guess will occasionally be one too high and
// very rarely two too high, but never higher than that and never too low.
// (See Theorem B in Section 4.3.1 of Knuth, Vol. II, pp. 256-7.)
// Let C N0' N1' N2' be the partial remainder [N0 N1 N2] - Q2 * [D1 D2].
        divwu   r.12,r.9,r.11           // guess Q2 = [N0 N1] / D1
        srwi    r.10,r.9,16             // 0x0000 N0
        mullw   r.5,r.12,r.7            // low word of Q2 * [D1 D2]
        subfc   r.9,r.5,r.6             // low word of C N0' N1' N2' (N1' N2')
        mulhwu  r.5,r.12,r.7            // high word of Q2 * [D1 D2]
        subfe.  r.10,r.5,r.10           // high word of C N0' N1' N2' (C N0')
        bge     Q2_okay                 // if difference is >= 0, Q2 was not too high
adjust_Q2:
        addc    r.9,r.9,r.7             // low word of [C N0' N1' N2']+[D1 D2]
        addze.  r.10,r.10               // high word of [C N0' N1' N2']+[D1 D2]
        addi    r.12,r.12,-1            // Q2 - 1 (cr.0 from addze. is untouched)
        blt     adjust_Q2               // try again if still negative
Q2_okay:
// At this point 0 <= [C N0' N1' N2'] < [D1 D2], so [C N0'] = 0x00000000.
// r.12 holds the upper word of the quotient, 0x0000 Q2.
// Set Q3 = [N1' N2' N3] / [D1 D2] by guessing and adjusting as above.
// Let [N0" N1" N2" N3"] be the partial remainder [N1' N2' N3] - Q3 * [D1 D2].
        divwu   r.6,r.9,r.11            // guess Q3 = [N1' N2'] / D1
        srwi    r.10,r.9,16             // 0x0000 N1'
        slwi    r.9,r.9,16              // N2' 0x0000
        inslwi  r.9,r.4,16,16           // N2' N3
        mullw   r.5,r.6,r.7             // low word of Q3 * [D1 D2]
        subfc   r.9,r.5,r.9             // N2" N3"
        mulhwu  r.5,r.6,r.7             // high word of Q3 * [D1 D2]
        subfe.  r.10,r.5,r.10           // N0" N1"
        bge     Q3_okay                 // if difference >= 0, Q3 was not too high
adjust_Q3:
        addc    r.9,r.9,r.7             // low word of [N0" N1" N2" N3"]+[D1 D2]
        addze.  r.10,r.10               // high word [N0" N1" N2" N3"]+[D1 D2]
        addi    r.6,r.6,-1              // Q3 - 1 (cr.0 from addze. is untouched)
        blt     adjust_Q3               // try again if difference still negative
Q3_okay:
// At this point 0 <= [N0" N1" N2" N3"] < [D1 D2], so [N0" N1"] = 0x00000000.
// Set Q4 = [N2" N3" N4] / [D1 D2] by guessing and adjusting as above.
// Let [R1 R2 R3 R4] be the partial remainder [N2" N3" N4] - Q4 * [D1 D2].
        divwu   r.11,r.9,r.11           // guess Q4 = [N2" N3"] / D1
        insrwi  r.4,r.9,16,0            // N3" N4
        srwi    r.10,r.9,16             // 0x0000 N2"
        mullw   r.5,r.11,r.7            // low word of Q4 * [D1 D2]
        subfc   r.9,r.5,r.4             // R3 R4
        mulhwu  r.5,r.11,r.7            // high word of Q4 * [D1 D2]
        subfe.  r.10,r.5,r.10           // R1 R2
        bge     Q4_okay                 // if difference >= 0, Q4 was not too high
adjust_Q4:
        addc    r.9,r.9,r.7             // low word of [R1 R2 R3 R4]+[D1 D2]
        addze.  r.10,r.10               // high word of [R1 R2 R3 R4]+[D1 D2]
        addi    r.11,r.11,-1            // Q4 - 1 (cr.0 from addze. is untouched)
        blt     adjust_Q4               // try again if partial remainder still negative
Q4_okay:
// At this point 0 <= [R1 R2 R3 R4] < [D1 D2], so [R3 R4] is the remainder.
        insrwi  r.11,r.6,16,0           // low word of quotient: Q3 Q4
        srw     r.9,r.9,r.0             // unnormalize remainder

// Common exit for the short- and long-division paths:
// r.11 = quotient low word, r.12 = quotient high word, r.9 = remainder.
store_results:
        stw     r.11,0(r.3)             // store low word of quotient
        stw     r.12,4(r.3)             // store high word of quotient
        cmplwi  r.8,0                   // remainder needed?
        beqlr                           // if not, return
        stw     r.9,0(r.8)              // store remainder
        blr                             // return

// Reached only when r.7 (the divisor) is zero, so the "equal to 0" condition
// selected by the trap operand is satisfied.
div_zero_s:
        twi     6,r.7,0                 // Trap on divide by zero

        LEAF_EXIT(RtlExtendedLargeIntegerDivide)


//++
//
// LARGE_INTEGER
// RtlExtendedMagicDivide (
//    IN LARGE_INTEGER Dividend,
//    IN ULARGE_INTEGER MagicDivisor,
//    IN CCHAR ShiftCount
//    )
//
// Routine Description:
//
//    This function divides a signed large integer by an unsigned large integer
//    and returns the signed large integer result. The division is performed
//    using reciprocal multiplication of a signed large integer value by an
//    unsigned large integer fraction which represents the most significant
//    64-bits of the reciprocal divisor rounded up in its least significant bit
//    and normalized with respect to bit 63. A shift count is also provided
//    which is used to truncate the fractional bits from the result value.
//    The value returned is the most significant 64 bits of the product
//    Dividend*MagicDivisor, shifted right ShiftCount bits.
//
// Arguments:
//
//    Dividend (r.5, r.6) - Supplies the dividend value.
//
//    MagicDivisor (r.7, r.8) - Supplies the magic divisor value
//       which is a 64-bit multiplicative reciprocal.
//
//    Shiftcount (r.9) - Supplies the right shift adjustment value,
//       assumed to be in the range 0 to 63.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--
// Let Dividend = A B and MagicDivisor = C D, where each of A, B, C, and D is
// a 32-bit word.  Then Dividend*MagicDivisor is a 128-bit product, computed
// as follows:
//                                                A              B
//                                   x            C              D
//            ==========================================================
//                                          high_word(B*D) low_word(B*D)
//                           high_word(A*D)  low_word(A*D)
//                           high_word(B*C)  low_word(B*C)
//            high_word(A*C)  low_word(A*C)
//            ==========================================================
//                 P1             P2            P3              P4
//
// Since the return value is [P1 P2] >> Shift_Count, P3 and P4 need not be
// computed, but the carry out of the P3 column must be computed to compute P2.

        LEAF_ENTRY(RtlExtendedMagicDivide)

// With the naming used above: A = r.6 (dividend high), B = r.5 (dividend
// low), C = r.8 (magic divisor high), D = r.7 (magic divisor low).
//
// If the dividend is negative, negate it and record the fact by setting
// cr7 to LT.
        crclr   4*cr.7+0                // clear cr.7 LT bit
        cmpwi   r.6,0                   // is high-order word of dividend < 0?
        bge     divisor_nonnegative     // if not, no negation is needed
                                        //   (N.B. despite the label's name, it
                                        //   is the dividend that was tested)
        crset   4*cr.7+0                // set cr.7 LT bit to mark negation
        subfic  r.5,r.5,0               // negate lower half of dividend
        subfze  r.6,r.6                 // negate upper half of dividend
divisor_nonnegative:
// To avoid pipeline delays, produce partial products first, in the order they
// will be consumed by the addc and addze instructions below.
        mulhwu  r.11,r.6,r.7            // high(A*D)
        mulhwu  r.0,r.5,r.8             // high(B*C)
        mulhwu  r.12,r.6,r.8            // high(A*C)
        mullw   r.4,r.6,r.8             // low(A*C)
        mulhwu  r.10,r.5,r.7            // high(B*D)
        mullw   r.6,r.6,r.7             // low(A*D)
        mullw   r.7,r.5,r.8             // low(B*C)
// Now combine the partial products, forming P1 in r.12, P2 in r.11, P3 in r.10:
// Each addc produces a carry in CA; the addze instructions that follow
// propagate that carry upward through P2 and then P1.
        addc    r.11,r.11,r.0           // high(A*D)+high(B*C)
        addze   r.12,r.12               // high(A*C)+partial carry
        addc    r.11,r.11,r.4           // high(A*D)+high(B*C)+low(A*C)
        addze   r.12,r.12               // high(A*C)+partial carry
        addc    r.10,r.10,r.6           // high(B*D)+low(A*D)
        addze   r.11,r.11               // hi(A*D)+hi(B*C)+low(A*C)+part. carry
        addze   r.12,r.12               // high(A*C)+partial carry
        addc    r.10,r.10,r.7           // high(B*D)+low(A*D)+low(B*C) = P3
        addze   r.11,r.11               // hi(A*D)+hi(B*C)+low(A*C)+carry = P2
        addze   r.12,r.12               // high(A*C)+carry = P1
// Shift the 64-bit value whose high half is in r.12 and whose low half is in
// r.11 right by (r.9) bits.  The sequence below depends on the fact that
// shift amounts are interpreted mod 64.  In particular, a shift amount of
// -N bits, where 0 < N <= 32, specifies a shift of 64-N bits, where
// 32 <= 64-N < 64, and a shift of 32 or more bits always yields zero.
// Let s = (r.9) be the amount to shift right.  The sequence below behaves
// differently when s<32, when s=32, and when s>32.  When s<32, we view the
// 64-bit value to be shifted as follows:
//
//    |             r.12             |             r.11             |
//    +-----------------+------------+-----------------+------------+
//    |        T        |     U      |        V        |     W      |
//    +-----------------+------------+-----------------+------------+
//    |<- (32-s) bits ->|<- s bits ->|<- (32-s) bits ->|<- s bits ->|
//
// When s >= 32, we view the 64-bit value to be shifted as follows:
//
//    |             r.12             |             r.11             |
//    +-----------------+------------+------------------------------+
//    |        X        |     Y      |              Z               |
//    +-----------------+------------+------------------------------+
//    |                 |<- (s-32) ->|<------------ 32 ------------>|
//    |<- (64-s) bits ->|<--------------- s bits ------------------>|
//
                                // When s < 32: | When s = 32: | When s > 32:
                                // -------------+--------------+-------------
        subfic  r.7,r.9,32      // 32-s (> 0)   |   0          | 32-s (< 0)
        addi    r.8,r.9,-32     // s-32 (< 0)   |   0          | s-32 (> 0)
        srw     r.11,r.11,r.9   // 0...0 V      |   0          | 0
        slw     r.4,r.12,r.7    // U 0...0      |   X (= X Y)  | 0
        or      r.11,r.11,r.4   // U V          |   X          | 0
        srw     r.4,r.12,r.8    // 0            |   X          | 0...0 X
        or      r.11,r.11,r.4   // U V          |   X          | 0...0 X
        srw     r.12,r.12,r.9   // 0...0 T      |   0          | 0

// If the original dividend was negated, we now negate the result:
        bnl     cr.7,sign_is_right      // cr.7 LT clear => dividend was >= 0
        subfic  r.11,r.11,0             // negate low word
        subfze  r.12,r.12               // negate high word
sign_is_right:
// Store the result:
        stw     r.11,0(r.3)             // low word of result
        stw     r.12,4(r.3)             // high word of result
        blr                             // return

        LEAF_EXIT(RtlExtendedMagicDivide)


//++
//
// ULARGE_INTEGER
// RtlLargeIntegerDivide (
//    IN ULARGE_INTEGER Dividend,
//    IN ULARGE_INTEGER Divisor,
//    IN PLARGE_INTEGER Remainder.
//    )
//
// Routine Description:
//
//    This function divides an unsigned large integer by an unsigned large
//    integer and returns the resultant quotient and optionally the remainder.
//
// Arguments:
//
//    Dividend (r.5, r.6) - Supplies the dividend value.
//
//    Divisor (r.7, r.8) - Supplies the divisor value.
//
//    Remainder (r.9)- Supplies an optional pointer to a variable
//      that receives the remainder.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlLargeIntegerDivide)

// Bit-at-a-time (shift and subtract) 64-by-64 unsigned division.  The loop
// below runs exactly 64 times, producing one quotient bit per iteration.
        or.     r.0,r.7,r.8             // combine low and high parts of divisor
        beq     div0                    // if 0, then attempted division by zero
        li      r.0,64                  // set loop count
        mtctr   r.0                     // in the count register
        li      r.10,0                  // clear partial remainder
        li      r.11,0                  //

// Invariants for the following loop:
//    1. (q<<(CTR)*Divisor) + [r.11 r.10 r.6 r.5]>>(64-(CTR)) = Dividend
//    2. [r.11 r.10] < Divisor
// where q is the rightmost (64-(CTR)) bits of [r.6 r.5]
// Initially, (CTR)=64 and [r.11 r.10] = 0, so q=0 and
//    [r.11 r.10 r.6 r.5]>>(64-(CTR)) = [0 0 r.6 r.5]>>0 = [r.6 r.5], reducing
//    the loop invariants to:
//       1. [r.6 r.5] = Dividend
//       2. 0 < Divisor
// At the end of the loop, (CTR)=0, so q=[r.6 r.5] and the loop invariants
//    reduce to:
//       1. [r.6 r.5]*Divisor + [r.11 r.10] = Dividend
//       2. [r.11 r.10] < Divisor
//    That is, [r.6 r.5] holds the quotient and [r.11 r.10] holds the remainder.
// During execution of the loop, [r.11 r.10] holds the partial remainder, the
//    leftmost (CTR) bits of [r.6 r.5] hold the bits of the dividend not yet
//    appended to the partial remainder, and the remaining bits of [r.6 r.5]
//    hold the leftmost (64-(CTR)) bits of the quotient.

divl:
// Shift the 128-bit quantity [r.11 r.10 r.6 r.5] left one bit.  This has the
// effect of dropping the leftmost bit of the partial remainder (necessarily
// zero), "bringing down" the next bit of the dividend to the right end of the
// partial remainder, and shifting the partial quotient left one bit.
// (The dropped bit is zero because, after k iterations, the partial remainder
// has absorbed only k dividend bits and so is below 2^k, with k <= 63 here.)
        slwi    r.11,r.11,1             // shift r.11 left one bit
        inslwi  r.11,r.10,1,31          // left bit of r.10 to right bit of r.11
        slwi    r.10,r.10,1             // shift r.10 left one bit
        inslwi  r.10,r.6,1,31           // left bit of r.6 to right bit of r.10
        slwi    r.6,r.6,1               // shift r.6 left one bit
        inslwi  r.6,r.5,1,31            // left bit of r.5 to right bit of r.6
        slwi    r.5,r.5,1               // shift r.5 left one bit
// If [r.11 r.10] >= Divisor, increment the quotient and subtract the divisor
// from the partial remainder:
        cmplw   cr.0,r.11,r.8           // high(partial_rem) vs. high(divisor)
        cmplw   cr.1,r.10,r.7           // low(partial_rem) vs. low(divisor)
        bgt     cr.0,PR_greater         // high(part_rem) > high(divisor)
        blt     cr.0,endl               // high(part_rem) < high(divisor)
        blt     cr.1,endl               // highs =, low(part_rem) < low(divisor)
                                        // otherwise highs =, lows >=: fall through

PR_greater:
        ori     r.5,r.5,1               // increment shifted quotient (the low
                                        //   bit is 0 after the shift above)
        subfc   r.10,r.7,r.10           // low(part_rem-divisor)
        subfe   r.11,r.8,r.11           // high(part_rem-divisor)

endl:   bdnz    divl                    // decrement CTR and loop

        cmplwi  r.9,0                   // remainder requested?
        beq     norem                   // no remainder
        stw     r.10,0(r.9)             // store low part of remainder
        stw     r.11,4(r.9)             // store high part of remainder
norem:  stw     r.5,0(r.3)              // store low part of quotient
        stw     r.6,4(r.3)              // store high part of quotient
        blr

// Reached only when r.0 = (r.7 | r.8) is zero, so the "equal to 0" condition
// selected by the trap operand is satisfied.
div0:
        twi     6,r.0,0                 // Trap on divide by zero

        LEAF_EXIT(RtlLargeIntegerDivide) //


//++
//
// LARGE_INTEGER
// RtlExtendedIntegerMultiply (
//    IN LARGE_INTEGER Multiplicand,
//    IN LONG Multiplier
//    )
//
// Routine Description:
//
//    This function multiplies a signed large integer by a signed integer and
//    returns the signed large integer result.
//
// Arguments:
//
//    Multiplicand (r.5, r.6) - Supplies the multiplicand value.
//
//    Multiplier (r.7) - Supplies the multiplier value.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlExtendedIntegerMultiply)

// Multiplies the 64-bit signed Multiplicand (r.6 = high word, r.5 = low
// word) by the 32-bit signed Multiplier (r.7) and stores the 64-bit product
// at 0(r.3) (low word) and 4(r.3) (high word).
//
// If A B is the multiplicand and C is the multiplier (where A, B, and C are
// each 32 bits long), the product is computed as follows:
//
//                           A         B
//      *                              C
//         =============================
//                   high(B*C)  low(B*C)
//      +  high(A*C)  low(A*C)
//         =============================
//                P1        P2        P3
//
// Since the high-order bit of B is not a sign bit but the high-order bit of
// C is, we have no multiplication instruction appropriate for computing
// high(B*C) directly.  Instead, we negate any negative operand before
// doing the multiplication, multiply using unsigned arithmetic, and then
// negate the product if we had negated exactly one operand.  If P1 is
// nonzero before negating the product, the multiplication has overflowed.
// We use the LT bit of cr.7 to track whether exactly one operand has
// been negated.
//
// Register usage after the operands have been made nonnegative:
//
//    r.5, r.6 - B, A (magnitude of the multiplicand, low and high words)
//    r.7      - C (magnitude of the multiplier)
//    r.8      - P3, low word of the product
//    r.9      - P2, high word of the product
//    r.10     - low(A*C)
//    r.11     - P1, the part of the unsigned product above 64 bits
//    cr.0     - scratch for the sign tests and the overflow test
//
// NOTE(review): only P1 is tested for overflow.  A magnitude that fits in
// 64 bits but has the high-order bit of P2 set does not appear to be
// trapped; it is stored (after the optional negation) as is.  Confirm that
// callers do not rely on a trap in that case.

        crclr   4*cr.7+0                // cr.7 LT = 0: nothing negated yet
        cmpwi   r.6,0                   // test sign of multiplicand (sets cr.0)
        bge     multiplicand_adjusted   // if nonnegative, proceed
        subfic  r.5,r.5,0               // low = 0 - low; carry records no borrow
        subfze  r.6,r.6                 // high = ~high + carry (64-bit negate)
        crset   4*cr.7+0                // cr.7 LT = 1: one operand negated
multiplicand_adjusted:
        cmpwi   r.7,0                   // test sign of multiplier (sets cr.0)
        bge     multiplier_adjusted     // if nonnegative, proceed
        neg     r.7,r.7                 // negate multiplier
        crnot   4*cr.7+0,4*cr.7+0       // toggle LT: set iff exactly one negated
multiplier_adjusted:
        mulhwu  r.9,r.5,r.7             // high(B*C)
        mullw   r.10,r.6,r.7            // low(A*C)
        mulhwu  r.11,r.6,r.7            // high(A*C)
        mullw   r.8,r.5,r.7             // P3 = low(B*C)
        addc    r.9,r.9,r.10            // P2 = high(B*C)+low(A*C)
        addze   r.11,r.11               // P1 = high(A*C)+[carry out of P2]
        cmpwi   r.11,0                  // check for overflow (P1 must be zero)
        bne     mull_over               // P1 nonzero: product exceeds 64 bits
        bnl     cr.7,product_adjusted   // was exactly one operand negated?
        subfic  r.8,r.8,0               // negate low part of product
        subfze  r.9,r.9                 // negate high part of product
product_adjusted:
        stw     r.8,0(r.3)              // store low word of product
        stw     r.9,4(r.3)              // store high word of product
        blr                             // normal return
mull_over:
// TO = 0x1b enables the signed and unsigned less-than and greater-than
// conditions, so the trap is taken whenever r.11 != 0, which always holds
// on this path.
// NOTE(review): nothing is stored at r.3 on this path; if the trap handler
// resumes here, control presumably falls into LEAF_EXIT - confirm.
        twi     0x1b,r.11,0             // Trap on overflow

        LEAF_EXIT(RtlExtendedIntegerMultiply)


//++
//
// LARGE_INTEGER
// RtlLargeIntegerNegate (
//    IN LARGE_INTEGER Subtrahend
//    )
//
// Routine Description:
//
//    This function negates a signed large integer and returns the signed
//    large integer result.
//
// Arguments:
//
//    Subtrahend (r.5, r.6) - Supplies the subtrahend value.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlLargeIntegerNegate)

// Form 0 - (r.6:r.5) as a two-word subtraction.  Each word of the result is
// written to the buffer at r.3 as soon as it has been computed; the store
// between the two subtract steps does not disturb the carry bit that links
// them.

        subfic  r.5,r.5,0               // low word = 0 - low; carry = no borrow
        stw     r.5,0(r.3)              // result low word  -> 0(r.3)
        subfze  r.6,r.6                 // high word = ~high + carry
        stw     r.6,4(r.3)              // result high word -> 4(r.3)
        LEAF_EXIT(RtlLargeIntegerNegate) // return


//++
//
// LARGE_INTEGER
// RtlLargeIntegerSubtract (
//    IN LARGE_INTEGER Minuend,
//    IN LARGE_INTEGER Subtrahend
//    )
//
// Routine Description:
//
//    This function subtracts a signed large integer from a signed large
//    integer and returns the signed large integer result.
//
// Arguments:
//
//    Minuend (r.5, r.6) - Supplies the minuend value.
//
//    Subtrahend (r.7, r.8) - Supplies the subtrahend value.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlLargeIntegerSubtract)

// Compute Minuend (r.6:r.5) - Subtrahend (r.8:r.7) one word at a time,
// low word first, propagating the borrow through the carry bit.  Each word
// is stored at r.3 as soon as it is available; the intervening store leaves
// the carry bit untouched.

        subfc   r.5,r.7,r.5             // low word = r.5 - r.7; carry = no borrow
        stw     r.5,0(r.3)              // result low word  -> 0(r.3)
        subfe   r.6,r.8,r.6             // high word = r.6 - r.8 - borrow
        stw     r.6,4(r.3)              // result high word -> 4(r.3)
        LEAF_EXIT(RtlLargeIntegerSubtract) // return


//++
//
// LARGE_INTEGER
// RtlLargeIntegerShiftLeft (
//    IN LARGE_INTEGER LargeInteger,
//    IN CCHAR ShiftCount
//    )
//
// Routine Description:
//
//    This function shifts a signed large integer left by an unsigned
//    integer modulo 64 and returns the shifted signed large integer
//    result.
//
//    N.B. No test is made for significant bits shifted out of the result.
//
// Arguments:
//
//    LargeInteger (r.5, r.6) - Supplies the large integer to be shifted.
//
//    ShiftCount (r.7) - Supplies the left shift count.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlLargeIntegerShiftLeft)

// Shift the 64-bit value (r.6 = high word, r.5 = low word) left by
// s = ShiftCount mod 64 and store the result at r.3.
//
// slw and srw take a six-bit shift count, and any count in the range 32..63
// produces zero.  A count of -N, where 0 < N <= 32, is therefore treated as
// 64-N and also produces zero.  This lets a single branch-free sequence
// cover every value of s:
//
//    s <  32:  high = (r.6 << s) | (r.5 >> (32-s)),  low = r.5 << s
//    s >= 32:  high =  r.5 << (s-32),                low = 0

        andi.   r.7,r.7,0x3f    // s = shift count mod 64

                                // When s < 32:  | When s = 32: | When s > 32:
                                // --------------+--------------+--------------
        subfic  r.8,r.7,32      // 32-s (> 0)    |   0          | 32-s (< 0)
        srw     r.0,r.5,r.8     // r.5 >> (32-s) |   r.5        | 0
        slw     r.6,r.6,r.7     // r.6 << s      |   0          | 0
        or      r.6,r.6,r.0     // merged high   |   r.5        | 0
        addic   r.8,r.7,-32     // s-32 (< 0)    |   0          | s-32 (> 0)
        slw     r.0,r.5,r.8     // 0             |   r.5        | r.5 << (s-32)
        slw     r.5,r.5,r.7     // r.5 << s      |   0          | 0
        or      r.6,r.6,r.0     // merged high   |   r.5        | r.5 << (s-32)

        stw     r.5,0(r.3)      // result low word  -> 0(r.3)
        stw     r.6,4(r.3)      // result high word -> 4(r.3)
        LEAF_EXIT(RtlLargeIntegerShiftLeft)


//++
//
// LARGE_INTEGER
// RtlLargeIntegerShiftRight (
//    IN LARGE_INTEGER LargeInteger,
//    IN CCHAR ShiftCount
//    )
//
// Routine Description:
//
//    This function shifts an unsigned large integer right by an unsigned
//    integer modulo 64 and returns the shifted unsigned large integer
//    result.
//
// Arguments:
//
//    LargeInteger (r.5, r.6) - Supplies the large integer to be shifted.
//
//    ShiftCount (r.7) - Supplies the right shift count.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlLargeIntegerShiftRight)

// Logical (zero-filling) right shift of the 64-bit value held in r.6 (high
// word) and r.5 (low word) by s = ShiftCount mod 64.  The result is stored
// at 0(r.3) (low word) and 4(r.3) (high word).  r.0, r.8 and r.9 are used
// as scratch registers.

        andi.   r.7,r.7,0x3f            // s = shift count mod 64

// The sequence below depends on the fact that shift amounts are interpreted
// mod 64.  In particular, a shift amount of -N bits, where 0 < N <= 32,
// specifies a shift of 64-N bits, where 32 <= 64-N < 64, and a shift of 32 or
// more bits always yields zero.  Let s = (r.7) be the amount to shift right.
// The sequence below behaves differently when s<32, when s=32, and when s>32.
// When s<32, we view the 64-bit value to be shifted as follows (for s=0 the
// fields U and W are empty):
//
//
//    |             r.6              |             r.5              |
//    +-----------------+------------+-----------------+------------+
//    |        T        |     U      |        V        |     W      |
//    +-----------------+------------+-----------------+------------+
//    |<- (32-s) bits ->|<- s bits ->|<- (32-s) bits ->|<- s bits ->|
//
// When s >= 32, we view the 64-bit value to be shifted as follows:
//
//    |             r.6              |             r.5              |
//    +-----------------+------------+------------------------------+
//    |        X        |     Y      |              Z               |
//    +-----------------+------------+------------------------------+
//    |                 |<- (s-32) ->|<------------ 32 ------------>|
//    |<- (64-s) bits ->|<--------------- s bits ------------------>|
//
// Each row of the table shows the value left in the destination register of
// that instruction for each of the three cases.
//
                                // When s < 32: | When s = 32: | When s > 32:
                                // -------------+--------------+-------------
        subfic  r.8,r.7,32      // 32-s (> 0)   |   0          | 32-s (< 0)
        addi    r.9,r.7,-32     // s-32 (< 0)   |   0          | s-32 (> 0)
        srw     r.5,r.5,r.7     // 0...0 V      |   0          | 0
        slw     r.0,r.6,r.8     // U 0...0      |   X (= X Y)  | 0
        or      r.5,r.5,r.0     // U V          |   X          | 0
        srw     r.0,r.6,r.9     // 0            |   X          | 0...0 X
        or      r.5,r.5,r.0     // U V          |   X          | 0...0 X
        srw     r.6,r.6,r.7     // 0...0 T      |   0          | 0

        stw     r.5,0(r.3)      // store low result
        stw     r.6,4(r.3)      // store high result
        LEAF_EXIT(RtlLargeIntegerShiftRight)


//++
//
// LARGE_INTEGER
// RtlLargeIntegerArithmeticShift (
//    IN LARGE_INTEGER LargeInteger,
//    IN CCHAR ShiftCount
//    )
//
// Routine Description:
//
//    This function shifts a signed large integer right by an unsigned
//    integer modulo 64 and returns the shifted signed large integer
//    result.
//
// Arguments:
//
//    LargeInteger (r.5, r.6) - Supplies the large integer to be shifted.
//
//    ShiftCount (r.7) - Supplies the right shift count.
//
// Return Value:
//
//    The large integer result is stored at the address supplied by r.3.
//
//--

        LEAF_ENTRY(RtlLargeIntegerArithmeticShift)

// Arithmetic (sign-propagating) right shift of the 64-bit value held in
// r.6 (high word) and r.5 (low word) by s = ShiftCount mod 64.  The result
// is stored at 0(r.3) (low word) and 4(r.3) (high word).  r.0, r.8, r.9 and
// r.10 are used as scratch registers.

        andi.   r.7,r.7,0x3f            // s = shift count mod 64

// The sequence below depends on the fact that shift amounts are interpreted
// mod 64.  In particular, a shift amount of -N bits, where 0 < N <= 32,
// specifies a shift of 64-N bits, where 32 <= 64-N < 64, and an arithmetic
// shift of 32 or more bits always yields 32 copies of the original sign bit.
// Let s = (r.7) be the amount to shift right.  The sequence below behaves
// differently when s<=32 and when s>32.  When s<=32, we view the 64-bit value
// to be shifted as follows (for s=0 the fields U and W are empty; for s=32
// the fields T and V are empty):
//
//
//    |             r.6              |             r.5              |
//    +-----------------+------------+-----------------+------------+
//    |        T        |     U      |        V        |     W      |
//    +-----------------+------------+-----------------+------------+
//    |<- (32-s) bits ->|<- s bits ->|<- (32-s) bits ->|<- s bits ->|
//
// When s > 32, we view the 64-bit value to be shifted as follows:
//
//    |             r.6              |             r.5              |
//    +-----------------+------------+------------------------------+
//    |        X        |     Y      |              Z               |
//    +-----------------+------------+------------------------------+
//    |                 |<- (s-32) ->|<------------ 32 ------------>|
//    |<- (64-s) bits ->|<--------------- s bits ------------------>|
//
// The low word is first built with logical shifts, which is correct when
// s<=32.  When s>32 the low word must instead be the sign-extended upper
// part of r.6, which is computed into r.10 and copied over r.5.
//
// N.B. Despite its name, the label more_than_32 is the point where both
//      cases rejoin: the ble is taken when s<=32 and skips only the mr.

                              // When s <= 32:         | When s > 32:
                              // ----------------------+----------------------
        subfic  r.8,r.7,32    // 32-s (>= 0)           | 32-s (< 0)
        srw     r.5,r.5,r.7   // 0...0 V               | 0
        slw     r.0,r.6,r.8   // U 0...0               | 0
        or      r.5,r.5,r.0   // U V                   | 0
        addic.  r.9,r.7,-32   // s-32 (<= 0)           | s-32 (> 0)
        sraw    r.10,r.6,r.9  // sign(TU)...sign(TU)   | sign(XY)...sign(XY) X
        ble     more_than_32  // s<=32: keep U V (CR0 was set by addic. above)
        mr      r.5,r.10      // [instruction skipped] | sign(XY)...sign(XY) X
more_than_32:                 // both cases continue here
        sraw    r.6,r.6,r.7   // sign(T)...sign(T) T   | sign(XY)...sign(XY)

        stw     r.5,0(r.3)    // store low result
        stw     r.6,4(r.3)    // store high result
        LEAF_EXIT(RtlLargeIntegerArithmeticShift)