# Copyright 1992, Digital Equipment Corporation
#
# This software is furnished under a license and may be used and copied
# only in accordance with the terms of such license and with the
# inclusion of the above copyright notice. This software or any other
# copies thereof may not be provided or otherwise made available to any
# other person. No title to and ownership of the software is hereby
# transferred.
#
# The information in this software is subject to change without notice
# and should not be construed as a commitment by Digital Equipment
# Corporation.
#
# Digital assumes no responsibility for the use or reliability of its
# software on equipment which is not supplied by Digital.
#
# 008 17 Jun 1992 KDG/wbn Most of initial tailored version. (See
# commentary below.)
#
# 009 4 Jul 1992 KDG Continue work on initial tailored version,
# including bugfixes and mod entry points
#
# 010 15 Jul 1992 KDG - Final touches for V1 (other than any bugfixes)
# - .aligns commented out to allow older assembler versions
#
# 011 16 Jul 1992 KDG - Bugfix for ots_div_l for -maxint dividend
# - OSF-only source changes for BL7
#
# 012 10 Aug 1992 KDG Fix overflow division entry points
#
# 013 23 Sep 1992 KDG Add case-sensitive entry names
#
# 014 4 Jan 1993 KDG Tweak for OSF assembler
#
# 015 26 Jan 1993 KDG Add underscore prefix, OSF uses CS names
#
# 016 5 Apr 1993 WBN Speed up core 64-bit, shrink table entry to 2 QWs
#++
# Entry points defined in this module:
#
# -- 32 bit division/remainder support
# unsigned ots_rem_ui(unsigned dividend, unsigned divisor)
# unsigned ots_div_ui(unsigned dividend, unsigned divisor)
# int ots_mod_i(int dividend, int modulus)
# int ots_rem_i(int dividend, int divisor)
# int ots_div_i_o(int dividend, int divisor)
# int ots_div_i(int dividend, int divisor) | "hot spot"
# {core routine - div32} |
#
# -- 64 bit division support
# {core routine - div64} | (uses div32 for 32b cases)
# long ots_div_l_o(long dividend, long divisor)
# long ots_div_l(long dividend, long divisor) | "hot spot"
# long ots_rem_l(long dividend, long divisor)
# long ots_mod_l(long dividend, long modulus)
# unsigned long ots_div_ul(unsigned long dividend, unsigned long divisor)
# unsigned long ots_rem_ul(unsigned long dividend, unsigned long divisor)
#
# Special conventions: No stack space, r0-r1, r16-r19 and r26-r28 ONLY.
# (Warning: The auto-loader potentially takes some regs across
# the call if this is being used in a shared lib. environment.)
#
# NOTE: This set of routines may start using stack space at some
# future point in time.
#
# -- Possible future entry points include:
# (These all return results in r0/r1)
# {int quotient, int remainder} ots_div_mod_i(int dividend, int divisor)
# {int quotient, int remainder} ots_div_mod_i_o(int dividend, int divisor)
# {int quotient, int remainder} ots_div_rem_i(int dividend, int divisor)
# {int quotient, int remainder} ots_div_rem_i_o(int dividend, int divisor)
# {unsigned quotient, unsigned remainder} ots_div_rem_ui(unsigned dividend, unsigned divisor)
#
# {long quotient, long remainder} ots_div_mod_i(long dividend, long divisor)
# {long quotient, long remainder} ots_div_mod_i_o(long dividend, long divisor)
# {long quotient, long remainder} ots_div_rem_i(long dividend, long divisor)
# {long quotient, long remainder} ots_div_rem_i_o(long dividend, long divisor)
# {unsigned long quotient, unsigned long remainder}
# ots_div_rem_ui(unsigned long dividend, unsigned long divisor)
#
#
# General commentary:
#
# This is an attempt at a fairly high performance version using relatively
# straightforward algorithms. Note that the code is intended to be scheduled
# well for EV4, but still reasonably for LCA/EV5.
#
# Also, note that there was only so much time available for this, so it
# is far from "perfect". "Better is the enemy of done"...
#
# Possible future areas of improvement (and unfinished business):
#
# - Another possible way of doing things for the "slow" (divnn cases)
# is to use an approximate inverse and convergence. Given the speed
# of the multiplier on EV4, and given "time to market", this wasn't
# done for V1.) I have some mail with the algorithm from Bob Gries
# (through Scott Robinson).
#
# - When the divisor is too large for the table, but has n low-order zero
# bits, see if divisor/2^n fits in the table, and use that entry with
# dividend/2^n
#
# - Use UMULH for the 'mod' routines.
#
# This version can do a table lookup division (divisors with <=tablesize)
# in roughly 32 cycles on an EV4 (with cache hits for all loads), of which
# 21 are for the umulh. There is a strong bias toward the table-lookup case.
# Note that for many cases, the umulh is the last thing before the return,
# so the multiply can occur in parallel with the procedure return.
# (It is interesting that the R3000 hardware divide instruction takes 33
# cycles and the R4000 takes 76(!) ...) Small powers of 2 are retired in
# roughly 20 cycles. Larger divisors take considerably longer at this point.
#
#include "ots_defs.hs"
#ifdef OSF
# to get the PAL_gentrap literal
#include <alpha/pal.h>
#endif
# Data area description
#
# The data area "ots_div_data" is an array of structures, indexed
# by the divisor value, with each array entry being 16 bytes in size
# formatted as follows:
#
# 6
# 3 6 0
# +-------+-------+-------+-------+-------+-------+-------+-------+
# | 32 bit reciprocal (58 bits) |shift|
# +-------+-------+-------+-------+-------+-------+-------+-------+
# | 64 bit reciprocal |
# +---------------------------------------------------------------+
#
# The 64-bit reciprocal has the leading '1' bit omitted, so it provides
# 65 bits of precision -- enough to handle unsigned 64-bit values.
#
# The first longword contains the 6-bit shift amount needed to handle
# 64-bit cases and powers of two.
#
# The 32-bit reciprocal has the shift count built in, so a UMULH gives
# the correct quotient without shifting. The reciprocal needs 33 bits
# of precision. The 6-bit shift amount is noise in the reciprocal that
# can be ignored.
#
# (Oh, you want proof?) For divisors up to 2^k, we store k-1 leading
# zero bits, 33 bits of fraction, (25-k) more bits of fraction, and
# 6 bits of noise. The standard method would round at the 33rd fraction
# bit. We need to ensure that the value actually stored is geq the
# infinite reciprocal, but leq the standard value. For divisors up to
# 2^k, there will be a zero bit somewhere in the k bits below the 33rd,
# so as long as we round below the (33+k)th bit, the rounded value
# plus any noise is still less than the standard value. This requires
# k < 12.
#
# The actual data is declared in ots_div_data_alpha.
#
# Offsets to the various fields in the data structure
#
#define shift_o 0
#define recip64_o 8
#define recip32_o 0
#
# Note that the shift/add ops used to compute the table entries
# "know" that the table size is 16. (i.e. addq -> s8addq -> ldq)
# By changing the first instruction, it's fairly easy to change the
# table entry size to 24, 32, or 40 bytes (using s4add/sub), or
# 56/64/72 bytes using s8add/sub, should that be desirable.
# Maximum divisor present in the table
#
#define table_max 512
# Division by zero gentrap code
#
#define GEN_INTDIV -2
# Address of division data area (shared by all entry points)
#
#ifdef VMS
.psect ots_link
ots_div_addr:
.address ots_div_data
.psect ots_code
#endif
# Dummy entry point for the module
#
.globl _OtsDivide
.ent _OtsDivide
_OtsDivide:
.set noat
.set noreorder
#ifdef OSF
.frame sp, 0, r26
#endif
#ifdef WNT
.frame sp, 0, r26
#endif
# unsigned ots_rem_ui(unsigned dividend, unsigned divisor)
# unsigned 32 bit remainder support
#
#.align 4
.globl _OtsRemainder32Unsigned
.aent _OtsRemainder32Unsigned
_OtsRemainder32Unsigned:
#ifdef VMS
ldq r27, <ots_div_addr-ots_rem_ui>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
lda r28, -table_max(r17) # test for table lookup
subl r17, 1, r1 # first part of power-of-2 check
blt r17, rui_big # big divisors can (must) be handled by a simple comparison
and r17, r1, r18 # second part of power-of-2 check
bgt r28, rui_lrgdiv # branch if large divisor
addq r17, r17, r0 # compute divisor*2 for table lookup
beq r18, rui_pwr2 # if zero, divisor is a power of 2
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
ldq r1, recip32_o(r27) # load approximate reciprocal
cmpult r16, r17, r18 # is the dividend < divisor?
zap r16, 0xF0, r0 # kill the propagated sign bit
bne r18, rui_lss # if dividend < divisor, fast exit
umulh r0, r1, r0 # multiplication for division step
mull r0, r17, r0 # multiply back to get value to subtract
subl r16, r0, r0
ret r31, (r26) # and return
rui_pwr2:
beq r17, divzer # check for 0
and r16, r1, r0 # use x-1 to mask
ret r31, (r26)
rui_lss:
mov r16, r0
ret r31, (r26)
rui_lrgdiv:
zap r16, 0xf0, r16 # zero-extend the dividend
bsr r28, div32 # use the core routine getting the remainder in r1
sextl r1, r0
ret r31, (r26)
# divisors with the sign bit set. two possible results,
# dividend if dividend < divisor, or dividend-divisor otherwise
rui_big:
cmpult r16, r17, r1
subl r16, r17, r0
cmovne r1, r16, r0
ret r31, (r26)
# unsigned ots_div_ui(unsigned dividend, unsigned divisor)
# unsigned 32 bit division support
#
#.align 4
.globl _OtsDivide32Unsigned
.aent _OtsDivide32Unsigned
_OtsDivide32Unsigned:
#ifdef VMS
ldq r27, <ots_div_addr-ots_div_ui>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
lda r28, -table_max(r17) # test for table lookup
blt r17, dui_big # big divisors can (must) be handled by a simple comparison
addq r17, r17, r18 # compute divisor*2
cmpule r17, r16, r0 # is the dividend < divisor?
beq r17, divzer # check for 0
s8addq r18, r27, r27 # finish computing table entry addr (table addr+divisor*16)
beq r0, dui_end # fast out for divisor > dividend
bgt r28, dui_lrgdiv # branch if large divisor
ldq r1, recip32_o(r27) # load approximate reciprocal
zap r16, 0xF0, r16 # kill the propagated sign bit
blt r1, dui_smpwr2 # go handle powers of 2 specially
umulh r16, r1, r0 # start multiplication for division step
dui_end:ret r31, (r26) # and return
nop
dui_smpwr2:
srl r16, r1, r0 # shift the result into place
sextl r0, r0 # reinsert sign if dividing by 1
ret r31, (r26) #
dui_lrgdiv:
zap r16, 0xf0, r16 # zero-extend the dividend
bsr r28, div32 # use the core routine getting the remainder in r1
sextl r0, r0 # make sure the result is in normal form for uint32
ret r31, (r26)
# divisor with the sign bit set. two possible results,
# 1 if divisor <= dividend, or 0 otherwise
dui_big:
cmpule r17, r16, r0
ret r31, (r26)
# int ots_mod_i(int dividend, int modulus)
# signed 32 bit modulus support
#
# This entry could be MUCH more optimized. It doesn't even try to use
# UMULH division currently... (A casualty of time-to-market.)
# Note that mod is only used by Ada and PL/I.
#
#.align 4
.globl _OtsModulus32
.aent _OtsModulus32
_OtsModulus32:
negq r17, r18 # first part of abs(divisor)
cmovge r17, r17, r18 # second part of abs(divisor)
subq r18, 1, r1 # start checking for power of 2
beq r17, divzer # check for 0
and r18, r1, r0 # second part of power-of-2 check
beq r0, mi_p2 # for powers of two, simply do a mask
# (note that the power-of-2 case MUST be used to handle
# the -maxint case due to the way the fix-up info is
# saved across the core routine call)
xor r16, r17, r28 # get xor of signs
clr r19 # don't need a bias if dividend and divisor have same sign
cmovlt r28, r17, r19 # bias is original divisor for different sign case
and r16, r17, r27 # if both dividend & divisor were neg. need to negate result
mov r18, r17 # move abs(divisor) into r17
negq r16, r18 # first part of abs(dividend)
cmovlt r16, r18, r16 # second part of abs(dividend)
cmplt r27, r31, r0 # get 1 if both operands were <0
sll r0, 63, r0 # get bit as the high bit
bis r0, r19, r19 # and MERGE with bias (0 -> no fixup, -maxint -> negate result,
# divisor > 0 - subtract remainder if non-zero, divisor < 0 -
# add remainder if non-zero)
bsr r28, div32 # use the core routine getting the remainder in r1
cmoveq r1, r31, r19 # don't do any fix-up if the remainder was zero
addq r19, r19, r18 # check to see if this is the negative/negative case, which just gets a negated remainder
subq r19, 1, r28 # wrap -maxint to positive
negl r1, r0 # move negated value, may abort later
cmovlt r28, r1, r0 # if both positive, or negative divisor, keep positive remainder
cmoveq r18, r31, r19 # now that negation is done, treat -maxint case as 0
addl r19, r0, r0 # add any bias (original divisor or 0)
ret r31, (r26) # and return
mi_p2: cmovge r17, r31, r17 # no bias if divisor was >= 0
and r16, r1, r1 # use the divisor-1 mask that's already in r1
cmoveq r1, r31, r17 # use zero if result was zero
addl r17, r1, r0 # do any biasing, and ensure the result is sign ext
ret r31, (r26) # and return
# int ots_rem_i(int dividend, int divisor)
# signed 32 bit remainder support
#
#.align 4
.globl _OtsRemainder32
.aent _OtsRemainder32
_OtsRemainder32:
#ifdef VMS
ldq r27, <ots_div_addr-ots_rem_i>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
negq r17, r18 # first part of abs(divisor)
cmovlt r17, r18, r17 # second part of abs(divisor)
subq r17, 1, r1 # start checking for power of 2
and r17, r1, r0 # finish check for power of 2
sra r16, 63, r19 # get -1/0 if dividend was negative
negq r16, r18 # first part of abs(dividend)
cmovlt r16, r18, r16 # second part of abs(dividend)
beq r0, ri_pwr2 # for powers of two, simply do a mask (not power of 2 include 0 and 80000000)
lda r28, -table_max(r17) # test for table lookup
bgt r28, ri_lrgdiv # branch if large divisor
addq r17, r17, r0 # compute divisor*2 for table lookup
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
ldq r1, recip32_o(r27) # load approximate reciprocal
umulh r16, r1, r0 # multiplication for division step
mull r0, r17, r0 # multiply back to get value to subtract
subl r16, r0, r0 # get abs of final result
xor r0, r19, r0 # start compliment if original dividend was <0
subl r0, r19, r0 # finish compliement
ret r31, (r26) # and return
# Handle powers of 2, including 0 and 80000000
ri_pwr2:
and r16, r1, r0 # use the divisor-1 mask in r1
beq r17, divzer # division by zero
xor r0, r19, r0 # start compliment if original dividend was <0
subl r0, r19, r0 # finish compliement
ret r31, (r26)
nop
ri_lrgdiv:
bsr r28, div32 # use the core routine getting the remainder in r1
xor r1, r19, r0 # start compliment if original dividend was <0
subl r0, r19, r0 # finish compliement
ret r31, (r26)
# int ots_div_i_o(int dividend, int divisor)
# signed 32 bit division support, overflow detection
#
#.align 4
.globl _OtsDivide32Overflow
.aent _OtsDivide32Overflow
_OtsDivide32Overflow:
#ifdef VMS
ldq r27, <ots_div_addr-ots_div_i_o>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
not r17, r1 # is the divisor -1?
bne r1, di_skip # continue if not
neglv r16, r0 # quotient = -dividend, overflow on ^x800000000
ret r31, (r26)
# int ots_div_i(int dividend, int divisor)
# signed 32 bit division support, no overflow detection
#
nop #.align 4
.globl _OtsDivide32
.aent _OtsDivide32
_OtsDivide32:
#ifdef VMS
ldq r27, <ots_div_addr-ots_div_i>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
di_skip:
di_retry:
lda r28, -table_max(r17) # test for table lookup
ble r17, di_notpos # not a positive divisor case
addq r17, r17, r0 # compute divisor*2
negq r16, r18 # part 1 of abs(dividend) -> r18. (Note 0xffffffff 80000000 => 0x00000000 80000000)
bgt r28, di_lrgdiv # branch if large divisor
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
cmpule r17, r18, r0 # divisor <= dividend?
cmovge r16, r16, r18 # part 2 abs. val of the dividend -> r18
beq r0, di_end # if not, result is zero
ldq r1, recip32_o(r27) # load approximate reciprocal
blt r1, di_smpwr2 # go handle powers of 2 specially
umulh r18, r1, r0 # start multiplication
blt r16, di_negres # negate result? (done as a branch to allow umulh to "hang out" over end for common case)
di_end: ret r31, (r26) # return for same-sign (common) case
di_negres:
negl r0, r0 # different signs - compliment result
ret r31, (r26) # return for different-sign (uncommon) case
di_smpwr2:
srl r18, r1, r18 # shift the result into place
sra r16, 63, r16 # get 0/-1 based on sign of dividend
xor r18, r16, r18 # conditionally compliment
subl r18, r16, r0 # and increment for the final value
ret r31, (r26) # (note subl is required for sign ext for %x80000000/1 case)
# Zero or negative divisor case. If just a negative divisor,
# compliment both dividend and divisor and do things again.
di_notpos:
beq r17, divzer # division by zero
negl r17, r17 # |divisor|, note that 0x80000000 still appears negative
negq r16, r16 # compliment dividend (negq so that 0xffffffff 80000000 => 0x00000000 80000000
bgt r17, di_retry # dispatch back for normal case (not 0x80000000 or 0)
sextl r16, r16 #
cmpeq r16, r17, r0 # -maxint/-maxint = 1, all others 0
ret r31, (r26) # done
# Large divisor for signed 32/32 case
#
nop #.align 3
di_lrgdiv:
sra r16, 63, r19 # get 0/-1 based on sign of dividend
cmovlt r16, r18, r16 #
bsr r28, div32 # go use core routine
xor r0, r19, r0 # conditionally compliment
subl r0, r19, r0 # and increment for the final value (subl ensures normalized result)
ret r31, (r26) # done
# Large divisor case core routine for 32b
# (wbn)
#
# r0 - quotient (output)
# r1 - remainder (output)
# r16 - dividend (range 0..2^32-1, zero extended)
# r17 - divisor (range 1..2^31-1 - overwritten)
# r18 - scratch
# r19 - not used (one temp for 'caller')
# r26 - not used (expected to contain main return address)
# [r27 - scratch] (not currently written)
# r28 - this "subroutine" return address
#
# Some tightened up bit-at-a-time code for dividing 32-bit integers.
# It uses two tricks: keep the running remainder and the quotient in
# the same 64-bit register (MQ?), and add 1 while subtracting the divisor,
# so that a single CMOV sets both the new remainder and the new quotient.
# I start off by trying to skip 8 bits at a time; should this skip a
# smaller amount, so the main loop iterates less often? If the divisor
# is already known to be large enough, the last case in this test is never
# used...
#
# This code expects as input two integers in the range 0 <= x < 2^31
# (that is, it doesn't work for general unsigned longwords, and doesn't
# include sign manipulation.)
#
# The code here takes about 34n+11 cycles for a quotient occupying n bytes.
#
# Inputs: dividend in r16, divisor in r17
# Outputs: quotient in r0, remainder in r1
# Destroys: [r16,]r17,r18,[r27]
#
# How many quotient bytes will there be: 0, 1, 2, 3, 4?
#
#.align 4
div32: cmpule r17, r16, r0 # Divisor leq dividend?
sll r17, 32, r18 # Position divisor for loop
sll r17, 8, r1 # Prepare for next compare
beq r0, d32end # Dividend less, quotient is zero.
ediv32: mov 8-3, r17 # Hope to skip 3 bytes of loop
cmpule r1, r16, r0 # Shifted divisor still leq dividend?
sll r1, 8, r1 # Prepare for next compare
beq r0, d32ent # Go loop over just one byte
mov 8-2, r17 # Hope to skip 2 bytes of loop
cmpule r1, r16, r0 # Shifted divisor still leq dividend?
sll r1, 8, r1 # Prepare for next compare
beq r0, d32ent # Go loop over just two bytes
mov 8-1, r17 # Hope to skip 1 byte of loop
cmpule r1, r16, r0 # Shifted divisor still leq dividend?
nop # stall - align d32ent and d32loop
cmovne r0, 8, r17 # If we can't skip any bytes
# start loop generating quotient bits. NOTE: The loop setup requires
# an even number of iterations.
#
d32ent: extqh r16, r17, r0 # Shift dividend left for skipped bytes
subq r18, 1, r1 # Divisor in high LW - 1 in low LW
s8subq r17, 34, r17 # Convert bytes to bits and adjust
addq r0, r0, r0 # Shift left to start first iteration
d32loop:subq r0, r1, r18 # Can we subtract divisor from it?
cmovge r18, r18, r0 # If so, set new remainder & quotient
# stall
addq r0, r0, r0 # Shift remainder and quotient left
subq r0, r1, r18 # Can we subtract divisor from it?
cmovge r18, r18, r0 # If so, set new remainder & quotient
subq r17, 2, r17 # Loop counter
addq r0, r0, r0 # Shift remainder and quotient left
bgt r17, d32loop # Repeat
subq r0, r1, r18 # Can we subtract divisor from it?
cmovge r18, r18, r0 # If so, set new remainder & quotient
# stall
addq r0, r0, r0 # Shift remainder and quotient left
subq r0, r1, r18 # Finish last iteration
cmovge r18, r18, r0
# stall
srl r0, 32, r1 # Get remainder in r1
zap r0, 0xf0, r0 # Keep only quotient in r0
nop # for alignment
d32end: cmoveq r0, r16, r1 # Move remainder to r1 for quotient=0 case
ret r31, (r28) # Not a real software procedure return
# Large divisor case core routine for 64b
#
# r0 - quotient (output)
# r1 - remainder (output)
# r16 - dividend (range 0..2^64-1 - overwritten)
# r17 - divisor (range 1..2^63-1 - overwritten)
# r18 - scratch
# r19 - not used (one temp for 'caller')
# r26 - not used (expected to contain main return address)
# r27 - points to table of inverses (overwritten)
# r28 - this "subroutine" return address
#
# Inputs: dividend in r16, divisor in r17
# Outputs: quotient in r0, remainder in r1
# Destroys: r16,r17,r18,r27
#
# Note- this routine could save a few cycles if we could use
# another scratch register -- perhaps by pushing one on the stack?
#
#.align 4
div64: sll r17, 32, r18 # Position for ediv32
cmpule r17, r16, r0 # Is divisor leq dividend?
srl r17, 31, r1 # Is divisor geq 2^31?
beq r0, d64end # If divisor > dividend, quotient=0
cmpule r18, r16, r0 # Is divisor*2^32 leq dividend?
sll r17, 8, r1 # Position for ediv32 checking
or r1, r0, r0 # 0 if divisor & quotient fit in 32 bits
beq r0, ediv32 # Use 32-bit routine if OK
# Full 64-bit divide needed. Use the table of shift amounts to compute
# the number of leading zero bits in the divisor. Find the leftmost
# nonzero byte, then the leftmost nonzero bit in that byte. Table entry
# #n+1 contains the number of bits needed to hold n (1..8). We know the
# divisor is nonzero here.
#
cmpbge r31, r17, r0 # Get a zero bit for each nonzero byte
#stall
sll r0, 4, r0 # *16 bytes per table entry
#stall
subq r27, r0, r0 # table base plus complement...
#stall
ldq r1, 256*16(r0) # get position of first nonzero
#2 stalls
subq r1, 1, r1 # byte number of first nonzero
extbl r17, r1, r0 # get first nonzero byte
#stall
addq r0, r0, r0 # *2
s8addq r0, r27, r0 # *16 bytes per table entry
#stall
ldq r0, 16(r0) # bit number of first nonzero
negq r1, r1 # 1 + #leading zero bytes (mod 8)
#stall
s8subq r1, r0, r0 # number of leading zero bits
and r0, 0x3F, r0 # discard other junk
# The following code does a similar normalize calculation without the table.
#===
# extll r17, #4, r18 ; Normalize the divisor and
# mov #63, r0 ; count leading zeros
# cmovne r18, #31, r0
# cmoveq r18, r17, r18
# ;stall
# extwl r18, #2, r1
# ;stall
# cmovne r1, r1, r18
# cmovne r1, #16, r1
# ;stall
# subq r0, r1, r0
# extbl r18, #1, r1
# ;stall
# cmovne r1, r1, r18
# cmovne r1, #8, r1
# ;stall
# subq r0, r1, r0
# andnot r18, #^x0f, r1
# cmovne r1, r1, r18
# cmovne r1, #4, r1
# ;stall
# subq r0, r1, r0
# andnot r18, #^x33, r1
# cmovne r1, r1, r18
# cmovne r1, #2, r1
# ;stall
# subq r0, r1, r0
# andnot r18, #^x55, r1
# cmovne r1, #1, r1
# ;stall
# subq r0, r1, r0
#===
# R0 contains number of leading zero bits in the divisor.
sll r17, r0, r17 # Normalize: MSB is set.
# Now break divisor into pieces a+x, where a is the leading
# 9 bits, rounded, and x is the rest. Use a linear
# approximation for 1/divisor = 1/a - x/a^2 [+ x^2/a^3 -...]
#
srl r17, 64-10, r1 # Keep 10 bits of divisor
#stall
addq r1, 1, r1 # Round to form 'a'
andnot r1, 1, r1
s8addq r1, r27, r27 # Index table of 1/a and 1/a^2
sll r1, 64-10, r1 # shift 'a' to match divisor
ldq r18, (r27) # Load QW containing 1/a^2
subq r1, r17, r1 # -x = a - divisor
beq r1, d64_easy # Use table directly if x=0
inswl r18, 6, r18 # position 1/a^2
blt r1, d64_sub # correct for sign of -x
umulh r1, r18, r1 # -x/a^2
ldq r27, 8(r27) # Load QW containing 1/a - 1
br r31, d64_cont
d64_sub:umulh r1, r18, r1 # -x/a^2
ldq r27, 8(r27) # load QW containing 1/a - 1
# 2 stalls
s4addq r18, 0, r18
subq r27, r18, r27 # correct for sign of -x
d64_cont:
# many stalls
s4addq r1, r27, r18 # 1/divisor approx= 1/a - x/a^2
# Now one or two Newton iterations to get 24 or 56 good bits of the inverse.
# Each computes inv = inv * (2 - inv*divisor). We could skip out early
# here or above if the dividend and/or quotient is small enough for the
# amount of precision we've developed...
#
# We handle quadwords with the radix point on the left. The divisor has
# been normalized to the range 0.5 < divisor < 1.0; the inverses are in
# the range 1.0 < inverse < 2.0, and are represented without the leading 1.
#
umulh r18, r17, r1 # (inv0 - 1) * divisor
# many stalls
addq r1, r17, r1 # add hidden bit * divisor
negq r1, r1 # 2 - inv0*divisor, very near 1.0
umulh r18, r1, r27 # (inv0 - 1) * (2 - inv0*divisor)
cmovlt r1, 0, r18 # keep inv0 if (2-inv0*divisor) > 1.0
#stall
addq r18, r1, r1 # add it to hidden bit * (2-inv0*divisor)
# many stalls
addq r27, r1, r18 # inv1 = inv0 * (2 - inv0*divisor)
umulh r18, r17, r1 # (inv1 - 1) * divisor
# many stalls
addq r1, r17, r1 # add hidden bit * divisor
negq r1, r1 # 2 - inv1*divisor, very near 1.0
umulh r18, r1, r27 # (inv1 - 1) * (2 - inv1*divisor)
cmovlt r1, 0, r18 # keep inv1 if (2-inv1*divisor) > 1.0
addq r18, r1, r1 # add it to hidden bit * (2-inv1*divisor)
# many stalls
addq r27, r1, r1 # inverse = inv1 * (2 - inv1*divisor)
umulh r1, r16, r18 # dividend * (1/divisor - 1)
srl r17, r0, r17 # un-normalize divisor
negq r0, r0
subq r0, 8, r0 # how far right after first byte
# many stalls
addq r18, r16, r18 # add hidden bit * dividend
cmpult r18, r16, r1 # did it carry?
srl r18, 8, r18 # start to shift
sll r1, 56, r1 # position the carry
#stall
addq r1, r18, r1 # add the carry
srl r1, r0, r0 # final shift
mulq r17, r0, r1 # try out this quotient
# many stalls
subq r16, r1, r1 # form remainder
cmpule r17, r1, r18 # must be less than divisor
subq r1, r17, r27
cmovne r18, r27, r1 # if not, decrement remainder
addq r0, r18, r0 # and increment quotient
ret r31, (r28) # done
d64_easy:
ldq r1, 8(r27) # get 1/divisor, except hidden bit
srl r17, r0, r17 # un-normalize divisor again
blt r18, d64_pow2 # skip if power of 2
umulh r1, r16, r18 # dividend/divisor
negq r0, r0 # how far right to shift
and r16, 0x0ff, r1 # pieces of dividend
subq r0, 8, r0 # how far right after first byte
srl r16, 8, r27
# many stalls
addq r18, r1, r1 # add low piece of dividend, no carry
srl r1, 8, r1 # make room for high piece
#stall
addq r1, r27, r1 # finish adding hidden bit * dividend
srl r1, r0, r0 # final shift
mulq r17, r0, r1 # need to compute remainder too
# many stalls
subq r16, r1, r1
ret r31, (r28) # done
d64_pow2:
not r0, r0 # how far right to shift quotient
subq r17, 1, r1 # mask for remainder
srl r16, r0, r0 # shift for quotient
and r16, r1, r1 # get remainder
ret r31, (r28) # done
d64end: mov r16, r1 # Remainder to r1 for small quotient case
ret r31, (r28) # Not a real software procedure return
# long ots_div_l_o(long dividend, long divisor)
# signed 64 bit division support, overflow detection
#
#.align 4
.globl _OtsDivide64Overflow
.aent _OtsDivide64Overflow
_OtsDivide64Overflow:
#ifdef VMS
ldq r27, <ots_div_addr-ots_div_l_o>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
not r17, r1 # is the divisor -1?
bne r1, dl_skip # continue if not
negqv r16, r0 # q = -dividend, oflow on ^x800000000 00000000
ret r31, (r26)
nop
# long ots_div_l(long dividend, long divisor)
# signed 64 bit division support, no overflow detection
#
.globl _OtsDivide64
.aent _OtsDivide64
_OtsDivide64:
#ifdef VMS
ldq r27, <ots_div_addr-ots_div_l>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
dl_skip:
xor r16, r17, r19 # sign bit = result needs to be complimented (here to handle -maxint correctly)
dl_retry:
lda r28, -table_max(r17) # test for table lookup
ble r17, dl_notpos # not a positive divisor case
addq r17, r17, r0 # compute divisor*2
negq r16, r18 # part 1 of abs(dividend) -> r18
bgt r28, dl_lrgdiv # branch if large divisor
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
srl r16, 33, r1 # can this be handled via a 32 bit case?
cmpule r17, r18, r0 # divisor <= dividend?
bne r1, dl_64bit # does this need to be a real 64 bit case?
cmovge r16, r16, r18 # part 2 abs. val of the dividend -> r18
beq r0, dl_end # if not, result is zero
ldq r27, recip32_o(r27) # load 32b approximate reciprocal
sra r19, 63, r19 # get 0/-1
blt r27, dl_smpwr2 # skip umulh for powers of 2 specially
umulh r18, r27, r0 # start multiplication
beq r19, dl_end # if compliment not required, let umulh "hang out"
negq r0, r0 # compliment case
ret r31, (r26) #
dl_64bit:
cmovge r16, r16, r18 # part 2 abs. val of the dividend -> r18
beq r0, dl_end # if not, result is zero
ldq r1, recip64_o(r27) # load approximate reciprocal
sra r19, 63, r19 # get 0/-1
ldq r27, shift_o(r27) # load shift count (low 6 bits are all that matters)
beq r1, dl_smpwr2 # skip umulh for powers of 2 specially
umulh r18, r1, r0 # start multiplication
addq r0, r18, r18 # add hidden bit
dl_smpwr2:
srl r18, r27, r18 # shift the result into place
xor r18, r19, r18 # conditionally compliment
subq r18, r19, r0 # and increment for the final value
dl_end: ret r31, (r26) #
# Zero or negative divisor case. If just a negative divisor,
# compliment both dividend and divisor and do things again.
dl_notpos:
beq r17, divzer # division by zero
negq r17, r17 # |divisor|, note that 0x80000000 00000000 still appears negative
negq r16, r16 # compliment dividend
bgt r17, dl_retry # dispatch back for normal case (not 0x80000000 00000000 or 0)
cmpeq r16, r17, r0 # -maxint/-maxint = 1, all others 0
ret r31, (r26) # done
# Large divisor for signed 64/64 case
#
dl_lrgdiv:
sra r19, 63, r19 # get 0/-1
cmovlt r16, r18, r16 #
bsr r28, div64 # go use core routine
xor r0, r19, r0 # conditionally compliment
subq r0, r19, r0 # and increment for the final value
ret r31, (r26) # done
# long ots_rem_l(long dividend, long divisor)
# signed 64 bit remainder support
#
#.align 4
.globl _OtsRemainder64
.aent _OtsRemainder64
_OtsRemainder64:
#ifdef VMS
ldq r27, <ots_div_addr-ots_rem_l>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
negq r17, r18 # first part of abs(divisor)
cmovlt r17, r18, r17 # second part of abs(divisor)
subq r17, 1, r1 # start checking for power of 2
and r17, r1, r0 # finish check for power of 2
sra r16, 63, r19 # get -1/0 if dividend was negative
negq r16, r18 # first part of abs(dividend)
cmovlt r16, r18, r16 # second part of abs(dividend)
beq r0, rl_pwr2 # for powers of two, simply do a mask (not power of 2 include 0 and 80000000)
lda r28, -table_max(r17) # test for table lookup
bgt r28, rl_lrgdiv # branch if large divisor
addq r17, r17, r0 # compute divisor*2 for table lookup
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
ldq r1, recip64_o(r27) # load approximate reciprocal
ldq r18, shift_o(r27) # load shift amount
umulh r16, r1, r0 # multiplication for division step
addq r0, r16, r0 # add hidden bit
srl r0, r18, r0
mulq r0, r17, r0 # multiply back to get value to subtract
subq r16, r0, r0 # get abs of final result
xor r0, r19, r0 # start compliment if original dividend was <0
subq r0, r19, r0 # finish compliement
ret r31, (r26) # and return
# Handle powers of 2, including 0 and 80000000 00000000
rl_pwr2:
negq r16, r18 # first part of abs(dividend)
cmovlt r16, r18, r16 # second part of abs(dividend)
and r16, r1, r0 # use the divisor-1 mask in r1
beq r17, divzer # division by zero
xor r0, r19, r0 # start compliment if original dividend was <0
subq r0, r19, r0 # finish compliement
ret r31, (r26)
rl_lrgdiv:
bsr r28, div64 # use the core routine getting the remainder in r1
xor r1, r19, r0 # start compliment if original dividend was <0
subq r0, r19, r0 # finish complement
ret r31, (r26)
# long ots_mod_l(long dividend, long modulus)
# signed 64 bit modulus support
#
# This entry could be MUCH more optimized. It doesn't even try to use
# UMULH division currently... (A casualty of time-to-market.)
# Note that mod is only used by Ada and PL/I.
#
#.align 4
.globl _OtsModulus64
.aent _OtsModulus64
_OtsModulus64:
#ifdef VMS
ldq r27, <ots_div_addr-ots_rem_l>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
negq r17, r18 # first part of abs(divisor)
cmovge r17, r17, r18 # second part of abs(divisor)
subq r18, 1, r1 # start checking for power of 2
beq r17, divzer # check for 0
and r18, r1, r0 # second part of power-of-2 check
beq r0, ml_p2 # for powers of two, simply do a mask
# (note that the power-of-2 case MUST be used to handle
# the -maxint case due to the way the fix-up info is
# saved across the core routine call)
xor r16, r17, r28 # get xor of signs
clr r19 # don't need a bias if dividend and divisor have same sign
cmovlt r28, r17, r19 # bias is original divisor for different sign case
and r16, r17, r28 # if both dividend & divisor were neg. need to negate result
mov r18, r17 # move abs(divisor) into r17
negq r16, r18 # first part of abs(dividend)
cmovlt r16, r18, r16 # second part of abs(dividend)
cmplt r28, r31, r0 # get 1 if both operands were <0
sll r0, 63, r0 # get bit as the high bit
bis r0, r19, r19 # and MERGE with bias (0 -> no fixup, -maxint -> negate result,
# divisor > 0 - subtract remainder if non-zero, divisor < 0 -
# add remainder if non-zero)
bsr r28, div64 # use the core routine getting the remainder in r1
cmoveq r1, r31, r19 # don't do any fix-up if the remainder was zero
addq r19, r19, r18 # check to see if this is the negative/negative case, which just gets a negated remainder
subq r19, 1, r28 # wrap -maxint to positive
negq r1, r0 # move negated value, may abort later
cmovlt r28, r1, r0 # if both positive, or negative divisor, keep positive remainder
cmoveq r18, r31, r19 # now that negation is done, treat -maxint case as 0
addq r19, r0, r0 # add any bias (original divisor or 0)
ret r31, (r26) # and return
ml_p2: cmovge r17, r31, r17 # no bias if divisor was >= 0
and r16, r1, r1 # use the divisor-1 mask that's already in r1
cmoveq r1, r31, r17 # use zero if result was zero
addq r17, r1, r0 # do any biasing
ret r31, (r26) # and return
# unsigned long ots_div_ul(unsigned long dividend, unsigned long divisor)
# unsigned 64 bit division support
#
nop #.align 4
.globl _OtsDivide64Unsigned
.aent _OtsDivide64Unsigned
_OtsDivide64Unsigned:
#ifdef VMS
ldq r27, <ots_div_addr-ots_div_ul>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
lda r28, -table_max(r17) # test for table lookup
blt r17, dul_big # big divisors can (must) be handled by a simple comparison
addq r17, r17, r18 # compute divisor*2
srl r16, 33, r19 # can this be handled via the fast path for 31 bit dividends?
beq r17, divzer # check for 0
s8addq r18, r27, r18 # finish computing table entry addr (table addr+divisor*16)
bgt r28, dul_lrgdiv # branch if large divisor
cmpule r17, r16, r0 # is the dividend < divisor?
bne r19, dul_64bit # if the dividend doesn't fit in 31 bits, use the larger umulh form
ldq r27, recip32_o(r18) # load approximate 32b reciprocal & shift count
beq r0, dul_end # fast out for divisor > dividend
blt r27, dul_smpwr2 # go handle powers of 2 specially
umulh r16, r27, r0 # 32b recip
ret r31, (r26) #
# the 64 bit case is at a disadvantage to the 32b case because it needs
# a fix-up at the end, which prevents the latency of the umulh from
# being partially absorbed by the procedure return and anything that
# immediately follows that doesn't interlock.
nop
dul_64bit:
ldq r1, recip64_o(r18) # load approximate 64b reciprocal
ldq r27, shift_o(r18) # load shift count (low 6 bits are all that matters)
beq r0, dul_end # fast out for divisor > dividend
beq r1, dul_smpwr2 # go handle powers of 2 specially
umulh r16, r1, r0 # start multiplication for division step
zap r16, 0x0f, r18 # split dividend into two parts
zapnot r16, 0x0f, r16
srl r18, r27, r18 # position the high part
addq r0, r16, r0 # add hidden * low dividend (no carry)
srl r0, r27, r0 # shift into place
addq r0, r18, r0 # add hidden * high dividend
ret r31, (r26)
dul_smpwr2:
srl r16, r27, r0 # shift the result into place
dul_end: ret r31, (r26) #
dul_lrgdiv:
bsr r28, div64 # use the core routine
ret r31, (r26)
# divisor with the sign bit set. two possible results,
# 1 if divisor <= dividend, or 0 otherwise
dul_big:
cmpule r17, r16, r0
ret r31, (r26)
# long unsigned ots_rem_ul(long unsigned dividend, long unsigned divisor)
# unsigned 64 bit remainder support
#
#.align 4
.globl _OtsRemainder64Unsigned
.aent _OtsRemainder64Unsigned
_OtsRemainder64Unsigned:
#ifdef VMS
ldq r27, <ots_div_addr-ots_rem_ul>(r27)# start loading address of division data area
#endif
#ifdef OSF
ldgp gp, 0(r27) # load the global pointer
.frame sp, 0, r26
lda r27, _OtsDivData # start loading address of the division data area
#endif
#ifdef WNT
.frame sp, 0, r26
lda r27, _OtsDivData # load the division data table address
#endif
lda r28, -table_max(r17) # test for table lookup
subq r17, 1, r1 # first part of power-of-2 check
blt r17, rul_big # big divisors can (must) be handled by a simple comparison
and r17, r1, r18 # second part of power-of-2 check
bgt r28, rul_lrgdiv # branch if large divisor
addq r17, r17, r0 # compute divisor*2 for table lookup
beq r18, rul_pwr2 # if zero, divisor is a power of 2
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
ldq r1, recip64_o(r27) # load approximate reciprocal
cmpult r16, r17, r18 # is the dividend < divisor?
bne r18, rul_lss # if so, fast exit
ldq r19, shift_o(r27) # load the shift count
umulh r16, r1, r0 # multiplication for division step
blt r16, rul_carry # careful handling if >= 2^63
addq r0, r16, r0 # add hidden bit * dividend
srl r0, r19, r0
mulq r0, r17, r0 # multiply back to get value to subtract
subq r16, r0, r0
ret r31, (r26) # and return
rul_carry:
zap r16, 0x0f, r18 # split dividend into two parts
zapnot r16, 0x0f, r1
srl r18, r19, r18 # position the high part
addq r0, r1, r0 # add hidden * low dividend (no carry)
srl r0, r19, r0 # shift into place
addq r0, r18, r0 # add hidden * high dividend
mulq r0, r17, r0 # multiply back to get value to subtract
subq r16, r0, r0
ret r31, (r26)
rul_pwr2:
beq r17, divzer # check for 0
and r16, r1, r0 # use x-1 to mask
ret r31, (r26)
rul_lss:
mov r16, r0
ret r31, (r26)
# divisors with the sign bit set. two possible results,
# dividend if dividend < divisor, or dividend-divisor otherwise
rul_big:
cmpult r16, r17, r1
subq r16, r17, r0
cmovne r1, r16, r0
ret r31, (r26)
nop
rul_lrgdiv:
bsr r28, div64 # use the core routine getting the remainder in r1
mov r1, r0 # return remainder as the result in r0
ret r31, (r26)
# Division-by-zero handling
# (forward branch from all routines, out of the way here as well.)
#
divzer: lda r16, GEN_INTDIV(r31) # load GENTRAP code for division by zero
clr r0 # return 0 for the result
clr r1 #
#ifdef VMS
gentrap # signal the error
#endif
#ifdef OSF
call_pal PAL_gentrap
#endif
#ifdef WNT
# Since I couldn't find this in a header file anywhere for NT...
#define PAL_gentrap 0xaa
call_pal PAL_gentrap
#endif
ret r31, (r26) # return (in case someone tries to continue)
.set at
.set reorder
.end _OtsDiv