Diffstat (limited to 'private/ntos/tdi/tcpip/tcp/alpha/xsum.s')
-rw-r--r-- private/ntos/tdi/tcpip/tcp/alpha/xsum.s | 271
1 file changed, 271 insertions(+), 0 deletions(-)
diff --git a/private/ntos/tdi/tcpip/tcp/alpha/xsum.s b/private/ntos/tdi/tcpip/tcp/alpha/xsum.s
new file mode 100644
index 000000000..032d40747
--- /dev/null
+++ b/private/ntos/tdi/tcpip/tcp/alpha/xsum.s
@@ -0,0 +1,271 @@
+// TITLE("Compute Checksum")
+//++
+//
+// Copyright (c) 1994 Microsoft Corporation
+//
+// Module Name:
+//
+// xsum.s
+//
+// Abstract:
+//
+// This module implements a function to compute the checksum of a buffer.
+//
+// Author:
+//
+// John Vert (jvert) 11-Jul-1994
+//
+// Environment:
+//
+// Revision History:
+//
+//--
+
+#include "ksalpha.h"
+
+
+ SBTTL("Compute Checksum")
+//++
+//
+// ULONG
+// tcpxsum (
+// IN ULONG Checksum,
+// IN PUSHORT Source,
+// IN ULONG Length
+// )
+//
+// Routine Description:
+//
+// This function computes the checksum of the specified buffer.
+//
+// Arguments:
+//
+// Checksum (a0) - Supplies the initial checksum value.
+//
+// Source (a1) - Supplies a pointer to the checksum buffer.
+//
+// Length (a2) - Supplies the length of the buffer in bytes.
+//
+// Return Value:
+//
+// The computed checksum is returned as the function value.
+//
+//--
+
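+//
+// For reference, the computation below corresponds to the classic RFC 1071
+// style ones'-complement sum. A minimal C sketch (the name xsum_ref and the
+// plain C types are illustrative only, and the odd-start byte-swap handling
+// is omitted for brevity):
+//
+//  unsigned int
+//  xsum_ref(unsigned int Checksum, const unsigned short *Source,
+//      unsigned int Length)
+//  {
+//      unsigned long long Sum = Checksum;
+//
+//      while (Length > 1) {            // sum the buffer as 16-bit words
+//          Sum += *Source++;
+//          Length -= 2;
+//      }
+//      if (Length != 0) {              // pick up an odd trailing byte
+//          Sum += *(const unsigned char *)Source;
+//      }
+//      while ((Sum >> 16) != 0) {      // fold carries back into 16 bits
+//          Sum = (Sum & 0xFFFF) + (Sum >> 16);
+//      }
+//      return (unsigned int)Sum;
+//  }
+//
+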
+ LEAF_ENTRY(tcpxsum)
+ zap a0, 0xf0, a0 // clear high half of a0
+ bis a1, zero, t6 // save initial buffer address
+ bis zero, zero, v0 // clear accumulated checksum
+
+//
+// Check if the buffer is quadword aligned.
+//
+// If the buffer is not quadword aligned, then add the leading words to the
+// checksum.
+//
+ ldq_u t0, 0(a1) // get containing quadword of first part
+ blbc a1, 10f // check for word alignment
+ beq a2, 65f // if zero bytes, don't do anything
+ extbl t0, a1, t1 // get leading byte
+ sll t1, 8, v0 // shift it to correct spot for later byte swap
+ addq a1, 1, a1 // increment buffer to first full word
+ subq a2, 1, a2 // decrement byte count
+
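+//
+// In C terms, the leading odd byte is summed pre-shifted so that the final
+// byte swap puts it back in place (illustrative sketch; p, len, and sum are
+// hypothetical locals):
+//
+//  if ((unsigned long)p & 1) {
+//      sum = (unsigned long long)*p << 8;  // undone by the swap at 60:
+//      p += 1;
+//      len -= 1;
+//  }
+//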
+10:
+ and a1, 6, t2 // check if buffer quadword aligned
+ beq t2, 20f // if eq, quadword aligned
+ extql t0, t2, t0 // extract bytes to checksum
+ and a1, 7, t3 // compute byte offset within quadword
+ subq zero, t3, t3 // negate offset
+ addq t3, 8, t3 // compute bytes from a1 to qword boundary
+ addq a1, 8, a1 // advance buffer address to next qword
+ bic a1, 7, a1 // round down to quadword boundary
+ subq a2, t3, t2 // compute byte count remaining
+ blt t2, 55f // if ltz, too many, jump to residual code
+
+ addq v0, t0, v0 // add bytes to partial checksum
+ cmpult v0, t0, t1 // generate carry
+ addq t1, v0, v0 // add carry back into checksum
+
+ bis t2, zero, a2 // reduce count of bytes to checksum
+ beq t2, 60f // if eq, no more bytes
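+//
+// In C terms, the head alignment amounts to (illustrative sketch assuming a
+// little-endian load; p, off, and q are hypothetical locals):
+//
+//  unsigned int off = (unsigned long)p & 7;    // offset within quadword
+//  unsigned long long q = *(const unsigned long long *)(p - off);
+//  q >>= 8 * off;              // keep the bytes from p up to the boundary
+//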
+
+20:
+//
+// Compute the checksum in 64-byte blocks
+//
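+// Each quadword is accumulated with an end-around carry, which in C terms
+// is (illustrative sketch; sum and q are hypothetical locals):
+//
+//  sum += q;                   // 64-bit add
+//  sum += (sum < q);           // fold the carry back in
+//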
+ bic a2, 7, t4 // subtract out residual bytes
+ beq t4, 40f // if eq, no quadwords to checksum
+ subq zero, t4, t2 // compute negative of byte count
+ and t2, 15 << 2, t3 // compute bytes to skip in first iteration
+ ldq t0, 0(a1) // get first quadword to checksum
+ beq t3, 35f // if eq, full 64-byte block
+ subq a1, t3, a1 // bias buffer address by offset
+ bic t4, 64-1, t4 // subtract out bytes in first iteration
+ lda t2, 30f // get base address of code vector
+ addl t3, t3, t3 // double: 16 bytes of code per 8 bytes of data
+ addq t3, t2, t2 // compute code vector entry address
+ bis t0, zero, t1 // copy first quadword (entry step may add t0 or t1)
+ jmp (t2) // dispatch
+
+
+30:
+//
+// The following code vector computes the checksum of a 64-byte block.
+//
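+// The dispatch above can land partway into this vector, so a partial first
+// block executes only its final steps -- the same idea as a C switch with
+// fallthrough (Duff's device). Illustrative sketch only:
+//
+//  switch (skip) {             // quadwords to skip, 0..7
+//  case 0: sum += q[0]; sum += (sum < q[0]);   // fall through
+//  case 1: sum += q[1]; sum += (sum < q[1]);   // fall through
+//  /* ... */
+//  case 7: sum += q[7]; sum += (sum < q[7]);
+//  }
+//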
+.set noreorder
+ ldq t1, 8(a1)
+ addq v0, t0, v0
+ cmpult v0, t0, t2
+ addq v0, t2, v0
+
+ ldq t0, 16(a1)
+ addq v0, t1, v0
+ cmpult v0, t1, t2
+ addq v0, t2, v0
+
+ ldq t1, 24(a1)
+ addq v0, t0, v0
+ cmpult v0, t0, t2
+ addq v0, t2, v0
+
+ ldq t0, 32(a1)
+ addq v0, t1, v0
+ cmpult v0, t1, t2
+ addq v0, t2, v0
+
+ ldq t1, 40(a1)
+ addq v0, t0, v0
+ cmpult v0, t0, t2
+ addq v0, t2, v0
+
+ ldq t0, 48(a1)
+ addq v0, t1, v0
+ cmpult v0, t1, t2
+ addq v0, t2, v0
+
+ ldq t1, 56(a1)
+ addq v0, t0, v0
+ cmpult v0, t0, t2
+ addq v0, t2, v0
+
+ addq a1, 64, a1
+ addq v0, t1, v0
+ cmpult v0, t1, t2
+ addq v0, t2, v0
+.set reorder
+
+ beq t4, 40f // if zero, end of block
+
+35:
+ ldq t0, 0(a1)
+//
+// The following loop is allowed to be reordered by the assembler for
+// optimal scheduling. It is never branched into.
+//
+ subq t4, 64, t4 // reduce block byte count by 64
+
+ ldq t1, 8(a1)
+ addq v0, t0, v0
+ cmpult v0, t0, t2
+ addq v0, t2, v0
+
+ ldq t0, 16(a1)
+ addq v0, t1, v0
+ cmpult v0, t1, t2
+ addq v0, t2, v0
+
+ ldq t1, 24(a1)
+ addq v0, t0, v0
+ cmpult v0, t0, t2
+ addq v0, t2, v0
+
+ ldq t0, 32(a1)
+ addq v0, t1, v0
+ cmpult v0, t1, t2
+ addq v0, t2, v0
+
+ ldq t1, 40(a1)
+ addq v0, t0, v0
+ cmpult v0, t0, t2
+ addq v0, t2, v0
+
+ ldq t0, 48(a1)
+ addq v0, t1, v0
+ cmpult v0, t1, t2
+ addq v0, t2, v0
+
+ ldq t1, 56(a1)
+ addq v0, t0, v0
+ cmpult v0, t0, t2
+ addq v0, t2, v0
+
+ addq a1, 64, a1
+ addq v0, t1, v0
+ cmpult v0, t1, t2
+ addq v0, t2, v0
+
+ bne t4, 35b // if ne zero, not end of block
+
+40:
+//
+// Check for any remaining bytes.
+//
+ and a2, 7, a2 // isolate residual bytes
+ beq a2, 60f // if eq, no residual bytes
+50:
+//
+// Checksum remaining bytes.
+//
+// The technique we use here is to load the final quadword, then
+// zero out the bytes that are not included.
+//
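+// In C terms (illustrative sketch assuming a little-endian load; p, n, and
+// sum are hypothetical locals):
+//
+//  unsigned long long q = *(const unsigned long long *)p;
+//  q &= ~(~0ULL << (8 * n));   // zero the bytes past the n-byte remainder
+//  sum += q;
+//  sum += (sum < q);           // end-around carry
+//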
+ ldq t0, 0(a1) // get quadword surrounding remainder
+55:
+ ornot zero, zero, t1 // get FF mask
+ sll t1, a2, t2 // shift to produce byte mask
+ zap t0, t2, t0 // zero out bytes past end of buffer
+ addq v0, t0, v0 // add quadword to partial checksum
+ cmpult v0, t0, t1 // generate carry
+ addq t1, v0, v0 // add carry back into checksum
+60:
+//
+// Byte swap the 64-bit checksum if the start of the buffer was not word aligned
+//
+ blbc t6, 65f // if word aligned, no swap needed
+ zap v0, 0xAA, t0 // isolate even bytes
+ sll t0, 8, t0 // shift even bytes into odd positions
+ srl v0, 8, t1 // shift odd bytes into even positions
+ zap t1, 0xAA, t1 // isolate odd bytes
+ bis t0, t1, v0 // merge bytes back together
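+//
+// Equivalent C for the swap (illustrative sketch):
+//
+//  sum = ((sum & 0x00FF00FF00FF00FFULL) << 8) |
+//        ((sum >> 8) & 0x00FF00FF00FF00FFULL);
+//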
+
+65:
+//
+// Add the computed checksum to the original checksum, and fold the 64-bit
+// result down to 16 bits.
+//
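+// In effect (illustrative C sketch; the code below folds each carry in the
+// same add by swapping halves first):
+//
+//  sum += initial;
+//  sum += (sum < initial);                     // end-around carry
+//  sum = (sum & 0xFFFFFFFF) + (sum >> 32);     // fold longwords
+//  while ((sum >> 16) != 0)                    // fold words and carries
+//      sum = (sum & 0xFFFF) + (sum >> 16);
+//  return (unsigned int)sum;
+//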
+ addq v0, a0, v0 // add computed checksum to original
+ cmpult v0, a0, t0 // generate carry
+ addq v0, t0, v0 // add carry back into checksum
+
+//
+// Swap the longwords in order to sum two longwords and their carry in one add.
+//
+ sll v0, 32, t0 // shift low longword into high
+ srl v0, 32, t1 // shift high longword into low
+ bis t1, t0, t5 // merge back together
+
+ addq v0, t5, t0 // produce sum + carry in high longword
+ srl t0, 32, t1 // shift back down to low half
+//
+// Swap the words in order to sum two words and their carry in one add.
+//
+ sll t1, 16, t2 // shift low word into high
+ srl t1, 16, t3 // shift high word into low
+ bis t2, t3, t4 // merge back together
+ addq t4, t1, t2 // produce sum and carry in high word
+ extwl t2, 2, v0 // extract result.
+ ret zero, (ra) // return
+
+ .end tcpxsum
+