#++
# Copyright 1991, 1994, Digital Equipment Corporation
#
# ots_movem(char *dstptr INOUT, long dstlen INOUT,
# char *srcptr, long srclen)
#
# Move min(dstlen, srclen) characters from *srcptr to *dstptr, possibly overlapping
#
# Special conventions: No stack space, r16-r21 and r27-r28 ONLY,
# no linkage pointer required, r16 is INOUT and points to the address
# following the move, r17 is INOUT and has the remaining destination
# length following the move.
# (Warning: The auto-loader potentially takes some regs across
# the call if this is being used in a shared lib. environment.)
#
# This is a GEM support routine for moving (possibly overlapping) memory
# from one address to another. This is optimized for extremely high
# performance both for small blocks and large moves. In order to reduce
# overhead for small cases, they are retired as quickly as possible;
# more case analysis is reserved for cases which will do more. Note
# that while overlapping moves are supported (unlike Sys V memcpy
# routines), they are not quite as fast.
#
# Warning - This code is basically "expanded microcode". Since it is
# executed so frequently in many contexts, it has been extensively "hand-
# optimized"...
#
# Note that this routine and ots_move are basically similar in many
# respects (same basic code), so maintenance should be done both
# places. This routine is primarily provided for lower overhead (for
# short strings).
# [Except for the first few instructions, the recipe for creating OTS_MOVEM
# from OTS_MOVE is to change uses of R19->R21 and then R17->R19.]
#
# This version of OTS_MOVEM provides longword granularity.
#
# 015 1 Sep 1994 WBN Longword granularity version, based on
# OTS_MOVEM_ALPHA.M64 version 014 and
# OTS_MOVE_ALPHA_WNT.M64 version 015.
#--
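#
# The longword-granularity note above means the routine never writes a
# byte outside the destination: partial longwords at either end are done
# with an aligned 4-byte read-modify-write (LDL/MSKxx/OR/STL), which is
# safe next to data owned by other threads.  A minimal C sketch of that
# idiom (hypothetical helper; mask selects the destination's byte lanes):
#
#   #include <stdint.h>
#   /* Merge 'bytes' into the aligned longword at p, touching only the
#      byte lanes selected by mask. */
#   static inline void store_partial_lw(uint32_t *p, uint32_t bytes,
#                                       uint32_t mask)
#   {
#       *p = (*p & ~mask) | (bytes & mask);  /* LDL / MSK / OR / STL */
#   }
#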
#include "ots_defs.hs"
# r16 = dst --> r16 = end
# r17 = dst_len --> r17 = remaining
# r18 = src
# r19 = src_len
# destroys r18-r21, r27-r28
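#
# For orientation, a rough C sketch of the semantics (hypothetical names;
# the real routine uses the special register conventions above rather
# than the standard calling sequence):
#
#   #include <string.h>
#   void ots_movem(char **dst, long *dstlen, char *src, long srclen)
#   {
#       long len = (*dstlen < srclen) ? *dstlen : srclen;  /* min */
#       memmove(*dst, src, len);     /* possibly-overlapping move   */
#       *dst    += len;              /* r16: address past the move  */
#       *dstlen -= len;              /* r17: remaining dst length   */
#   }
#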
.globl _OtsMoveMinimum
.ent _OtsMoveMinimum
_OtsMoveMinimum:
.set noat
.set noreorder
.frame sp,0,r26
.prologue 0
subq r17, r19, r20 # Which length is larger?
cmovlt r20, r17, r19 # Min to r19
andnot r16, 3, r21 # LW-aligned dst pointer
subq r19, 4, r20 # Get length-4
beq r19, done # No memory accesses if length=0
ldq_u r28, (r18) # Load first QW of source
addq r19, r18, r27 # Point to end of source
subq r17, r19, r17 # Set remaining length for return
bge r20, geq4 # Go handle lengths >= 4
ldq_u r27, -1(r27) # Load last QW of source
and r16, 3, r16 # Get dst alignment within LW
ldl r19, (r21) # Load first LW of destination
addq r20, r16, r20 # Get alignment+length-4
extql r28, r18, r28 # Extract first bytes of source
bgt r20, double # Go handle LW crossing
extqh r27, r18, r27 # Extract last bytes of source
addq r20, 4, r20 # Get ending alignment in LW
or r27, r28, r28 # Combine halves of source
insql r28, r16, r28 # Position low part of source
mskql r19, r16, r18 # Keep low bytes of destination
mskql r28, r20, r28 # Trim off high bytes of source
mskqh r19, r20, r19 # Keep high bytes of destination
or r18, r28, r28 # Combine source with low dest
or r19, r28, r28 # Combine with high dest
stl r28, (r21) # Store to destination
addq r21, r20, r16 # Point to end of dest for return
ret r31, (r26)
double: extqh r27, r18, r27 # Extract last bytes of source
ldl r18, 4(r21) # Load second LW of destination
mskql r19, r16, r19 # Keep low bytes of 1st dest LW
or r27, r28, r28 # Combine parts of source
insql r28, r16, r27 # Position start of source
addq r16, 4, r16 # Compute virtual start in LW
insqh r28, r16, r28 # Position end of source
addq r21, 4, r21 # Prepare to compute end address
mskqh r18, r20, r18 # Keep high bytes of 2nd dest LW
mskql r28, r20, r28 # Trim end of source to length
or r27, r19, r19 # Combine low source with 1st LW
stl r19, -4(r21)
or r28, r18, r18 # Combine high source with 2nd LW
stl r18, (r21)
addq r21, r20, r16 # Point to end of dest for return
done: ret r31, (r26)
# Come here to move >= 4 bytes.
#
# r16-> dst
# r17 = remaining length for return
# r18-> src
# r19 = length
# r20 = len-4
# r21-> LW-aligned dst
# r27 = src+len
# r28 = first src QW
geq4: subq r20, 4, r19 # At least 8 bytes to move?
subq r16, r27, r27 # Check if dst >= src+len
blt r19, lss8 # Move 4..7 bytes
subq r18, r16, r19 # Check if src >= dst
bge r27, ok1 # Forward OK if whole src precedes dst
blt r19, reverse # Go backwards if src < dst < src+len
ok1: and r16, 7, r16
addq r16, r20, r27 # Alignment + length - 4
bne r16, part # Part of first QW to be skipped
subq r20, 4, r20 # At least 8 bytes to be stored?
beq r27, simple # Only low LW to be stored
and r18, 7, r27 # Is src address now aligned?
blt r20, shortq # Dst ends in first QW
subq r20, 32, r19 # At least 4 quadwords left to move?
beq r27, align # Go handle matching alignment
# Src alignment differs from dst alignment.
# r16 = dst alignment
# r17 = remaining length for return
# r18 = src-8 after 1st move
# r19
# r20 = initial length-8
# r21 = initial dst
# r27 = dst QW if dst wasn't aligned
# r28 = source QW
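#
# The code below uses the standard Alpha unaligned-copy idiom: each
# source quadword is assembled from two overlapping LDQ_U loads combined
# with EXTQL/EXTQH and OR.  A hedged C sketch of one such load (little-
# endian, hypothetical helper; the real code also folds in dst alignment):
#
#   #include <stdint.h>
#   static inline uint64_t load_unaligned_qw(const unsigned char *src)
#   {
#       const uint64_t *p = (const uint64_t *)((uintptr_t)src & ~7ul);
#       unsigned s = (uintptr_t)src & 7;   /* byte offset within QW */
#       uint64_t lo = p[0];                /* first LDQ_U           */
#       if (s == 0) return lo;
#       uint64_t hi = p[1];                /* second LDQ_U          */
#       return (lo >> (8 * s)) | (hi << (64 - 8 * s));
#   }
#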
misal: or r16, r21, r21 # Put alignment back with dst ptr ***
ldq_u r19, 8(r18) # Load same or next source QW
extql r28, r18, r28 # Get first part of source to store
addq r20, r16, r20 # Adjust length for partial move
mskql r27, r21, r27 # Trim destination for merge
extqh r19, r18, r16 # Get second part of source
subq r20, 24, r20 # At least 4 more quadwords?
or r28, r16, r28 # Combine pieces of source
mskqh r28, r21, r28 # Trim low junk off source
andnot r21, 7, r21 # Adjust dst for partial move
bge r20, unrol2 # Taken branch for long strings
addq r20, 16, r16 # Add back: how many whole QW's?
nop
short2: and r20, 7, r20 # How many odd bytes?
blt r16, last # Skip if no more whole QW's
or r28, r27, r28 # Combine pieces
stq r28, (r21)
extql r19, r18, r27 # Get last part of prior src QW
ldq_u r19, 16(r18) # Load another src QW
addq r21, 8, r21 # Update dst
subq r16, 8, r16 # More whole QW's?
addq r18, 8, r18 # Update src
blt r16, lastx # Skip if no more whole QWs
extqh r19, r18, r28 # Get first part of this src QW
addq r18, 8, r18 # Update src again
or r28, r27, r28 # Combine pieces
stq r28, (r21)
extql r19, r18, r27 # Get last part of this src QW
ldq_u r19, 8(r18) # Load another src QW
addq r21, 8, r21 # Update dst
lastx: extqh r19, r18, r28 # Get first part of this src QW
last: addq r18, r20, r16 # Point to end-8 of src
beq r20, done_u # Skip if no odd bytes
or r28, r27, r28 # Combine parts of last whole QW
ldq_u r27, 7(r16) # Load final (maybe same) src QW
subq r20, 4, r16 # More than 4 bytes left?
stq r28, (r21) # Store last whole QW
extql r19, r18, r19 # Get last part of this src QW
extqh r27, r18, r27 # Get what we need from final src QW
joinx: ldq r28, 8(r21) # Load last QW of destination
or r19, r27, r27 # Combine pieces of source
mskql r27, r20, r27 # Trim to length
mskqh r28, r20, r28 # Make room in destination
bgt r16, done_u # Go store a whole QW
addq r20, 8, r20 # Increment length for return
or r28, r27, r28 # Insert src into dst
stl r28, 8(r21) # Final LW
addq r21, r20, r16 # Point to end of dst for return
ret r31, (r26)
# Come here to move 4 thru 7 bytes.
#
lss8: addq r18, r19, r27 # Recover src+len-8
and r16, 3, r16 # Dst alignment within LW
ldq_u r27, 7(r27) # Load last part of source
extql r28, r18, r28 # Extract first part of source
beq r16, lw # Handle LW-aligned dst
extqh r27, r18, r27 # Extract last part of source
ldl r18, (r21) # Load first LW of dst
addq r16, r20, r20 # align+len-4 of dst
or r28, r27, r28 # Complete source
mskql r28, r19, r28 # Trim source to length
mskql r18, r16, r18 # Make room in dst
insql r28, r16, r27 # Position src like dst
addq r16, r19, r19 # Align+len-8 of dst
or r27, r18, r18 # Merge
stl r18, (r21) # Store first LW of dst
extql r27, 4, r27 # Position next LW of src
blt r19, zz # Skip if not a whole LW
stl r27, 4(r21) # Store the whole LW
addq r21, 4, r21 # Adjust pointer
subq r20, 4, r20 # Adjust ending alignment
beq r19, donezz # Exit if done
insqh r28, r16, r27 # Position remainder of src
zz: ldl r28, 4(r21) # Load last dst LW
mskqh r28, r20, r28 # Make room in dst
or r28, r27, r27 # Merge
stl r27, 4(r21) # Final store
donezz: addq r21, r20, r16 # End address -4
addq r16, 4, r16
ret r31, (r26)
lw: extqh r27, r18, r27 # Extract last part of source
addq r21, 4, r16 # Adjust for return value
beq r20, lwdone # Skip if exactly 4 bytes
ldl r19, 4(r21) # Load next dst LW
or r27, r28, r28 # Complete source
stl r28, (r21) # Store first LW
extql r28, 4, r28 # Position rest of source
mskqh r19, r20, r27 # Make room in dst
mskql r28, r20, r28 # Trim src
or r27, r28, r28 # Merge
stl r28, 4(r21)
addq r16, r20, r16 # Update return value
ret r31, (r26)
lwdone: or r27, r28, r28 # Merge
stl r28, (r21)
ret r31, (r26)
# Move 4 bytes to an aligned LW.
#
simple: ldq_u r27, 3(r18) # Load last QW of source
extql r28, r18, r28 # Position first QW
addq r21, 4, r16 # Point to end of dst for return
extqh r27, r18, r27 # Position last QW
or r28, r27, r28 # Merge
stl r28, (r21) # Store
ret r31, (r26)
# Dst is not aligned. Check whether first write is to a LW or a QW,
# and whether that finishes the move. Then see if src alignment
# matches, and read/rewrite the first dst quadword.
#
# r16 = dst alignment in QW
# r17 = remaining length for return
# r18-> src
# r19
# r20 = len-4
# r21-> LW-aligned dst
# r27 = QW_alignment + length - 4
# r28 = first src QW
#.align quad
part: subq r27, 4, r19 # Does dst end in first QW?
ldq_u r27, (r21) # Load first dst QW
blt r19, shortu # Go handle short store
and r16, 4, r19 # Does it start in high LW?
subq r18, r16, r18 # Adjust src for this partial move
beq r19, quad # Whole QW to be touched
extql r28, r18, r19 # Position first part of source
ldq_u r28, 7(r18) # Get next (or same) src QW
mskql r27, r16, r27 # Trim destination for merge
addq r20, r16, r20 # Len + alignment...
extqh r28, r18, r28 # Position second part of source
subq r20, 4, r20 # Len+alignment-8 = remaining len
or r28, r19, r28 # Pieces of source
mskqh r28, r16, r19 # Trim junk preceding source
ldq_u r28, 7(r18) # Get src QW again **
or r27, r19, r19 # Combine other source piece
extql r19, 4, r19 # Get the high LW
stl r19, (r21) # Store just that
# Now at a QW boundary. Is there a QW left to store?
# Is the source QW aligned?
andnot r21, 7, r21 # Adjust dst pointer to next-8
subq r20, 8, r19 # Got a QW more?
and r18, 7, r27 # Src aligned?
blt r19, short3 # Too short
addq r21, 8, r21
subq r20, 8, r20
ldq_u r28, 8(r18)
addq r18, 8, r18
subq r20, 32, r19 # Prepare for unrolled loop
beq r27, align # Alignment matches
or r31, r31, r27
or r31, r31, r16
br r31, misal
shortu: addq r18, r20, r20 # Point to end-4 of src
ldq_u r20, 3(r20) # Get last QW of source
extql r28, r18, r28 # Fetch first QW of source
extqh r20, r18, r20 # Fetch last QW of source
mskql r27, r16, r18 # Clear from start thru end of dst
mskqh r27, r19, r27 # Clear from 0 to end of dst
or r28, r20, r28 # Combine src pieces
insql r28, r16, r28 # Position src
or r27, r18, r27 # Combine dst pieces
mskql r28, r19, r28 # Trim src
addq r21, r19, r20 # Final pointer for return
or r28, r27, r28 # Merge src & dst
stq_u r28, (r21) # Store it
addq r20, 8, r16
ret r31, (r26)
quad: and r18, 7, r19 # Is src address now aligned?
subq r20, 4, r20 # Get length-8
bne r19, misal # Go handle mismatched alignment
mskqh r28, r16, r28 # Keep desired part of source
addq r20, r16, r20 # Adjust count for this partial move
mskql r27, r16, r27 # Keep desired part of destination QW
subq r20, 32, r19 # At least 4 quadwords left to move?
or r27, r28, r28 # Merge source and destination
# Src alignment matches.
# r16
# r17 = remaining length for return
# r18 = next src pointer -8
# r19 = remaining length -32
# r20
# r21 = dst pointer
# r27
# r28 = dst quadword
align: and r19, 24, r20 # How many after a multiple of 4?
bge r19, unrol1 # Taken branch for long strings
nop
short1: and r19, 7, r19 # How many odd bytes?
beq r20, last28 # Skip if no more whole QWs after r28
ldq r27, 8(r18) # Load next QW
addq r18, 8, r18
stq r28, (r21) # Store prior QW
subq r20, 16, r20 # Map 8/16/24 to -8/0/8
addq r21, 8, r21
blt r20, last27 # Skip if no more after r27
ldq r28, 8(r18) # Load next QW
addq r18, 8, r18
stq r27, (r21) # Store prior QW
addq r21, 8, r21
nop
beq r20, last28
ldq r27, 8(r18) # Load next QW
addq r18, 8, r18
stq r28, (r21) # Store prior QW
addq r21, 8, r21
last27: beq r19, done27 # Skip if no odd bytes
ldq r28, 8(r18) # Load one more src QW
ldq r18, 8(r21) # Load last destination QW
subq r19, 4, r16 # More than 4 bytes to store?
stq r27, (r21) # Store prior QW
mskql r28, r19, r27 # Trim source
mskqh r18, r19, r18 # Trim destination
ble r16, lastl # Go store just a LW
lastq: addq r21, r19, r21 # End-8 of dst for return
or r27, r18, r27 # Merge src & dst
done27: stq_u r27, 7(r21) # Store last destination QW
addq r21, 8, r16 # End of dst for return
ret r31, (r26)
short3: addq r18, r20, r16 # Point to end-8 of src
beq r20, donexx # Completely done?
ldq_u r19, 7(r16) # Load final src QW
subq r20, 4, r16 # Got more than a LW?
beq r27, joinx # Don't include prior src if aligned
extql r28, r18, r27 # Last part of prior src QW
extqh r19, r18, r19 # First part of this src QW
br joinx
donexx: addq r21, r20, r16
addq r16, 8, r16
ret r31, (r26)
last28: beq r19, done28 # Skip if no odd bytes
ldq r27, 8(r18) # Load one more src QW
ldq r18, 8(r21) # Load last destination QW
subq r19, 4, r16 # More than 4 bytes to store?
stq r28, (r21) # Store prior QW
mskql r27, r19, r27 # Trim source
mskqh r18, r19, r18 # Trim destination
bgt r16, lastq # Go store a QW
lastl: addq r19, 8, r19 # Increment length for return
or r27, r18, r27 # Merge src & dst
stl r27, 8(r21) # Store last destination LW
addq r21, r19, r16 # End of dst for return
ret r31, (r26)
shortq: addq r18, r20, r16 # Point to end-8 of src
ldq r27, (r21) # Get dst QW
extql r28, r18, r28 # Position first src QW
ldq_u r19, 7(r16) # Get last QW of src
mskqh r27, r20, r27 # Mask dst QW
extqh r19, r18, r19 # Position last src QW
or r19, r28, r28 # Merge
mskql r28, r20, r28 # Trim src QW
done_u: addq r21, r20, r21 # End-8 of dst for return
or r28, r27, r28 # Combine pieces
done28: stq_u r28, 7(r21) # Store last destination QW
addq r21, 8, r16 # End of dst for return
ret r31, (r26)
# Unrolled loop for long moves with matching alignment within QW.
# Each iteration moves two cache blocks.
# We try to schedule the cache misses to avoid a double miss
# in EV4 pass 2.1 chips. If the source alignment within a cache
# block is exactly 3, alter it, since that case runs slower.
#
# R16
# R17 = remaining length for return
# R18 = src pointer
# R19 = remaining length (to load) - 32
# R20 = length & 24 (needed at return)
# R21 = dst pointer
# R27
# R28 = QW from 0(R18) to store at 0(R21), both on input and at return
#
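# Stripped of the software pipelining and cache-miss scheduling, each
# full iteration of the loop below is an 8-quadword aligned copy.  A
# hedged C sketch of the steady state (names follow the map above):
#
#   while (remaining >= 64) {          /* two 32-byte cache blocks */
#       for (int i = 0; i < 8; i++)    /* 8 quadwords per pass     */
#           dst[i] = src[i];           /* uint64_t *dst, *src      */
#       src += 8; dst += 8; remaining -= 64;
#   }
#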
#.align quad
unrol1: ldq r27, 32(r18) # Cache miss here; later loads hit.
subq r19, 48, r16 # Six more quadwords?
and r18, 16, r20 # Starting in 2nd half of cache block?
blt r16, uent1 # If not 6 more, don't adjust.
ldq r16, 8(r18)
beq r20, utop1 # If in 1st half, don't adjust.
ldq r27, 48(r18) # Cache miss here
addq r18, 16, r18
stq r28, (r21) # Adjust by going ahead 1/2 block.
addq r21, 16, r21
ldq r28, (r18)
subq r19, 16, r19
stq r16, -8(r21)
nop
ldq r16, 8(r18)
utop1: subq r19, 32, r19
uloop1: ldq r20, 64(r18) # Cache miss here
stq r28, (r21)
ldq r28, 16(r18)
stq r16, 8(r21)
ldq r16, 24(r18)
addq r18, 64, r18
stq r28, 16(r21)
mov r20, r28
stq r16, 24(r21)
addq r21, 64, r21
ldq r20, -24(r18)
subq r19, 32, r19
blt r19, uexit1
ldq r16, 32(r18) # Cache miss here
stq r27, -32(r21)
ldq r27, -16(r18)
stq r20, -24(r21)
ldq r20, -8(r18)
stq r27, -16(r21)
mov r16, r27
stq r20, -8(r21)
uent1: subq r19, 32, r19
ldq r16, 8(r18)
bge r19, uloop1
# finish last block of 4 quadwords
#
ubot1: stq r28, (r21)
mov r27, r28 # Position last QW for return
ldq r27, 16(r18)
addq r18, 32, r18
stq r16, 8(r21)
addq r21, 32, r21
uex1a: ldq r16, -8(r18)
and r19, 24, r20 # Recover count of remaining QW's
stq r27, -16(r21)
stq r16, -8(r21)
br r31, short1
nop
uexit1: stq r27, -32(r21) # Here if exit from middle of loop
ldq r27, -16(r18)
stq r20, -24(r21)
br r31, uex1a # Join common exit sequence
#.align quad
unrol2: ldq_u r16, 16(r18) # Load next src QW
extql r19, r18, r19 # Get last part of prior one
or r28, r27, r28 # Combine pieces
stq r28, (r21) # Store prior dst QW
subq r20, 24, r20 # Update loop counter
extqh r16, r18, r28 # Get first part of a src QW
ldq_u r27, 24(r18) # Load next src QW
extql r16, r18, r16 # Get last part of prior one
or r28, r19, r28 # Combine pieces
stq r28, 8(r21) # Store prior dst QW
addq r21, 24, r21 # Update dst pointer
extqh r27, r18, r28 # Get first part of a src QW
ldq_u r19, 32(r18) # Load next src QW
extql r27, r18, r27 # Get last part of prior one
or r28, r16, r28 # Combine pieces
stq r28, -8(r21) # Store prior dst QW
addq r18, 24, r18 # Update src pointer
extqh r19, r18, r28 # Get first part of a src QW
bge r20, unrol2 # Repeat as needed
addq r20, 16, r16 # How many whole quadwords left?
br r31, short2 # Go handle leftovers
nop
# Must move in reverse order because of overlap.
# r16 = dst address
# r17 = remaining length for return
# r18 = src address
# r19
# r20 = len-4 (>= 0)
# r21
# r27
# r28
# Not yet LW-granularity...
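#
# A hedged C sketch of the backward copy (whole quadwords only; the code
# below also merges partial QWs at both ends and handles src/dst
# alignment mismatch):
#
#   while (len >= 8) {                 /* copy high addresses first */
#       len -= 8;
#       *(uint64_t *)(dst + len) = *(uint64_t *)(src + len);
#   }
#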
reverse:
subq r20, 4, r20 # This code expects len-8
addq r20, r18, r18 # Point to end-8 of source
addq r20, r16, r19 # Point to end-8 of destination
and r19, 7, r21 # Is destination aligned?
ldq_u r28, 7(r18) # Get source QW
addq r19, 8, r16 # Point to end of dst for return
bne r21, rpart # Skip if partial write needed
and r18, 7, r27 # Is source aligned too?
beq r27, ralign # Skip if so
ldq_u r21, (r18) # Handle aligned dst, unaligned src
subq r20, 8, r20
extqh r28, r18, r28
extql r21, r18, r27
br r31, rwhole
rmis: ldq_u r21, (r18) # Load same or preceding src QW
extqh r28, r18, r28 # Get last part of source to store
mskqh r27, r16, r27 # Keep high-address part of dst
extql r21, r18, r21
subq r20, 8, r20 # How many more whole QW's?
or r21, r28, r28
ldq_u r21, (r18) # Reload source QW
mskql r28, r16, r28 # Trim source to length
rwhole: blt r20, rlast2 # Skip if no more whole QW's
rloop2: or r28, r27, r28 # Combine pieces
stq r28, (r19)
rent2: extqh r21, r18, r27
ldq_u r21, -8(r18)
subq r20, 8, r20
subq r19, 8, r19
extql r21, r18, r28
subq r18, 8, r18
bge r20, rloop2
rlast2: and r20, 7, r20
beq r20, rdone2
or r28, r27, r28
subq r18, r20, r27
stq r28, (r19)
rl2ent: subq r31, r20, r20
ldq_u r27, (r27)
extqh r21, r18, r21
ldq r28, -8(r19)
subq r19, 8, r19
extql r27, r18, r27
mskql r28, r20, r28
or r27, r21, r27
mskqh r27, r20, r27
and r20, 4, r21 # Ending in high LW?
bne r21, rdone3 # Only longword store at the end
rdone2: or r28, r27, r28
stq r28, (r19)
ret r31, (r26)
rdone3: or r28, r27, r28
extql r28, 4, r28
stl r28, 4(r19)
ret r31, (r26)
rpart: ldq_u r27, 7(r19) # Get dst QW
subq r21, 8, r21 # Get negative of bytes not moved
subq r18, r21, r18 # From src-8, get src after partial
subq r20, r21, r20 # Adjust length for partial move
subq r19, r21, r19 # Adjust dst pointer
addq r21, 4, r21 # End alignment - 4
ble r21, r_lw # Only storing the low longword?
and r18, 7, r21 # Src alignment now matching dst?
bne r21, rmis # Go back if not
mskql r28, r16, r28 # Keep low addresses of src QW
mskqh r27, r16, r27 # Keep high address of dst QW
ralign: subq r20, 8, r20 # How many more whole QW's?
or r27, r28, r28 # Combine
blt r20, rlast1 # Skip if this is the end
rloop1: stq r28, (r19) # Store one QW
rent1: subq r20, 8, r20 # Decrement length
ldq r28, -8(r18) # Load preceding QW
subq r19, 8, r19 # Decrement dst pointer
subq r18, 8, r18 # Decrement src pointer
bge r20, rloop1 # Repeat for each whole QW
rlast1: and r20, 7, r20 # How many odd bytes?
beq r20, rdone # Skip if none
ldq r27, -8(r18) # Get another source QW
subq r31, r20, r20 # Get byte # to end at
stq r28, (r19)
rl_ent: ldq r28, -8(r19)
subq r19, 8, r19 # Adjust dst pointer again
mskqh r27, r20, r27 # Keep top of src QW
and r20, 4, r21 # Ending in high LW?
mskql r28, r20, r28 # Keep bottom of dst QW
bne r21, rdone4 # Only longword store at the end
or r27, r28, r28 # Combine
rdone: stq r28, (r19) # Store last QW
ret r31, (r26)
rdone4: or r27, r28, r28 # Combine
extql r28, 4, r28 # Get high part
stl r28, 4(r19) # Store last LW
ret r31, (r26)
r_lw: and r18, 7, r21 # Src alignment now matching dst?
bne r21, rmislw # Go back if not
mskql r28, r16, r28 # Keep low addresses of src QW
mskqh r27, r16, r27 # Keep high address of dst QW
subq r20, 8, r20 # How many more whole QW's?
or r27, r28, r28 # Combine
blt r20, rlast1_lw # Skip if this is the end
stl r28, (r19) # Store one QW
br r31, rent1
rlast1_lw:
and r20, 7, r20 # How many odd bytes?
ldq r27, -8(r18) # Get another source QW
subq r31, r20, r20 # Get byte # to end at
stl r28, (r19)
br rl_ent
rmislw: ldq_u r21, (r18) # Load same or preceding src QW
extqh r28, r18, r28 # Get last part of source to store
mskqh r27, r16, r27 # Keep high-address part of dst
extql r21, r18, r21
subq r20, 8, r20 # How many more whole QW's?
or r21, r28, r28
ldq_u r21, (r18) # Reload source QW
mskql r28, r16, r28 # Trim source to length
blt r20, rlast2_lw # Skip if no more whole QW's
or r28, r27, r28 # Combine pieces
stl r28, (r19)
br r31, rent2
rlast2_lw:
and r20, 7, r20
or r28, r27, r28
subq r18, r20, r27
stl r28, (r19)
br r31, rl2ent
.set at
.set reorder
.end _OtsMoveMinimum