summaryrefslogtreecommitdiffstats
path: root/private/ntos/nthals/halalpha/errframe.h
blob: ced43eb08199c44b8023972be60882a3902010e1 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
/*++

Copyright (c) 1995,1996  Digital Equipment Corporation

Module Name:

    errframe.h

Abstract:

    Definitions for both the correctable and uncorrectable error
    frames.

Author:

    Joe Notarangelo  10-Mar-1995
    Chao Chen        24-Apr-1995

Environment:

    Kernel mode only.

Revision History:

0.1   28-Feb-1995 Joe Notarangelo   Initial version.

0.2   10-Mar-1995 Joe Notarangelo   Incorporate initial review comments from
                                    6-Mar-95 review with: C. Chen, S. Jenness,
                                    Bala, E. Rehm. 

0.3   24-Apr-1995 Chao Chen         Made into .h file for inclusion by other
                                    modules.

0.4   16-Apr-1996 Gene Morgan       Add fields to SystemError struct for PSU,
                                    Watchdog events.

--*/

#ifndef ERRFRAME_H
#define ERRFRAME_H

#include "errplat.h"

/* 
 *
 * Common defines.
 *
 */

//
// Version of the error frame layout.  The ERROR_FRAME description in this
// file gives the current version number as 1.
//

#define ERROR_FRAME_VERSION 0x1

//
// Address space codes and memory error source codes.
//
// UNIDENTIFIED, MEMORY_SPACE and IO_SPACE match the AddressSpace encodings
// (00, 01, 10) and PROCESSOR_CACHE through SYSTEM_MEMORY match the
// MemoryErrorSource encodings (010, 011, 100, 101) listed in the flags
// structures in this file.  IO_SPACE and PROCESSOR_CACHE share the value
// 0x2 because they encode two different fields.
//

#define UNIDENTIFIED     0x0
#define MEMORY_SPACE     0x1
#define IO_SPACE         0x2
#define PROCESSOR_CACHE  0x2
#define SYSTEM_CACHE     0x3
#define PROCESSOR_MEMORY 0x4
#define SYSTEM_MEMORY    0x5

//
// NOTE(review): presumably tested against MemoryErrorSource to choose
// between the CacheError (01x) and MemoryError (10x) layouts of
// EXTENDED_ERROR -- confirm against the callers.
//

#define CACHE_ERROR_MASK    0x2
#define MEMORY_ERROR_MASK   0x4

//
// Transfer/operation type codes.
// NOTE(review): presumably the values stored in the TransferType fields of
// EXTENDED_ERROR -- confirm against the callers.
//

#define BUS_DMA_READ        0x1
#define BUS_DMA_WRITE       0x2
#define BUS_DMA_OP          0x3
#define BUS_IO_READ         0x4
#define BUS_IO_WRITE        0x5
#define BUS_IO_OP           0x6
#define TLB_MISS_READ       0x7
#define TLB_MISS_WRITE      0x8
#define VICTIM_WRITE        0x9

//
// Number of elements in the ErrorString array of UNCORRECTABLE_ERROR.
//

#define MAX_UNCORRERR_STRLEN  128

//
// NOTE(review): presumably the value stored in ERROR_FRAME.Signature to
// mark a buffer as an error frame -- confirm against the callers.
//

#define ERROR_FRAME_SIGNATURE   0xE22F2F2A  // ERRORFRA R=2 (18%16)

/* 
 *
 * Definitions for the correctable error frame.
 *
 */

//
// Correctable Error Flags
//    

typedef struct _CORRECTABLE_FLAGS{

  //
  // Bit flags that qualify a correctable error report: the address space
  // and memory source of the error, which optional parts of the
  // CORRECTABLE_ERROR frame hold valid data, whether the driver should
  // scrub the error, and whether a further error report was lost.
  // The bit-fields declared here total 20 bits.
  //

  //
  // Address space
  //  Unidentified: 00
  //  Memory Space: 01
  //  I/O Space:    10
  //  Reserved:     11
  //

  ULONGLONG AddressSpace: 2;

  //
  // Error Interpretation Validity.
  //
  // Each flag is 1 when the corresponding part of the CORRECTABLE_ERROR
  // frame is valid and 0 when it is not.
  // ServerManagementInformationValid covers the server management
  // information held in the extended error information structure.
  //

  ULONGLONG ServerManagementInformationValid: 1;
  ULONGLONG PhysicalAddressValid: 1;
  ULONGLONG ErrorBitMasksValid: 1;
  ULONGLONG ExtendedErrorValid: 1;
  ULONGLONG ProcessorInformationValid: 1;
  ULONGLONG SystemInformationValid: 1;

  //
  // Memory Space Error Source:
  //
  //  Unidentified:    000
  //  ProcessorCache:  010
  //  SystemCache:     011
  //  ProcessorMemory: 100
  //  SystemMemory:    101
  //
  // The field is declared 4 bits wide although the encodings listed
  // above are written with 3 bits.
  //

  ULONGLONG MemoryErrorSource: 4;

  //
  // Driver Actions.
  //
  // ScrubError == 1 tells the driver to scrub the error; 0 tells the
  // driver it must not scrub the error.
  //

  ULONGLONG ScrubError: 1;

  //
  // Lost errors.
  //
  // LostCorrectable == 1 means a correctable error report was lost.
  // LostAddressSpace and LostMemoryErrorSource are valid only when
  // LostCorrectable == 1.
  //

  ULONGLONG LostCorrectable: 1;
  ULONGLONG LostAddressSpace: 2;
  ULONGLONG LostMemoryErrorSource: 4;

} CORRECTABLE_FLAGS, *PCORRECTABLE_FLAGS;

  //
  // Description of CORRECTABLE_FLAGS structure:
  //
  // AddressSpace
  //
  //      Identifies the system address space that was the source of the
  //      correctable error.
  //
  // PhysicalAddressValid
  //
  //      Indicates if the physical address in the CORRECTABLE_ERROR 
  //      structure is valid.  A value of 1 indicates the physical address
  //      is valid, a value of 0 indicates the physical address is not
  //      valid.
  //
  // ErrorBitMasksValid
  //
  //      Indicates if the error bit mask fields in the CORRECTABLE_ERROR 
  //      structure are valid.  A value of 1 indicates the DataBitErrorMask
  //      and the CheckBitErrorMask are valid,  a value of 0 indicates they
  //      are not valid.
  //
  // ExtendedErrorValid
  //
  //      Indicates if the extended error information structure in the 
  //      CORRECTABLE_ERROR structure is valid.  A value of 1 indicates the 
  //      extended error information structure is valid, a value of 0 
  //      indicates that it is not.
  //
  // ProcessorInformationValid
  //
  //      Indicates if the raw processor information pointer in the 
  //      CORRECTABLE_ERROR structure is valid.  A value of 1 indicates the 
  //      processor information is valid, a value of 0 indicates it is not.
  //
  // SystemInformationValid
  //
  //      Indicates if the raw system information pointer in the 
  //      CORRECTABLE_ERROR structure is valid.  A value of 1 indicates the 
  //      system information is valid, a value of 0 indicates it is not.
  //
  // ServerManagementInformationValid
  //
  //      Indicates that the server management information in the extended
  //      error information structure is valid.  The server management
  //      information relays information about failed fans or high
  //      temperature in the system.
  //
  //
  // MemoryErrorSource
  //
  //      Identifies the source of a memory error as either main memory
  //      or cache for either a system or processor.
  //
  // ScrubError
  //
  //      Instructs the driver to scrub the correctable error.  If the
  //      value is 1 the driver should scrub the error, if the value is
  //      0 the driver must not scrub the error.
  //
  // LostCorrectable
  //
  //      Identifies if a lost correctable error has been reported.  A
  //      lost error is an error that was reported by the hardware while
  //      correctable error handling for a previous error was in progress.
  //      A value of 1 indicates that a correctable error report was lost.
  //
  // LostAddressSpace
  //
  //      Identifies the system address space that was the source of the
  //      lost correctable error.  Valid only if LostCorrectable == 1.
  //
  // LostMemoryErrorSource
  //
  //      Identifies the source of the lost memory error as either main
  //      memory or cache for either a system or processor.  Valid only if
  //      LostCorrectable == 1.
  //

//
// Processor information.
//

typedef struct _PROCESSOR_INFO{

  //
  // Identifies one processor: its type and revision, plus its physical
  // and logical numbers.
  //

  ULONG ProcessorType;            // Processor type (eg. 21064, 21066, 21164).
  ULONG ProcessorRevision;        // Revision number of the processor.
  ULONG PhysicalProcessorNumber;  // Number per the hardware specifications.
  ULONG LogicalProcessorNumber;   // Number assigned to the processor by NT.

} PROCESSOR_INFO, *PPROCESSOR_INFO;

  //
  // Description of PROCESSOR_INFO structure:
  //
  // ProcessorType
  //
  //      Identifies the type of processor running on the system 
  //      (eg. 21064, 21066, 21164).  Note that the type of processor
  //      is expected to be consistent across the system for MP machines.
  //
  // ProcessorRevision
  //
  //      Identifies the revision number of the processor running on
  //      the system.  Note that the revision is expected to be consistent
  //      across the system for MP machines.
  //
  // PhysicalProcessorNumber
  //
  //      The physical processor number as numbered in the hardware 
  //      specifications.
  //
  // LogicalProcessorNumber
  //
  //      The logical processor number assigned to the processor by NT.
  //

//
// System Information.
//

typedef struct _SYSTEM_INFORMATION{

  //
  // Describes the system that reported the error: system type, software
  // and firmware revisions, and system/module variant, revision and
  // serial number.
  //

  UCHAR SystemType[8];            // System type string (eg. "Sable", "Gamma");
                                  // format and value are system-specific.
  ULONG ClockSpeed;               // NOTE(review): units are not stated in
                                  // this file -- confirm against the writer.
  ULONG OsRevisionId;             // OS revision executing on the system.
  ULONG PalMajorVersion;          // Major version of the PAL code.
  ULONG PalMinorVersion;          // Minor version of the PAL code.
  UCHAR FirmwareRevisionId[16];   // Firmware revision executing on the system.
  ULONG SystemVariant;            // Distinguishes variants of a system type;
                                  // interpretation is system-specific.
  ULONG SystemRevision;           // Distinguishes revisions of a system type;
                                  // interpretation is system-specific.
  UCHAR SystemSerialNumber[16];   // System serial number.
  ULONG ModuleVariant;            // NOTE(review): the Module* fields are not
  ULONG ModuleRevision;           // described in this file; presumably the
  ULONG ModuleSerialNumber;       // module-level analogues of the System*
                                  // fields -- confirm.
} SYSTEM_INFORMATION, *PSYSTEM_INFORMATION;

  //
  // Description of SYSTEM_INFORMATION structure:
  //
  // SystemType
  //
  //      Identifies the type of system that reported the error 
  //      (eg. "Sable", "Gamma").  The actual format and value of the
  //      SystemType string is system-specific.
  //
  // OsRevisionId
  //
  //      A numeric value that identifies the OS revision executing
  //      on the system that reported the fault.
  //
  // PalMajorVersion, PalMinorVersion
  //
  //      Numeric values that identify the major and minor version of the
  //      PAL code executing on the system that reported the fault.
  //
  // FirmwareRevisionId
  //
  //      A 16-byte value that identifies the firmware revision executing
  //      on the system that reported the fault.
  //
  // SystemVariant
  //
  //      A numeric value used to distinguish variants of the same system
  //      type.  The values and their interpretation are system_specific.
  //
  // SystemRevision
  //
  //      A numeric value used to distinguish revisions of the same system
  //      type.  The values and their interpretation are system_specific.
  //
  //      

//
// Extended Error Information.
//

typedef union _EXTENDED_ERROR{

  //
  // Extended interpretation of an error.  Exactly one member applies,
  // selected by the AddressSpace and MemoryErrorSource fields of the
  // flags structure:
  //
  //   MemorySpace,  MemoryErrorSource = 01x   CacheError
  //   MemorySpace,  MemoryErrorSource = 10x   MemoryError
  //   IoSpace                                 IoError
  //   Unidentified, MemoryErrorSource = 0x0   SystemError
  //
  // NOTE(review): the Reserved members appear to pad every variant to
  // the same size -- confirm before changing any variant.
  //

  //
  // Cache error.  Each Flags bit set to 1 marks the matching field
  // (CacheLevel, CacheBoardNumber, CacheSimm) as valid.
  //

  struct{
    struct{
      ULONG CacheLevelValid: 1;
      ULONG CacheBoardValid: 1;
      ULONG CacheSimmValid: 1;
    } Flags;
    PROCESSOR_INFO ProcessorInfo;   // Processor associated with the error.
    ULONG CacheLevel;               // Primary cache is level 1, secondary 2...
    ULONG CacheBoardNumber;         // Board number of the failing cache.
    ULONG CacheSimm;                // Cache SIMM that caused the error.
    ULONG TransferType;
    ULONG Reserved;
  } CacheError;

  //
  // Memory error.  Each Flags bit set to 1 marks the matching field
  // (MemoryBoard, MemorySimm) as valid.
  //

  struct{
    struct{
      ULONG MemoryBoardValid: 1;
      ULONG MemorySimmValid: 1;
    } Flags;
    PROCESSOR_INFO ProcessorInfo;   // Processor associated with the error.
    ULONG MemoryBoard;              // Board number of the failing memory.
    ULONG MemorySimm;               // Memory SIMM that caused the error.
    ULONG TransferType;
    ULONG Reserved[2];
  } MemoryError;

  //
  // I/O error: the bus that caused the error.
  //

  struct{
    INTERFACE_TYPE Interface;       // Bus interface type (eg. PCI).
    ULONG BusNumber;                // Number of the bus.
    PHYSICAL_ADDRESS BusAddress;    // Bus address.
    ULONG TransferType;
    ULONG Reserved[5];
  } IoError;

  //
  // System (server management) error: fan, temperature sensor, power
  // supply and watchdog events.  Each Flags bit corresponds by name to
  // one of the fields that follow it.
  //

  struct{
    struct{
      ULONG FanNumberValid: 1;
      ULONG TempSensorNumberValid: 1;
      ULONG PowerSupplyNumberValid: 1;
      ULONG WatchDogExpiredValid: 1;
    } Flags;
    ULONG FanNumber;
    ULONG TempSensorNumber;
    ULONG PowerSupplyNumber;
    ULONG WatchdogExpiration;
    ULONG Reserved[5];
  } SystemError;

} EXTENDED_ERROR, *PEXTENDED_ERROR;

  //
  // Description of EXTENDED_ERROR union:
  //
  // The EXTENDED_ERROR union has different interpretation depending
  // upon the AddressSpace and MemoryErrorSource fields of the
  // CORRECTABLE_FLAGS structure according to the following table:
  //
  // 1. AddressSpace=MemorySpace 
  //    MemoryErrorSource = 01x       use CacheError structure
  //
  // 2. AddressSpace=MemorySpace
  //    MemoryErrorSource = 10x       use MemoryError structure
  //
  // 3. AddressSpace=IoSpace          use IoError
  //
  // 4. AddressSpace=Unidentified     use SystemError
  //    MemoryErrorSource = 0x0       (note: ServerManagementInformationValid
  //                                         should be set)
  //
  //
  // CacheError.Flags
  //
  //      Identifies which fields of the CacheError structure are valid.
  //      CacheLevelValid = 1 indicates CacheLevel is valid.
  //      CacheBoardValid = 1 indicates CacheBoardNumber is valid.
  //      CacheSimmValid = 1 indicates CacheSimm is valid.
  //
  // CacheError.ProcessorInfo
  //
  //      Identifies the processor associated with the error.  Most 
  //      frequently will identify the processor that experienced and 
  //      reported the error.  However, it is possible that the processor 
  //      that is reporting has experienced the error from another 
  //      processor's cache.  This field is valid only if the 
  //      MemoryErrorSource = 010 .
  //
  // CacheError.CacheLevel
  //
  //      Identifies the level of the cache that caused the error.  Primary
  //      caches are Level 1, Secondary are Level 2, etc..  This field
  //      only valid if CacheError.Flags.CacheLevelValid == 1.
  //
  // CacheError.CacheBoardNumber
  //
  //      Identifies the board number of the cache that caused the error.
  //      This field only valid if
  //      CacheError.Flags.CacheBoardValid == 1.
  //
  // CacheError.CacheSimm
  //
  //      Identifies the Cache Simm that caused the error.
  //      This field only valid if CacheError.Flags.CacheSimmValid == 1.
  //
  //
  // MemoryError.Flags
  //
  //      Identifies which fields of the MemoryError structure are valid.
  //      MemoryBoardValid = 1 indicates MemoryBoard is valid.
  //      MemorySimmValid = 1 indicates MemorySimm is valid.
  //
  //
  // MemoryError.ProcessorInfo
  //
  //      Identifies the processor associated with the error.  Most
  //      frequently will identify the processor that experienced and
  //      reported the error.  However, it is possible that the processor
  //      that is reporting has experienced the error from another
  //      processor's memory.  This field is valid only if the
  //      MemoryErrorSource = 100 .
  //
  // MemoryError.MemoryBoard
  //
  //      Identifies the board number of the memory that caused the error.
  //      This field only valid if MemoryError.Flags.MemoryBoardValid == 1.
  //
  // MemoryError.MemorySimm
  //
  //      Identifies the memory SIMM that caused the error.  
  //      This field only valid if MemoryError.Flags.MemorySimmValid == 1.
  //
  //
  // IoError.Interface
  //
  //      Identifies the bus interface type (eg. PCI) of the bus that caused
  //      the correctable error.
  //
  // IoError.BusNumber
  //
  //      Identifies the bus number of the bus that caused the correctable 
  //      error.
  //
  // IoError.BusAddress
  //
  //      Identifies the bus address of the bus that caused the correctable 
  //      error.
  //

//
// Correctable Error Frame.
//

typedef struct _CORRECTABLE_ERROR{

  //
  // Correctable error frame.  Flags states which of the optional fields
  // below hold valid data.
  //

  CORRECTABLE_FLAGS Flags;              // Validity and classification flags.
  ULONGLONG PhysicalAddress;            // Physical CPU address of the quadword
                                        // that caused the error report.
  ULONGLONG DataBitErrorMask;           // 1 bits mark data bits in error.
  ULONGLONG CheckBitErrorMask;          // 1 bits mark check bits in error.
  EXTENDED_ERROR ErrorInformation;      // Interpretation of the error.
  PROCESSOR_INFO ReportingProcessor;    // Processor that reported the error.
  SYSTEM_INFORMATION System;            // System the error was reported for.
  ULONG RawProcessorInformationLength;  // Length in bytes of the data at
                                        // RawProcessorInformation.
  PVOID RawProcessorInformation;        // Processor-specific raw error data.
  ULONG RawSystemInformationLength;     // Length in bytes of the data at
                                        // RawSystemInformation.
  PVOID RawSystemInformation;           // System/ASIC-specific raw error data.
  ULONG Reserved;

} CORRECTABLE_ERROR, *PCORRECTABLE_ERROR;

  //
  // Description of CORRECTABLE_ERROR structure:
  //
  // Flags
  //
  //      The flags describe the various aspects of the error report.  The
  //      CORRECTABLE_FLAGS structure is described above.
  //
  // PhysicalAddress
  //
  //      The physical CPU address of the quadword that caused the correctable
  //      error report.  This value is optional, its validity is indicated
  //      in the Flags.
  //
  // DataBitErrorMask
  //
  //      A mask that describes which data bits in the corrected word were
  //      in error.  A value of one in the mask indicates that the
  //      corresponding bit in the data word was in error.
  //
  // CheckBitErrorMask
  //
  //      A mask that describes which check bits in the corrected word were
  //      in error.  A value of one in the mask indicates that the
  //      corresponding bit in the check bits was in error.
  //
  // ErrorInformation
  //
  //      A structure that describes interpretation of the error.  The values
  //      in the structure are optional, the EXTENDED_ERROR structure is
  //      described above.
  //
  // ReportingProcessor
  //
  //      A structure that describes the processor type on the system and
  //      the processor that reported the error.  PROCESSOR_INFO structure
  //      is described above.
  //      
  // RawProcessorInformationLength
  //
  //      The length of the raw processor error information structure in
  //      bytes.
  //
  // RawProcessorInformation
  //
  //      A pointer to the raw processor error information structure.  The
  //      definition of the structure is processor-specific.  The definitions
  //      for the known processors is defined in Appendix A below.
  //
  // System
  //
  //      A structure that describes the type of system for which the
  //      error was reported.  SYSTEM_INFORMATION structure is described above.
  //
  // RawSystemInformationLength
  //
  //      The length of the raw system error information structure in
  //      bytes.
  //
  // RawSystemInformation
  //
  //      A pointer to the raw system error information structure.  The
  //      definition of the structure is system/ASIC-specific.  The 
  //      definitions for the known systems/ASICs is defined in Appendix B.
  //
  //


/* 
 *
 * Definitions for the uncorrectable error frame.
 *
 */

//
// Uncorrectable Error Flags.
//

typedef struct _UNCORRECTABLE_FLAGS{

  //
  // Bit flags that qualify an uncorrectable error report.  The fields
  // mirror CORRECTABLE_FLAGS, except that ErrorStringValid takes the
  // place of ServerManagementInformationValid and there are no scrub or
  // lost-error fields.
  //

  //
  // Address space
  //  Unidentified: 00
  //  Memory Space: 01
  //  I/O Space:    10
  //  Reserved:     11
  //

  ULONGLONG AddressSpace: 2;

  //
  // Error Interpretation Validity.
  //
  // NOTE(review): ErrorStringValid presumably reports whether ErrorString
  // in UNCORRECTABLE_ERROR is valid, and the remaining flags presumably
  // carry the same meaning as their CORRECTABLE_FLAGS namesakes -- confirm.
  //

  ULONGLONG ErrorStringValid: 1;
  ULONGLONG PhysicalAddressValid: 1;
  ULONGLONG ErrorBitMasksValid: 1;
  ULONGLONG ExtendedErrorValid: 1;
  ULONGLONG ProcessorInformationValid: 1;
  ULONGLONG SystemInformationValid: 1;

  //
  // Memory Space Error Source:
  //
  //  Unidentified:    000
  //  ProcessorCache:  010
  //  SystemCache:     011
  //  ProcessorMemory: 100
  //  SystemMemory:    101
  //
  // The field is declared 4 bits wide although the encodings listed
  // above are written with 3 bits.
  //

  ULONGLONG MemoryErrorSource: 4;

} UNCORRECTABLE_FLAGS, *PUNCORRECTABLE_FLAGS;

  //
  // The extended error information, processor information and system
  // information structures are identical for correctable and uncorrectable
  // errors.  The rules for printing the failing FRU are also identical
  // to the rules for the correctable errors.
  //

//
// Uncorrectable Error Frame.
//

typedef struct _UNCORRECTABLE_ERROR{

  //
  // Uncorrectable error frame.  It has the same members as
  // CORRECTABLE_ERROR, with ErrorString inserted after Flags.  The
  // extended error, processor and system information structures are the
  // ones used for correctable errors.
  //

  UNCORRECTABLE_FLAGS Flags;              // Validity and classification flags.
  CHAR ErrorString[MAX_UNCORRERR_STRLEN]; // NOTE(review): presumably a text
                                          // description of the error, valid
                                          // when Flags.ErrorStringValid is
                                          // set -- confirm.
  ULONGLONG PhysicalAddress;
  ULONGLONG DataBitErrorMask;
  ULONGLONG CheckBitErrorMask;
  EXTENDED_ERROR ErrorInformation;
  PROCESSOR_INFO ReportingProcessor;
  SYSTEM_INFORMATION System;
  ULONG RawProcessorInformationLength;
  PVOID RawProcessorInformation;
  ULONG RawSystemInformationLength;
  PVOID RawSystemInformation;
  ULONG Reserved;

} UNCORRECTABLE_ERROR, *PUNCORRECTABLE_ERROR;


/* 
 *
 * Generic definitions for the error frame.
 *
 */

//
// Generic error frame.
//

//
// Identifies which kind of error report an ERROR_FRAME holds: correctable
// or uncorrectable.  The enumerator names are the same as the names of
// the two members of the union inside ERROR_FRAME.
//
typedef enum _FRAME_TYPE{
  CorrectableFrame,
  UncorrectableFrame
} FRAME_TYPE, *PFRAME_TYPE;

typedef struct _ERROR_FRAME{

  //
  // Generic error frame: a common header followed by either a
  // correctable or an uncorrectable error frame, as given by FrameType.
  //

  ULONG Signature;          // Needed to make sure that the buffer is in fact
                            // an error frame.
  ULONG      LengthOfEntireErrorFrame;  // NOTE(review): presumably in bytes;
                                        // confirm whether it includes the
                                        // raw information buffers.
  FRAME_TYPE FrameType;                 // Correctable or uncorrectable.
  ULONG      VersionNumber;             // Version of the error structure.
  ULONG      SequenceNumber;            // Identifies this error frame.
  ULONGLONG  PerformanceCounterValue;   // System performance counter value
                                        // when the error was reported.

  //
  // Unnamed union: the two members share the same storage.
  //

  union {
    CORRECTABLE_ERROR CorrectableFrame;
    UNCORRECTABLE_ERROR UncorrectableFrame;
  };

} ERROR_FRAME, *PERROR_FRAME;

  //
  // Description of the generic error frame structure:
  //
  // FrameType
  //
  //      Specify which frame type we have.  Either correctable or
  //      uncorrectable.
  //
  // VersionNumber
  //
  //      Defines the version of the error structure.  Current version
  //      number = 1.
  //
  // SequenceNumber
  //
  //      A number that identifies a particular error frame.
  //
  // PerformanceCounterValue
  //
  //      The value of the system performance counter when the error was
  //      reported.  The value is determined during error processing by the
  //      HAL and so may be captured significantly after the hardware
  //      detected the error.  The value cannot be used for fine grained
  //      timing of when errors occurred but can be used for coarse grained
  //      timing and approximate timing.
  //
  // CorrectableFrame
  //
  //      Shared common area for a correctable frame.
  //
  // UncorrectableFrame
  // 
  //      Shared common area for an uncorrectable frame.
  //

#endif //ERRFRAME_H