Skip to content

Commit 63f66e8

Browse files
Add support for ras l3 fabric errors
Related-To: LOCI-3966
Signed-off-by: Mayank Raghuwanshi <mayank.raghuwanshi@intel.com>
Source: 065232e
1 parent e7db46b commit 63f66e8

File tree

3 files changed: +36 -29 lines changed

3 files changed: +36 -29 lines changed

level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,20 +20,20 @@ static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToL
2020
{ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS,
2121
{"eu-attention"}},
2222
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
23-
{"soc-fatal-mdfi-east", "soc-fatal-mdfi-south", "soc-fatal-mdfi-west",
24-
"soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0",
23+
{"soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0",
2524
"soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit",
2625
"sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown",
2726
"gsc-nonfatal-aon-parity", "gsc-nonfatal-rom-parity", "gsc-nonfatal-fuse-crc-check",
2827
"gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det",
2928
"gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout"}},
3029
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
31-
{"fatal-fpu", "fatal-l3-fabric", "fatal-eu-grf", "fatal-sampler", "fatal-slm",
30+
{"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm",
3231
"fatal-guc", "fatal-eu-ic", "fatal-subslice"}},
3332
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS,
3433
{"driver-object-migration", "driver-engine-other", "driver-ggtt",
3534
"driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
36-
"driver-rps"}}};
35+
"driver-rps"}},
36+
{ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS, {"soc-fatal-mdfi-east", "soc-fatal-mdfi-south", "soc-fatal-mdfi-west", "fatal-l3-fabric", "soc-fatal-cd0-mdfi"}}};
3737

3838
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
3939
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,

level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -58,25 +58,28 @@ constexpr uint64_t driverEngineOther = 3u;
5858
constexpr uint64_t initialUncorrectableCacheErrors = 2u;
5959
constexpr uint64_t initialEngineReset = 2u;
6060
constexpr uint64_t initialProgrammingErrors = 7u;
61-
constexpr uint64_t initialUncorrectableNonComputeErrors = 8u;
62-
constexpr uint64_t initialUncorrectableComputeErrors = 10u;
61+
constexpr uint64_t initialUncorrectableNonComputeErrors = 3u;
62+
constexpr uint64_t initialUncorrectableFabricErrors = 8u;
63+
constexpr uint64_t initialUncorrectableComputeErrors = 7u;
6364
constexpr uint64_t initialCorrectableComputeErrors = 6u;
6465
constexpr uint64_t initialUncorrectableDriverErrors = 5u;
6566

6667
constexpr uint64_t initialUncorrectableCacheErrorsTile0 = 2u;
6768
constexpr uint64_t initialCorrectableCacheErrorTile0 = 2u;
6869
constexpr uint64_t initialEngineResetTile0 = 2u;
6970
constexpr uint64_t initialProgrammingErrorsTile0 = 7u;
70-
constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 15u;
71+
constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 10u;
7172
constexpr uint64_t initialCorrectableNonComputeErrorsTile0 = 2u;
72-
constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 11u;
73+
constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 8u;
74+
constexpr uint64_t initialUncorrectableFabricErrorsTile0 = 8u;
7375
constexpr uint64_t initialCorrectableComputeErrorsTile0 = 6u;
7476
constexpr uint64_t initialUncorrectableDriverErrorsTile0 = 5u;
7577
constexpr uint64_t initialUncorrectableCacheErrorsTile1 = 1u;
7678
constexpr uint64_t initialEngineResetTile1 = 4u;
7779
constexpr uint64_t initialProgrammingErrorsTile1 = 5u;
7880
constexpr uint64_t initialCorrectableComputeErrorsTile1 = 7u;
79-
constexpr uint64_t initialUncorrectableNonComputeErrorsTile1 = 5u;
81+
constexpr uint64_t initialUncorrectableNonComputeErrorsTile1 = 3u;
82+
constexpr uint64_t initialUncorrectableFabricErrorsTile1 = 2u;
8083
constexpr uint64_t initialUncorrectableComputeErrorsTile1 = 6u;
8184
constexpr uint64_t initialUncorrectableDriverErrorsTile1 = 4u;
8285
constexpr uint64_t timeStamp = 1000u;
@@ -132,11 +135,11 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp {
132135
data[5] = driverGgtt;
133136
data[6] = driverRps;
134137
data[7] = 0;
135-
data[8] = 0;
136-
data[9] = fatalEuErrorCount;
137-
data[10] = socFatalMdfiEastCount;
138-
data[11] = socFatalPsfCsc0Count;
139-
data[12] = fatalTlb;
138+
data[8] = fatalEuErrorCount;
139+
data[9] = socFatalPsfCsc0Count;
140+
data[10] = fatalTlb;
141+
data[11] = 0;
142+
data[12] = socFatalMdfiEastCount;
140143
return 0;
141144
}
142145

@@ -159,14 +162,14 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp {
159162
data[5] = driverGgtt;
160163
data[6] = driverRps;
161164
data[7] = 0;
162-
data[8] = 0;
163-
data[9] = fatalSubslice;
164-
data[10] = fatalEuErrorCount;
165-
data[11] = socFatalMdfiEastCount;
166-
data[12] = socFatalPsfCsc0Count;
167-
data[13] = nonFatalGscAonParity;
168-
data[14] = nonFataGscSelfmBist;
169-
data[15] = fatalTlb;
165+
data[8] = fatalSubslice;
166+
data[9] = fatalEuErrorCount;
167+
data[10] = socFatalPsfCsc0Count;
168+
data[11] = nonFatalGscAonParity;
169+
data[12] = nonFataGscSelfmBist;
170+
data[13] = fatalTlb;
171+
data[14] = 0;
172+
data[15] = socFatalMdfiEastCount;
170173
return 0;
171174
}
172175

@@ -187,10 +190,10 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp {
187190
data[4] = driverMigration;
188191
data[5] = driverEngineOther;
189192
data[6] = fatalGucErrorCountTile1;
190-
data[7] = socFatalMdfiWestCountTile1;
191-
data[8] = socFatalPunitTile1;
192-
data[9] = fatalIdiParityErrorCountTile1;
193-
data[10] = fatalL3BankTile1;
193+
data[7] = socFatalPunitTile1;
194+
data[8] = fatalIdiParityErrorCountTile1;
195+
data[9] = fatalL3BankTile1;
196+
data[10] = socFatalMdfiWestCountTile1;
194197
return 0;
195198
}
196199

level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,10 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuc
192192
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
193193
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
194194
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
195-
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
195+
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
196196
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
197197
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
198+
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS], socFatalMdfiEastCount + initialUncorrectableFabricErrors);
198199
}
199200
}
200201
}
@@ -229,9 +230,10 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterCl
229230
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
230231
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
231232
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
232-
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
233+
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
233234
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
234235
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
236+
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS], socFatalMdfiEastCount + initialUncorrectableFabricErrors);
235237
}
236238
}
237239
correctable = true;
@@ -655,9 +657,10 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF
655657
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0);
656658
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0);
657659
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0);
658-
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0);
660+
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0);
659661
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
660662
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0);
663+
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS], socFatalMdfiEastCount + initialUncorrectableFabricErrorsTile0);
661664
} else if (handleIndex == 2u) {
662665
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); // No. of correctable error type for subdevice 1
663666
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
@@ -674,6 +677,7 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF
674677
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1);
675678
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
676679
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1);
680+
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS], socFatalMdfiWestCountTile1 + initialUncorrectableFabricErrorsTile1);
677681
}
678682
handleIndex++;
679683
}

0 commit comments

Comments
 (0)