Skip to content

Commit ed9ec9a

Browse files
Fix fabric ras errors accumulated to all devices
This patch fixes the issue where fabric RAS errors from all devices are reported for every device. Related-To: LOCI-3548 Signed-off-by: Joshua Santosh Ranjan <joshua.santosh.ranjan@intel.com> Source: 7c05029
1 parent 753c723 commit ed9ec9a

File tree

2 files changed

+20
-14
lines changed

2 files changed

+20
-14
lines changed

level_zero/tools/source/sysman/ras/linux/os_ras_imp_fabric.cpp

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include <regex>
1717
namespace L0 {
1818

19-
void LinuxRasSourceFabric::getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type) {
19+
void LinuxRasSourceFabric::getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, LinuxSysmanImp *pSysmanImp, const zes_ras_error_type_t &type) {
2020
const uint32_t minBoardStrappedNumber = 0;
2121
const uint32_t maxBoardStrappedNumber = 31;
2222
const uint32_t minPortId = 1;
@@ -27,18 +27,23 @@ void LinuxRasSourceFabric::getNodes(std::vector<std::string> &nodes, uint32_t su
2727
const std::string iafPathStringAuxillary("/sys/module/iaf/drivers/auxiliary:iaf/");
2828
std::string iafPathString("");
2929

30-
if (fsAccess->directoryExists(iafPathStringMfd)) {
31-
iafPathString = iafPathStringMfd + "iaf.";
32-
} else if (fsAccess->directoryExists(iafPathStringAuxillary)) {
33-
iafPathString = iafPathStringAuxillary + "i915.iaf.";
30+
if (pSysmanImp->getSysfsAccess().getRealPath("device/", iafPathString) != ZE_RESULT_SUCCESS) {
31+
return;
32+
}
33+
34+
auto &fsAccess = pSysmanImp->getFsAccess();
35+
if (fsAccess.directoryExists(iafPathStringMfd)) {
36+
iafPathString = iafPathString + "/iaf.";
37+
} else if (fsAccess.directoryExists(iafPathStringAuxillary)) {
38+
iafPathString = iafPathString + "/i915.iaf.";
3439
} else {
3540
return;
3641
}
3742

3843
for (auto boardStrappedNumber = minBoardStrappedNumber; boardStrappedNumber <= maxBoardStrappedNumber; boardStrappedNumber++) {
3944

4045
const auto boardStrappedString(iafPathString + std::to_string(boardStrappedNumber));
41-
if (!fsAccess->directoryExists(boardStrappedString)) {
46+
if (!fsAccess.directoryExists(boardStrappedString)) {
4247
continue;
4348
}
4449
const auto subDeviceString(boardStrappedString + "/sd." + std::to_string(subdeviceId));
@@ -58,7 +63,7 @@ void LinuxRasSourceFabric::getNodes(std::vector<std::string> &nodes, uint32_t su
5863
}
5964

6065
for (auto &subDeviceErrorNode : subDeviceErrorNodes) {
61-
if (ZE_RESULT_SUCCESS == fsAccess->canRead(subDeviceErrorNode)) {
66+
if (ZE_RESULT_SUCCESS == fsAccess.canRead(subDeviceErrorNode)) {
6267
nodes.push_back(subDeviceErrorNode);
6368
}
6469
}
@@ -72,11 +77,11 @@ ze_result_t LinuxRasSourceFabric::getSupportedRasErrorTypes(std::set<zes_ras_err
7277
uint32_t subDeviceIndex = neoDevice->isSubDevice() ? static_cast<NEO::SubDevice *>(neoDevice)->getSubDeviceIndex() : 0;
7378

7479
std::vector<std::string> nodes;
75-
getNodes(nodes, subDeviceIndex, &pLinuxSysmanImp->getFsAccess(), ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
80+
getNodes(nodes, subDeviceIndex, pLinuxSysmanImp, ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
7681
if (nodes.size()) {
7782
errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
7883
}
79-
getNodes(nodes, subDeviceIndex, &pLinuxSysmanImp->getFsAccess(), ZES_RAS_ERROR_TYPE_CORRECTABLE);
84+
getNodes(nodes, subDeviceIndex, pLinuxSysmanImp, ZES_RAS_ERROR_TYPE_CORRECTABLE);
8085
if (nodes.size()) {
8186
errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
8287
}
@@ -86,15 +91,16 @@ ze_result_t LinuxRasSourceFabric::getSupportedRasErrorTypes(std::set<zes_ras_err
8691

8792
LinuxRasSourceFabric::LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId) {
8893

89-
fsAccess = &static_cast<LinuxSysmanImp *>(pOsSysman)->getFsAccess();
90-
getNodes(errorNodes, subDeviceId, fsAccess, type);
94+
pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
95+
getNodes(errorNodes, subDeviceId, pLinuxSysmanImp, type);
9196
}
9297

9398
uint64_t LinuxRasSourceFabric::getComputeErrorCount() {
9499
uint64_t currentErrorCount = 0;
100+
auto &fsAccess = pLinuxSysmanImp->getFsAccess();
95101
for (const auto &node : errorNodes) {
96102
uint64_t errorCount = 0;
97-
fsAccess->read(node, errorCount);
103+
fsAccess.read(node, errorCount);
98104
currentErrorCount += errorCount;
99105
}
100106
return currentErrorCount;

level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,11 @@ class LinuxRasSourceFabric : public LinuxRasSources {
9999
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
100100

101101
private:
102-
FsAccess *fsAccess = nullptr;
102+
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
103103
std::vector<std::string> errorNodes = {};
104104
uint64_t baseComputeErrorCount = 0;
105105
uint64_t getComputeErrorCount();
106-
static void getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type);
106+
static void getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, LinuxSysmanImp *pSysmanImp, const zes_ras_error_type_t &type);
107107
};
108108

109109
class LinuxRasSourceHbm : public LinuxRasSources {

0 commit comments

Comments
 (0)