WIP - start supporting NVME and SCSI drive smart data.

Added addiitonal log message data for Smartctl exit ccode.
This commit is contained in:
Jason Kulatunga
2020-08-24 23:01:03 -07:00
parent 794479b28e
commit 5a80ae3e74
14 changed files with 2022 additions and 125 deletions
+58 -88
View File
@@ -24,7 +24,9 @@ type Smart struct {
PowerOnHours int64 `json:"power_on_hours"`
PowerCycleCount int64 `json:"power_cycle_count"`
SmartAttributes []SmartAttribute `json:"smart_attributes" gorm:"foreignkey:SmartId"`
AtaAttributes []SmartAtaAttribute `json:"ata_attributes" gorm:"foreignkey:SmartId"`
NvmeAttributes []SmartNvmeAttribute `json:"nvme_attributes" gorm:"foreignkey:SmartId"`
ScsiAttributes []SmartScsiAttribute `json:"scsi_attributes" gorm:"foreignkey:SmartId"`
}
func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) error {
@@ -36,9 +38,27 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
sm.PowerCycleCount = info.PowerCycleCount
sm.PowerOnHours = info.PowerOnTime.Hours
sm.SmartAttributes = []SmartAttribute{}
// process ATA/NVME/SCSI protocol data
if info.Device.Protocol == DeviceProtocolAta {
sm.ProcessAtaSmartInfo(info)
} else if info.Device.Protocol == DeviceProtocolNvme {
sm.ProcessNvmeSmartInfo(info)
} else if info.Device.Protocol == DeviceProtocolScsi {
sm.ProcessScsiSmartInfo(info)
}
if info.SmartStatus.Passed {
sm.SmartStatus = "passed"
} else {
sm.SmartStatus = "failed"
}
return nil
}
func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
sm.AtaAttributes = []SmartAtaAttribute{}
for _, collectorAttr := range info.AtaSmartAttributes.Table {
attrModel := SmartAttribute{
attrModel := SmartAtaAttribute{
AttributeId: collectorAttr.ID,
Name: collectorAttr.Name,
Value: collectorAttr.Value,
@@ -56,95 +76,45 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
attrModel.TransformedValue = smartMetadata.Transform(attrModel.Value, attrModel.RawValue, attrModel.RawString)
}
}
sm.SmartAttributes = append(sm.SmartAttributes, attrModel)
sm.AtaAttributes = append(sm.AtaAttributes, attrModel)
}
if info.SmartStatus.Passed {
sm.SmartStatus = "passed"
} else {
sm.SmartStatus = "failed"
}
return nil
}
const SmartAttributeStatusPassed = "passed"
const SmartAttributeStatusFailed = "failed"
const SmartAttributeStatusWarning = "warn"
type SmartAttribute struct {
gorm.Model
SmartId int `json:"smart_id"`
Smart Device `json:"-" gorm:"foreignkey:SmartId"` // use SmartId as foreign key
AttributeId int `json:"attribute_id"`
Name string `json:"name"`
Value int `json:"value"`
Worst int `json:"worst"`
Threshold int `json:"thresh"`
RawValue int64 `json:"raw_value"`
RawString string `json:"raw_string"`
WhenFailed string `json:"when_failed"`
TransformedValue int64 `json:"transformed_value"`
Status string `gorm:"-" json:"status,omitempty"`
StatusReason string `gorm:"-" json:"status_reason,omitempty"`
FailureRate float64 `gorm:"-" json:"failure_rate,omitempty"`
History []SmartAttribute `gorm:"-" json:"history,omitempty"`
func (sm *Smart) ProcessNvmeSmartInfo(info collector.SmartInfo) {
sm.NvmeAttributes = []SmartNvmeAttribute{
{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning},
{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature},
{AttributeId: "available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold},
{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed},
{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead},
{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten},
{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads},
{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites},
{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime},
{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles},
{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours},
{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns},
{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors},
{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries},
{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime},
{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime},
}
}
// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary
func (sa *SmartAttribute) MetadataObservedThresholdStatus(smartMetadata metadata.AtaSmartAttribute) {
//TODO: multiple rules
// try to predict the failure rates for observed thresholds that have 0 failure rate and error bars.
// - if the attribute is critical
// - the failure rate is over 10 - set to failed
// - the attribute does not match any threshold, set to warn
// - if the attribute is not critical
// - if failure rate is above 20 - set to failed
// - if failure rate is above 10 but below 20 - set to warn
//update the smart attribute status based on Observed thresholds.
var value int64
if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeNormalized {
value = int64(sa.Value)
} else if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeTransformed {
value = sa.TransformedValue
} else {
value = sa.RawValue
func (sm *Smart) ProcessScsiSmartInfo(info collector.SmartInfo) {
sm.ScsiAttributes = []SmartScsiAttribute{
{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList},
{AttributeId: "read.errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast},
{AttributeId: "read.errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed},
{AttributeId: "read.errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites},
{AttributeId: "read.total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected},
{AttributeId: "read.correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations},
{AttributeId: "read.total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors},
{AttributeId: "write.errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast},
{AttributeId: "write.errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed},
{AttributeId: "write.errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites},
{AttributeId: "write.total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected},
{AttributeId: "write.correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations},
{AttributeId: "write.total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors},
}
for _, obsThresh := range smartMetadata.ObservedThresholds {
//check if "value" is in this bucket
if ((obsThresh.Low == obsThresh.High) && value == obsThresh.Low) ||
(obsThresh.Low < value && value <= obsThresh.High) {
sa.FailureRate = obsThresh.AnnualFailureRate
if smartMetadata.Critical {
if obsThresh.AnnualFailureRate >= 0.10 {
sa.Status = SmartAttributeStatusFailed
sa.StatusReason = "Observed Failure Rate for Critical Attribute is greater than 10%"
}
} else {
if obsThresh.AnnualFailureRate >= 0.20 {
sa.Status = SmartAttributeStatusFailed
sa.StatusReason = "Observed Failure Rate for Attribute is greater than 20%"
} else if obsThresh.AnnualFailureRate >= 0.10 {
sa.Status = SmartAttributeStatusWarning
sa.StatusReason = "Observed Failure Rate for Attribute is greater than 10%"
}
}
//we've found the correct bucket, we can drop out of this loop
return
}
}
// no bucket found
if smartMetadata.Critical {
sa.Status = SmartAttributeStatusWarning
sa.StatusReason = "Could not determine Observed Failure Rate for Critical Attribute"
}
return
}