WIP - start supporting NVME and SCSI drive smart data.
Added addiitonal log message data for Smartctl exit ccode.
This commit is contained in:
@@ -24,7 +24,9 @@ type Smart struct {
|
||||
PowerOnHours int64 `json:"power_on_hours"`
|
||||
PowerCycleCount int64 `json:"power_cycle_count"`
|
||||
|
||||
SmartAttributes []SmartAttribute `json:"smart_attributes" gorm:"foreignkey:SmartId"`
|
||||
AtaAttributes []SmartAtaAttribute `json:"ata_attributes" gorm:"foreignkey:SmartId"`
|
||||
NvmeAttributes []SmartNvmeAttribute `json:"nvme_attributes" gorm:"foreignkey:SmartId"`
|
||||
ScsiAttributes []SmartScsiAttribute `json:"scsi_attributes" gorm:"foreignkey:SmartId"`
|
||||
}
|
||||
|
||||
func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) error {
|
||||
@@ -36,9 +38,27 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
|
||||
sm.PowerCycleCount = info.PowerCycleCount
|
||||
sm.PowerOnHours = info.PowerOnTime.Hours
|
||||
|
||||
sm.SmartAttributes = []SmartAttribute{}
|
||||
// process ATA/NVME/SCSI protocol data
|
||||
if info.Device.Protocol == DeviceProtocolAta {
|
||||
sm.ProcessAtaSmartInfo(info)
|
||||
} else if info.Device.Protocol == DeviceProtocolNvme {
|
||||
sm.ProcessNvmeSmartInfo(info)
|
||||
} else if info.Device.Protocol == DeviceProtocolScsi {
|
||||
sm.ProcessScsiSmartInfo(info)
|
||||
}
|
||||
|
||||
if info.SmartStatus.Passed {
|
||||
sm.SmartStatus = "passed"
|
||||
} else {
|
||||
sm.SmartStatus = "failed"
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
|
||||
sm.AtaAttributes = []SmartAtaAttribute{}
|
||||
for _, collectorAttr := range info.AtaSmartAttributes.Table {
|
||||
attrModel := SmartAttribute{
|
||||
attrModel := SmartAtaAttribute{
|
||||
AttributeId: collectorAttr.ID,
|
||||
Name: collectorAttr.Name,
|
||||
Value: collectorAttr.Value,
|
||||
@@ -56,95 +76,45 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
|
||||
attrModel.TransformedValue = smartMetadata.Transform(attrModel.Value, attrModel.RawValue, attrModel.RawString)
|
||||
}
|
||||
}
|
||||
sm.SmartAttributes = append(sm.SmartAttributes, attrModel)
|
||||
sm.AtaAttributes = append(sm.AtaAttributes, attrModel)
|
||||
}
|
||||
|
||||
if info.SmartStatus.Passed {
|
||||
sm.SmartStatus = "passed"
|
||||
} else {
|
||||
sm.SmartStatus = "failed"
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
const SmartAttributeStatusPassed = "passed"
|
||||
const SmartAttributeStatusFailed = "failed"
|
||||
const SmartAttributeStatusWarning = "warn"
|
||||
|
||||
type SmartAttribute struct {
|
||||
gorm.Model
|
||||
|
||||
SmartId int `json:"smart_id"`
|
||||
Smart Device `json:"-" gorm:"foreignkey:SmartId"` // use SmartId as foreign key
|
||||
|
||||
AttributeId int `json:"attribute_id"`
|
||||
Name string `json:"name"`
|
||||
Value int `json:"value"`
|
||||
Worst int `json:"worst"`
|
||||
Threshold int `json:"thresh"`
|
||||
RawValue int64 `json:"raw_value"`
|
||||
RawString string `json:"raw_string"`
|
||||
WhenFailed string `json:"when_failed"`
|
||||
|
||||
TransformedValue int64 `json:"transformed_value"`
|
||||
Status string `gorm:"-" json:"status,omitempty"`
|
||||
StatusReason string `gorm:"-" json:"status_reason,omitempty"`
|
||||
FailureRate float64 `gorm:"-" json:"failure_rate,omitempty"`
|
||||
History []SmartAttribute `gorm:"-" json:"history,omitempty"`
|
||||
func (sm *Smart) ProcessNvmeSmartInfo(info collector.SmartInfo) {
|
||||
sm.NvmeAttributes = []SmartNvmeAttribute{
|
||||
{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning},
|
||||
{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature},
|
||||
{AttributeId: "available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold},
|
||||
{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed},
|
||||
{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead},
|
||||
{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten},
|
||||
{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads},
|
||||
{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites},
|
||||
{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime},
|
||||
{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles},
|
||||
{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours},
|
||||
{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns},
|
||||
{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors},
|
||||
{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries},
|
||||
{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime},
|
||||
{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime},
|
||||
}
|
||||
}
|
||||
|
||||
// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary
|
||||
func (sa *SmartAttribute) MetadataObservedThresholdStatus(smartMetadata metadata.AtaSmartAttribute) {
|
||||
//TODO: multiple rules
|
||||
// try to predict the failure rates for observed thresholds that have 0 failure rate and error bars.
|
||||
// - if the attribute is critical
|
||||
// - the failure rate is over 10 - set to failed
|
||||
// - the attribute does not match any threshold, set to warn
|
||||
// - if the attribute is not critical
|
||||
// - if failure rate is above 20 - set to failed
|
||||
// - if failure rate is above 10 but below 20 - set to warn
|
||||
|
||||
//update the smart attribute status based on Observed thresholds.
|
||||
var value int64
|
||||
if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeNormalized {
|
||||
value = int64(sa.Value)
|
||||
} else if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeTransformed {
|
||||
value = sa.TransformedValue
|
||||
} else {
|
||||
value = sa.RawValue
|
||||
func (sm *Smart) ProcessScsiSmartInfo(info collector.SmartInfo) {
|
||||
sm.ScsiAttributes = []SmartScsiAttribute{
|
||||
{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList},
|
||||
{AttributeId: "read.errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast},
|
||||
{AttributeId: "read.errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed},
|
||||
{AttributeId: "read.errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites},
|
||||
{AttributeId: "read.total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected},
|
||||
{AttributeId: "read.correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations},
|
||||
{AttributeId: "read.total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors},
|
||||
{AttributeId: "write.errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast},
|
||||
{AttributeId: "write.errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed},
|
||||
{AttributeId: "write.errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites},
|
||||
{AttributeId: "write.total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected},
|
||||
{AttributeId: "write.correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations},
|
||||
{AttributeId: "write.total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors},
|
||||
}
|
||||
|
||||
for _, obsThresh := range smartMetadata.ObservedThresholds {
|
||||
|
||||
//check if "value" is in this bucket
|
||||
if ((obsThresh.Low == obsThresh.High) && value == obsThresh.Low) ||
|
||||
(obsThresh.Low < value && value <= obsThresh.High) {
|
||||
sa.FailureRate = obsThresh.AnnualFailureRate
|
||||
|
||||
if smartMetadata.Critical {
|
||||
if obsThresh.AnnualFailureRate >= 0.10 {
|
||||
sa.Status = SmartAttributeStatusFailed
|
||||
sa.StatusReason = "Observed Failure Rate for Critical Attribute is greater than 10%"
|
||||
}
|
||||
} else {
|
||||
if obsThresh.AnnualFailureRate >= 0.20 {
|
||||
sa.Status = SmartAttributeStatusFailed
|
||||
sa.StatusReason = "Observed Failure Rate for Attribute is greater than 20%"
|
||||
} else if obsThresh.AnnualFailureRate >= 0.10 {
|
||||
sa.Status = SmartAttributeStatusWarning
|
||||
sa.StatusReason = "Observed Failure Rate for Attribute is greater than 10%"
|
||||
}
|
||||
}
|
||||
|
||||
//we've found the correct bucket, we can drop out of this loop
|
||||
return
|
||||
}
|
||||
}
|
||||
// no bucket found
|
||||
if smartMetadata.Critical {
|
||||
sa.Status = SmartAttributeStatusWarning
|
||||
sa.StatusReason = "Could not determine Observed Failure Rate for Critical Attribute"
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user