added ideal and thresholds for NVMe and SCSI drives.

Added functions (PopulateAttributeStatus) to ensure that NVME and SCSI drives set the status for SMART attributes.
Moved Status populating fucntion into the *Attribute files, so they are closer to the code they actually interact with.
Fix frontend to correctly display status, thresh and Ideal for NVMe and SCSI ddrives.
This commit is contained in:
Jason Kulatunga
2020-09-08 22:24:24 -07:00
parent fb1415f8a5
commit 86fc10b7b9
9 changed files with 147 additions and 73 deletions
+12 -23
View File
@@ -1,9 +1,7 @@
package db
import (
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
"strings"
"time"
)
@@ -130,33 +128,24 @@ func (dv *Device) SquashHistory() error {
}
func (dv *Device) ApplyMetadataRules() error {
if !dv.IsAta() {
// Scrutiny Observed thresholds not yet available for NVME or SCSI drives
// since most SMART attributes are not present and BackBlaze data not available
return nil
}
//embed metadata in the latest smart attributes object
if len(dv.SmartResults) > 0 && len(dv.SmartResults[0].AtaAttributes) > 0 {
if len(dv.SmartResults) > 0 {
for ndx, attr := range dv.SmartResults[0].AtaAttributes {
if strings.ToUpper(attr.WhenFailed) == SmartWhenFailedFailingNow {
//this attribute has previously failed
dv.SmartResults[0].AtaAttributes[ndx].Status = SmartAttributeStatusFailed
dv.SmartResults[0].AtaAttributes[ndx].StatusReason = "Attribute is failing manufacturer SMART threshold"
attr.PopulateAttributeStatus()
dv.SmartResults[0].AtaAttributes[ndx] = attr
}
} else if strings.ToUpper(attr.WhenFailed) == SmartWhenFailedInThePast {
dv.SmartResults[0].AtaAttributes[ndx].Status = SmartAttributeStatusWarning
dv.SmartResults[0].AtaAttributes[ndx].StatusReason = "Attribute has previously failed manufacturer SMART threshold"
}
for ndx, attr := range dv.SmartResults[0].NvmeAttributes {
attr.PopulateAttributeStatus()
dv.SmartResults[0].NvmeAttributes[ndx] = attr
if smartMetadata, ok := metadata.AtaMetadata[attr.AttributeId]; ok {
dv.SmartResults[0].AtaAttributes[ndx].MetadataObservedThresholdStatus(smartMetadata)
}
}
for ndx, attr := range dv.SmartResults[0].ScsiAttributes {
attr.PopulateAttributeStatus()
dv.SmartResults[0].ScsiAttributes[ndx] = attr
//check if status is blank, set to "passed"
if len(dv.SmartResults[0].AtaAttributes[ndx].Status) == 0 {
dv.SmartResults[0].AtaAttributes[ndx].Status = SmartAttributeStatusPassed
}
}
}
return nil
+31 -27
View File
@@ -29,6 +29,7 @@ type Smart struct {
ScsiAttributes []SmartScsiAttribute `json:"scsi_attributes" gorm:"foreignkey:SmartId"`
}
//Parse Collector SMART data results and create Smart object (and associated SmartAtaAttribute entries)
func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) error {
sm.DeviceWWN = wwn
sm.TestDate = time.Unix(info.LocalTime.TimeT, 0)
@@ -55,6 +56,7 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
return nil
}
//generate SmartAtaAttribute entries from Scrutiny Collector Smart data.
func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
sm.AtaAttributes = []SmartAtaAttribute{}
for _, collectorAttr := range info.AtaSmartAttributes.Table {
@@ -80,41 +82,43 @@ func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
}
}
//generate SmartNvmeAttribute entries from Scrutiny Collector Smart data.
func (sm *Smart) ProcessNvmeSmartInfo(info collector.SmartInfo) {
sm.NvmeAttributes = []SmartNvmeAttribute{
{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning},
{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature},
{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning, Threshold: 0},
{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature, Threshold: -1},
{AttributeId: "available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold},
{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed, Threshold: 100},
{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead},
{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten},
{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads},
{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites},
{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime},
{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles},
{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours},
{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns},
{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors},
{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries},
{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime},
{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime},
{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead, Threshold: -1},
{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten, Threshold: -1},
{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads, Threshold: -1},
{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites, Threshold: -1},
{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime, Threshold: -1},
{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles, Threshold: -1},
{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours, Threshold: -1},
{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns, Threshold: -1},
{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors, Threshold: 0},
{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries, Threshold: 0},
{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime, Threshold: -1},
{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime, Threshold: -1},
}
}
//generate SmartScsiAttribute entries from Scrutiny Collector Smart data.
func (sm *Smart) ProcessScsiSmartInfo(info collector.SmartInfo) {
sm.ScsiAttributes = []SmartScsiAttribute{
{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList},
{AttributeId: "read.errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast},
{AttributeId: "read.errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed},
{AttributeId: "read.errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites},
{AttributeId: "read.total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected},
{AttributeId: "read.correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations},
{AttributeId: "read.total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors},
{AttributeId: "write.errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast},
{AttributeId: "write.errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed},
{AttributeId: "write.errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites},
{AttributeId: "write.total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected},
{AttributeId: "write.correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations},
{AttributeId: "write.total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors},
{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList, Threshold: 0},
{AttributeId: "read.errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast, Threshold: -1},
{AttributeId: "read.errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed, Threshold: -1},
{AttributeId: "read.errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites, Threshold: 0},
{AttributeId: "read.total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected, Threshold: -1},
{AttributeId: "read.correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations, Threshold: -1},
{AttributeId: "read.total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors, Threshold: 0},
{AttributeId: "write.errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast, Threshold: -1},
{AttributeId: "write.errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed, Threshold: -1},
{AttributeId: "write.errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites, Threshold: 0},
{AttributeId: "write.total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected, Threshold: -1},
{AttributeId: "write.correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations, Threshold: -1},
{AttributeId: "write.total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors, Threshold: 0},
}
}
@@ -3,6 +3,7 @@ package db
import (
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/jinzhu/gorm"
"strings"
)
const SmartAttributeStatusPassed = "passed"
@@ -31,6 +32,28 @@ type SmartAtaAttribute struct {
History []SmartAtaAttribute `gorm:"-" json:"history,omitempty"`
}
//populate attribute status, using SMART Thresholds & Observed Metadata
func (sa *SmartAtaAttribute) PopulateAttributeStatus() {
if strings.ToUpper(sa.WhenFailed) == SmartWhenFailedFailingNow {
//this attribute has previously failed
sa.Status = SmartAttributeStatusFailed
sa.StatusReason = "Attribute is failing manufacturer SMART threshold"
} else if strings.ToUpper(sa.WhenFailed) == SmartWhenFailedInThePast {
sa.Status = SmartAttributeStatusWarning
sa.StatusReason = "Attribute has previously failed manufacturer SMART threshold"
}
if smartMetadata, ok := metadata.AtaMetadata[sa.AttributeId]; ok {
sa.MetadataObservedThresholdStatus(smartMetadata)
}
//check if status is blank, set to "passed"
if len(sa.Status) == 0 {
sa.Status = SmartAttributeStatusPassed
}
}
// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary
func (sa *SmartAtaAttribute) MetadataObservedThresholdStatus(smartMetadata metadata.AtaAttributeMetadata) {
//TODO: multiple rules
@@ -1,6 +1,9 @@
package db
import "github.com/jinzhu/gorm"
import (
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/jinzhu/gorm"
)
type SmartNvmeAttribute struct {
gorm.Model
@@ -19,3 +22,25 @@ type SmartNvmeAttribute struct {
FailureRate float64 `gorm:"-" json:"failure_rate,omitempty"`
History []SmartNvmeAttribute `gorm:"-" json:"history,omitempty"`
}
//populate attribute status, using SMART Thresholds & Observed Metadata
func (sa *SmartNvmeAttribute) PopulateAttributeStatus() {
//-1 is a special number meaning no threshold.
if sa.Threshold != -1 {
if smartMetadata, ok := metadata.NmveMetadata[sa.AttributeId]; ok {
//check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
(smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
sa.Status = SmartAttributeStatusFailed
sa.StatusReason = "Attribute is failing recommended SMART threshold"
}
}
}
//TODO: eventually figure out the critical_warning bits and determine correct error messages here.
//check if status is blank, set to "passed"
if len(sa.Status) == 0 {
sa.Status = SmartAttributeStatusPassed
}
}
@@ -1,6 +1,9 @@
package db
import "github.com/jinzhu/gorm"
import (
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/jinzhu/gorm"
)
type SmartScsiAttribute struct {
gorm.Model
@@ -19,3 +22,24 @@ type SmartScsiAttribute struct {
FailureRate float64 `gorm:"-" json:"failure_rate,omitempty"`
History []SmartScsiAttribute `gorm:"-" json:"history,omitempty"`
}
//populate attribute status, using SMART Thresholds & Observed Metadata
func (sa *SmartScsiAttribute) PopulateAttributeStatus() {
//-1 is a special number meaning no threshold.
if sa.Threshold != -1 {
if smartMetadata, ok := metadata.NmveMetadata[sa.AttributeId]; ok {
//check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
(smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
sa.Status = SmartAttributeStatusFailed
sa.StatusReason = "Attribute is failing recommended SMART threshold"
}
}
}
//check if status is blank, set to "passed"
if len(sa.Status) == 0 {
sa.Status = SmartAttributeStatusPassed
}
}