diff --git a/nagios/values.yaml b/nagios/values.yaml
index 212d007fa4..d98cbb6cc4 100644
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -378,6 +378,13 @@ conf:
           service_description: "Deployment_replicas-unavailable"
           check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
           check_interval: 60
+      # Keyed to the Prometheus alert named volume_claim_capacity_high_utilization (see prometheus/values.yaml).
+      - check_volume_claim_high_utilization:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Volume_claim_high_utilization"
+          check_command: check_prom_alert!volume_claim_capacity_high_utilization!CRITICAL- Volume claim {persistentvolumeclaim} has exceed 80% utilization!OK- All volume claims less than 80% utilization
+          check_interval: 60
       - check_deployment_rollingupdate_replicas_unavailable:
           use: notifying_service
           hostgroup_name: prometheus-hosts
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 0c1ae2909f..7fc98bf911 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -900,6 +900,16 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+          # Utilization is used / capacity. Dividing the *available* bytes by capacity would
+          # invert the alert: it would fire while a claim is mostly empty and go quiet as it fills.
+          - alert: volume_claim_capacity_high_utilization
+            expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
+              summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
       basic_linux:
         groups:
         - name: basic_linux.rules