Add grafana alerts

Signed-off-by: Nikolaos Karaolidis <nick@karaolidis.com>
This commit is contained in:
2025-09-15 15:34:52 +01:00
parent 310950de42
commit 4cd670bb27
10 changed files with 521 additions and 11 deletions

View File

@@ -0,0 +1,10 @@
# Grafana alerting provisioning: notification policy for organisation 1.
apiVersion: 1
policies:
  - orgId: 1
    # Name of the contact point that receives alerts routed by this policy.
    receiver: ntfy.sh
    # Alerts sharing the same folder and alert name are grouped together.
    group_by: [grafana_folder, alertname]
    # Timing of the first, follow-up and repeated notifications for a group.
    group_wait: 0s
    group_interval: 1m
    repeat_interval: 1h

View File

@@ -0,0 +1,406 @@
# Grafana alerting provisioning: alert rule groups.
apiVersion: 1
groups:
# Single rule group "Default" for organisation 1.
- orgId: 1
name: Default
# Folder the rules of this group are placed in.
folder: System
# Evaluation interval configured for the group.
interval: 10s
# Every rule below pairs a Prometheus query (refId A) with a threshold
# expression (refId C) that serves as the rule condition.
rules:
# CPU Usage: condition is met while the non-idle CPU fraction per hostname
# (1 minus the average idle rate over 1h) is above 0.9.
- uid: cpu-usage
  title: CPU Usage
  condition: C
  data:
    # A: instant Prometheus query producing the busy fraction per hostname.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: '1 - avg by(hostname) (rate(node_cpu_seconds_total{mode="idle"}[1h]))'
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # C: threshold expression over A (greater than 0.9).
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0.9]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
  noDataState: NoData
  execErrState: Error
  # Pending period of 30m; keepFiringFor holds the alert for 5m.
  for: 30m
  keepFiringFor: 5m
  isPaused: false
  notification_settings:
    receiver: ntfy.sh
# Memory Usage: condition is met while the used-memory fraction
# (1 minus MemAvailable / MemTotal) is above 0.9.
- uid: memory-usage
  title: Memory Usage
  condition: C
  data:
    # A: instant Prometheus query producing the used-memory fraction.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: '1 - (node_memory_MemAvailable_bytes{} / node_memory_MemTotal_bytes{})'
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # C: threshold expression over A (greater than 0.9).
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0.9]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
  noDataState: NoData
  execErrState: Error
  # Pending period of 5m; this rule sets no keepFiringFor.
  for: 5m
  isPaused: false
  notification_settings:
    receiver: ntfy.sh
# CPU Temperature: condition is met while the selected hwmon sensor
# (chip pci0000:00_0000:00:18_3, sensor temp1) reads above 75.
- uid: cpu-temperature
  title: CPU Temperature
  condition: C
  data:
    # A: instant Prometheus query for the single hwmon temperature series.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'node_hwmon_temp_celsius{chip="pci0000:00_0000:00:18_3", sensor="temp1"}'
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # C: threshold expression over A (greater than 75).
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [75]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
  noDataState: NoData
  execErrState: Error
  # Pending period of 30m; keepFiringFor holds the alert for 5m.
  for: 30m
  keepFiringFor: 5m
  isPaused: false
  notification_settings:
    receiver: ntfy.sh
# Ambient Temperature: condition is met while the average reading of the
# thermal_thermal_zone0 chip is above 70.
# The uid previously read "amabient-temperature" (typo). If that uid was ever
# provisioned, delete the old rule so it does not linger next to this one.
- uid: ambient-temperature
  title: Ambient Temperature
  condition: C
  data:
    # A: instant Prometheus query averaging all thermal_zone0 series.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'avg(node_hwmon_temp_celsius{chip="thermal_thermal_zone0"})'
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # C: threshold expression over A (greater than 70).
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [70]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
  noDataState: NoData
  execErrState: Error
  # Pending period of 15m; keepFiringFor holds the alert for 5m.
  for: 15m
  keepFiringFor: 5m
  isPaused: false
  notification_settings:
    receiver: ntfy.sh
# SMART Status: condition is met as soon as any smartctl_device_smart_status
# series drops below 1.
- uid: smart-status
  title: SMART Status
  condition: C
  data:
    # A: instant Prometheus query returning one series per reported device.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: smartctl_device_smart_status
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # C: threshold expression over A (less than 1).
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: lt
              params: [1]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
  noDataState: NoData
  execErrState: Error
  # No pending period: stated explicitly because the provisioning reference
  # lists `for` as a required field.
  for: 0s
  keepFiringFor: 1h
  isPaused: false
  notification_settings:
    receiver: ntfy.sh
# SMART Errors: condition is met when the selected raw SMART error attributes
# plus the media error counter increased within the last hour.
- uid: smart-errors
  title: SMART Errors
  condition: C
  data:
    # A: instant Prometheus query summing both error sources.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        # A binary `+` yields no result when either operand has no series, so
        # a host exposing only one of the two metrics would stay in NoData.
        # The `or` fallbacks use whichever side exists; when both are absent
        # the query is still empty and noDataState applies as before.
        expr: |-
          (
            sum(
              increase(smartctl_device_attribute{attribute_value_type="raw", attribute_name=~"Raw_Read_Error_Rate|Seek_Error_Rate|Offline_Uncorrectable"}[1h])
            )
            +
            sum(
              increase(smartctl_device_media_errors[1h])
            )
          )
          or sum(
            increase(smartctl_device_attribute{attribute_value_type="raw", attribute_name=~"Raw_Read_Error_Rate|Seek_Error_Rate|Offline_Uncorrectable"}[1h])
          )
          or sum(
            increase(smartctl_device_media_errors[1h])
          )
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # C: threshold expression over A (greater than 0).
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
  noDataState: NoData
  execErrState: Error
  # No pending period: stated explicitly because the provisioning reference
  # lists `for` as a required field.
  for: 0s
  keepFiringFor: 1h
  isPaused: false
  notification_settings:
    receiver: ntfy.sh
# SMART Temperature: condition is met while a device's current temperature
# is above 50.
- uid: smart-temperature
  title: SMART Temperature
  condition: C
  data:
    # A: instant Prometheus query, one series per device.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        # Restricted to temperature_type="current": the exporter publishes
        # limit and lifetime readings under the same metric name, and those
        # can sit above the threshold permanently.
        expr: 'smartctl_device_temperature{temperature_type="current"}'
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # C: threshold expression over A (greater than 50).
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [50]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
  noDataState: NoData
  execErrState: Error
  # Pending period of 1m; keepFiringFor holds the alert for 5m.
  for: 1m
  keepFiringFor: 5m
  isPaused: false
  notification_settings:
    receiver: ntfy.sh
# BTRFS Errors: condition is met when the error counter of any btrfs device
# (grouped by btrfs_dev_uuid) increased within the last hour.
- uid: btrfs-errors
  title: BTRFS Errors
  condition: C
  data:
    # A: instant Prometheus query, one series per btrfs device UUID.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: |-
          sum by (btrfs_dev_uuid) (
            increase(node_btrfs_device_errors_total[1h])
          )
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # C: threshold expression over A (greater than 0).
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
  noDataState: NoData
  execErrState: Error
  # No pending period: stated explicitly because the provisioning reference
  # lists `for` as a required field.
  for: 0s
  keepFiringFor: 1h
  isPaused: false
  notification_settings:
    receiver: ntfy.sh

View File

@@ -18,6 +18,7 @@ in
"grafana/authelia/password".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"grafana/authelia/digest".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"grafana/smtp".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/tokens/jupiter/grafana".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
};
templates = {
@@ -114,6 +115,37 @@ in
};
}
);
# Environment file for the grafana-to-ntfy container. The same ntfy token
# placeholder is written to both BAUTH_PASS and NTFY_BAUTH_PASS.
grafana-to-ntfy-env.content =
  let
    ntfyToken = hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana";
  in
  ''
    BAUTH_PASS=${ntfyToken}
    NTFY_BAUTH_PASS=${ntfyToken}
  '';
# Grafana contact-point provisioning file: generated as YAML, then read back
# so the sops placeholder can be substituted in the rendered template.
grafana-contact-points.content =
  let
    yamlFormat = pkgs.formats.yaml { };

    # Webhook receiver targeting the grafana-to-ntfy container, authenticated
    # with the "jupiter" user and the ntfy token placeholder.
    ntfyWebhook = {
      uid = "ntfy";
      type = "webhook";
      disableResolveMessage = false;
      settings = {
        httpMethod = "POST";
        url = "http://grafana-to-ntfy:8080";
        username = "jupiter";
        password = hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana";
        headers = { };
      };
    };
  in
  builtins.readFile (
    yamlFormat.generate "contact-points.yaml" {
      apiVersion = 1;
      contactPoints = [
        {
          orgId = 1;
          # Must match the receiver name used by the alert rules and policy.
          name = "ntfy.sh";
          receivers = [ ntfyWebhook ];
        }
      ];
    }
  );
};
};
@@ -136,7 +168,7 @@ in
providers = [
{
name = "Default";
folder = "";
folder = "System";
type = "file";
url = "http://prometheus:9090";
options.path = "/var/lib/grafana/dashboards";
@@ -148,6 +180,9 @@ in
"${hmConfig.sops.templates.grafana.path}:/etc/grafana/grafana.ini:ro"
"${dashboards}:/etc/grafana/conf/provisioning/dashboards/default.yaml:ro"
"${./dashboards}:/var/lib/grafana/dashboards:ro"
"${./alerting/policies.yaml}:/etc/grafana/conf/provisioning/alerting/policies.yaml:ro"
"${./alerting/rules.yaml}:/etc/grafana/conf/provisioning/alerting/rules.yaml:ro"
"${hmConfig.sops.templates.grafana-contact-points.path}:/etc/grafana/conf/provisioning/alerting/contact-points.yaml:ro"
];
labels = [
"traefik.enable=true"
@@ -163,6 +198,17 @@ in
networks = [ networks.grafana.ref ];
};
# Container that receives the Grafana webhook (see the contact point URL) and
# is configured with the ntfy topic URL; passwords come from the env file.
grafana-to-ntfy.containerConfig = {
  image = "docker-archive:${pkgs.dockerImages.grafana-to-ntfy}";
  networks = [ networks.grafana.ref ];
  environmentFiles = [ hmConfig.sops.templates.grafana-to-ntfy-env.path ];
  environments = {
    BAUTH_USER = "jupiter";
    NTFY_BAUTH_USER = "jupiter";
    NTFY_URL = "https://ntfy.karaolidis.com/grafana";
  };
};
authelia.containerConfig.volumes = [
"${hmConfig.sops.templates.authelia-grafana.path}:/etc/authelia/conf.d/grafana.yaml:ro"
];

View File

@@ -17,6 +17,8 @@ in
"ntfy/webPush/publicKey".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/webPush/privateKey".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/users/karaolidis".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/users/jupiter".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/tokens/jupiter/grafana".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
};
templates = {
@@ -43,7 +45,14 @@ in
auth-default-access = "deny-all";
auth-startup-queries = dbStartupQueries;
auth-users = [ "karaolidis:${hmConfig.sops.placeholder."ntfy/users/karaolidis"}:admin" ];
auth-users = [
"jupiter:${hmConfig.sops.placeholder."ntfy/users/jupiter"}:user"
"karaolidis:${hmConfig.sops.placeholder."ntfy/users/karaolidis"}:admin"
];
auth-access = [ "jupiter:grafana:wo" ];
auth-tokens = [ "jupiter:${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"}" ];
behind-proxy = true;