Add grafana alerts

Signed-off-by: Nikolaos Karaolidis <nick@karaolidis.com>
This commit is contained in:
2025-09-15 15:34:52 +01:00
parent 310950de42
commit 4cd670bb27
10 changed files with 521 additions and 11 deletions

8
flake.lock generated
View File

@@ -511,11 +511,11 @@
"secrets": {
"flake": false,
"locked": {
"lastModified": 1757873556,
"narHash": "sha256-WYrV46if1XsiQKOQEMNtHdAPeFDeu7YBdcoNSXc3sf8=",
"lastModified": 1757935242,
"narHash": "sha256-es2Uy3QsHtQieTpqSGtU7AePvxkJdLuwla9DcRTbQn4=",
"ref": "refs/heads/main",
"rev": "21ab0b0a59264b1da501f90725bf2c03e07ae941",
"revCount": 43,
"rev": "4da39113b1b5496b4351a3594c0f6efdbf0a7acf",
"revCount": 44,
"type": "git",
"url": "ssh://git@karaolidis.com/karaolidis/nix-secrets.git"
},

View File

@@ -0,0 +1,10 @@
# Notification policy for organisation 1 (provisioning format version 1).
apiVersion: 1
policies:
- orgId: 1
# Alerts handled by this policy are delivered to the "ntfy.sh" receiver.
receiver: ntfy.sh
# Alerts are grouped by the folder they live in and by alert name.
group_by:
- grafana_folder
- alertname
# Timing: no wait before the first notification of a group, 1m between
# updates for a group, and 1h before a still-firing notification is repeated.
group_wait: 0s
group_interval: 1m
repeat_interval: 1h

View File

@@ -0,0 +1,406 @@
# Alert rule group "Default", placed in the "System" folder of organisation 1.
apiVersion: 1
groups:
- orgId: 1
name: Default
folder: System
# Evaluation interval configured for this group.
interval: 10s
rules:
# CPU Usage: per-hostname non-idle CPU fraction (1 - idle rate over 1h).
# Condition C is true while query A is above 0.9; the rule uses `for: 30m`
# and `keepFiringFor: 5m`.
- uid: cpu-usage
title: CPU Usage
condition: C
data:
# Query A: instant Prometheus query producing one value per hostname.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
editorMode: code
expr: 1 - avg by(hostname) (rate(node_cpu_seconds_total{mode="idle"}[1h]))
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression C: threshold expression over A (greater than 0.9).
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0.9
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 30m
keepFiringFor: 5m
isPaused: false
# Notifications for this rule go to the "ntfy.sh" receiver.
notification_settings:
receiver: ntfy.sh
# Memory Usage: fraction of memory in use (1 - MemAvailable / MemTotal).
# Condition C is true while query A is above 0.9; the rule uses `for: 5m`
# and, unlike its neighbours, sets no keepFiringFor.
- uid: memory-usage
title: Memory Usage
condition: C
data:
# Query A: instant Prometheus query; no aggregation is applied, so one
# series per matching label set is returned.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
editorMode: code
expr: 1 - (node_memory_MemAvailable_bytes{} / node_memory_MemTotal_bytes{})
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression C: threshold expression over A (greater than 0.9).
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0.9
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
isPaused: false
# Notifications for this rule go to the "ntfy.sh" receiver.
notification_settings:
receiver: ntfy.sh
# CPU Temperature: reads one specific hwmon sensor (chip and sensor labels are
# hard-coded below). Condition C is true while the reading is above 75; the
# rule uses `for: 30m` and `keepFiringFor: 5m`.
- uid: cpu-temperature
title: CPU Temperature
condition: C
data:
# Query A: instant Prometheus query for the selected sensor.
# NOTE(review): the chip label is hardware-specific — confirm it matches
# every host that reports to this Prometheus.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
editorMode: code
expr: node_hwmon_temp_celsius{chip="pci0000:00_0000:00:18_3", sensor="temp1"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression C: threshold expression over A (greater than 75).
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 75
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 30m
keepFiringFor: 5m
isPaused: false
# Notifications for this rule go to the "ntfy.sh" receiver.
notification_settings:
receiver: ntfy.sh
# Ambient Temperature: average of all thermal_zone0 hwmon readings.
# Condition C is true while the average is above 70; the rule uses `for: 15m`
# and `keepFiringFor: 5m`.
# The uid follows the same kebab-case-of-title convention as the other rules.
- uid: ambient-temperature
title: Ambient Temperature
condition: C
data:
# Query A: instant Prometheus query; avg() collapses all matching series into
# a single unlabeled value.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
editorMode: code
expr: avg(node_hwmon_temp_celsius{chip="thermal_thermal_zone0"})
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression C: threshold expression over A (greater than 70).
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 70
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 15m
keepFiringFor: 5m
isPaused: false
# Notifications for this rule go to the "ntfy.sh" receiver.
notification_settings:
receiver: ntfy.sh
# SMART Status: condition C is true while smartctl_device_smart_status is
# below 1. No `for` is set on this rule; `keepFiringFor` is 1h.
- uid: smart-status
title: SMART Status
condition: C
data:
# Query A: instant Prometheus query; no aggregation, so one series per
# matching label set is returned.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
editorMode: code
expr: smartctl_device_smart_status
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression C: threshold expression over A (less than 1).
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 1
type: lt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
keepFiringFor: 1h
isPaused: false
# Notifications for this rule go to the "ntfy.sh" receiver.
notification_settings:
receiver: ntfy.sh
# SMART Errors: adds the 1h increase of selected raw SMART attributes
# (Raw_Read_Error_Rate, Seek_Error_Rate, Offline_Uncorrectable) to the 1h
# increase of smartctl_device_media_errors. Condition C is true while the
# total is above 0. No `for` is set on this rule; `keepFiringFor` is 1h.
- uid: smart-errors
title: SMART Errors
condition: C
data:
# Query A: instant Prometheus query yielding a single summed value.
# NOTE(review): in PromQL, sum() over no series yields an empty result, and
# `+` with an empty operand is empty too — so if either metric is absent the
# whole expression returns no data. Confirm both metrics exist on this host,
# or consider an `or vector(0)` fallback on each side.
# NOTE(review): some drives report large, ever-changing raw values for
# Raw_Read_Error_Rate / Seek_Error_Rate — verify this does not fire constantly.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
editorMode: code
expr: "sum(\n increase(smartctl_device_attribute{attribute_value_type=\"raw\", attribute_name=~\"Raw_Read_Error_Rate|Seek_Error_Rate|Offline_Uncorrectable\"}[1h])\n) + \nsum(\n increase(smartctl_device_media_errors[1h])\n)"
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression C: threshold expression over A (greater than 0).
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
keepFiringFor: 1h
isPaused: false
# Notifications for this rule go to the "ntfy.sh" receiver.
notification_settings:
receiver: ntfy.sh
# SMART Temperature: condition C is true while smartctl_device_temperature is
# above 50; the rule uses `for: 1m` and `keepFiringFor: 5m`.
- uid: smart-temperature
title: SMART Temperature
condition: C
data:
# Query A: instant Prometheus query; no label filter or aggregation is
# applied, so every series of the metric is evaluated.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
editorMode: code
expr: smartctl_device_temperature
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression C: threshold expression over A (greater than 50).
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 50
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 1m
keepFiringFor: 5m
isPaused: false
# Notifications for this rule go to the "ntfy.sh" receiver.
notification_settings:
receiver: ntfy.sh
# BTRFS Errors: 1h increase of node_btrfs_device_errors_total, summed per
# btrfs_dev_uuid. Condition C is true while any per-device sum is above 0.
# No `for` is set on this rule; `keepFiringFor` is 1h.
- uid: btrfs-errors
title: BTRFS Errors
condition: C
data:
# Query A: instant Prometheus query producing one value per btrfs_dev_uuid.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
editorMode: code
expr: |-
sum by (btrfs_dev_uuid) (
increase(node_btrfs_device_errors_total[1h])
)
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression C: threshold expression over A (greater than 0).
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
keepFiringFor: 1h
isPaused: false
# Notifications for this rule go to the "ntfy.sh" receiver.
notification_settings:
receiver: ntfy.sh

View File

@@ -18,6 +18,7 @@ in
"grafana/authelia/password".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"grafana/authelia/digest".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"grafana/smtp".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/tokens/jupiter/grafana".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
};
templates = {
@@ -114,6 +115,37 @@ in
};
}
);
grafana-to-ntfy-env.content = ''
BAUTH_PASS=${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"}
NTFY_BAUTH_PASS=${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"}
'';
grafana-contact-points.content = builtins.readFile (
(pkgs.formats.yaml { }).generate "contact-points.yaml" {
apiVersion = 1;
contactPoints = [
{
orgId = 1;
name = "ntfy.sh";
receivers = [
{
uid = "ntfy";
type = "webhook";
settings = {
httpMethod = "POST";
url = "http://grafana-to-ntfy:8080";
username = "jupiter";
password = hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana";
headers = { };
};
disableResolveMessage = false;
}
];
}
];
}
);
};
};
@@ -136,7 +168,7 @@ in
providers = [
{
name = "Default";
folder = "";
folder = "System";
type = "file";
url = "http://prometheus:9090";
options.path = "/var/lib/grafana/dashboards";
@@ -148,6 +180,9 @@ in
"${hmConfig.sops.templates.grafana.path}:/etc/grafana/grafana.ini:ro"
"${dashboards}:/etc/grafana/conf/provisioning/dashboards/default.yaml:ro"
"${./dashboards}:/var/lib/grafana/dashboards:ro"
"${./alerting/policies.yaml}:/etc/grafana/conf/provisioning/alerting/policies.yaml:ro"
"${./alerting/rules.yaml}:/etc/grafana/conf/provisioning/alerting/rules.yaml:ro"
"${hmConfig.sops.templates.grafana-contact-points.path}:/etc/grafana/conf/provisioning/alerting/contact-points.yaml:ro"
];
labels = [
"traefik.enable=true"
@@ -163,6 +198,17 @@ in
networks = [ networks.grafana.ref ];
};
grafana-to-ntfy.containerConfig = {
image = "docker-archive:${pkgs.dockerImages.grafana-to-ntfy}";
networks = [ networks.grafana.ref ];
environments = {
"NTFY_URL" = "https://ntfy.karaolidis.com/grafana";
"NTFY_BAUTH_USER" = "jupiter";
"BAUTH_USER" = "jupiter";
};
environmentFiles = [ hmConfig.sops.templates.grafana-to-ntfy-env.path ];
};
authelia.containerConfig.volumes = [
"${hmConfig.sops.templates.authelia-grafana.path}:/etc/authelia/conf.d/grafana.yaml:ro"
];

View File

@@ -17,6 +17,8 @@ in
"ntfy/webPush/publicKey".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/webPush/privateKey".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/users/karaolidis".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/users/jupiter".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
"ntfy/tokens/jupiter/grafana".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
};
templates = {
@@ -43,7 +45,14 @@ in
auth-default-access = "deny-all";
auth-startup-queries = dbStartupQueries;
auth-users = [ "karaolidis:${hmConfig.sops.placeholder."ntfy/users/karaolidis"}:admin" ];
auth-users = [
"jupiter:${hmConfig.sops.placeholder."ntfy/users/jupiter"}:user"
"karaolidis:${hmConfig.sops.placeholder."ntfy/users/karaolidis"}:admin"
];
auth-access = [ "jupiter:grafana:wo" ];
auth-tokens = [ "jupiter:${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"}" ];
behind-proxy = true;

View File

@@ -21,8 +21,9 @@ final: prev:
gitea = final.docker-image-gitea;
gitea-act-runner = final.docker-image-gitea-act-runner;
gitea-act-runner-worker = final.docker-image-gitea-act-runner-worker;
grafana = final.docker-image-grafana;
grafana-image-renderer = final.docker-image-grafana-image-renderer;
grafana-to-ntfy = final.docker-image-grafana-to-ntfy;
grafana = final.docker-image-grafana;
jellyfin = final.docker-image-jellyfin;
jellyseerr = final.docker-image-jellyseerr;
littlelink-server = final.docker-image-littlelink-server;

View File

@@ -14,8 +14,9 @@
docker-image-gitea = import ./docker/gitea { inherit pkgs; };
docker-image-gitea-act-runner = import ./docker/gitea-act-runner { inherit pkgs; };
docker-image-gitea-act-runner-worker = import ./docker/gitea-act-runner-worker { inherit pkgs; };
docker-image-grafana = import ./docker/grafana { inherit pkgs; };
docker-image-grafana-image-renderer = import ./docker/grafana-image-renderer { inherit pkgs; };
docker-image-grafana-to-ntfy = import ./docker/grafana-to-ntfy { inherit pkgs; };
docker-image-grafana = import ./docker/grafana { inherit pkgs; };
docker-image-jellyfin = import ./docker/jellyfin { inherit pkgs; };
docker-image-jellyseerr = import ./docker/jellyseerr { inherit pkgs; };
docker-image-littlelink-server = import ./docker/littlelink-server { inherit pkgs; };

View File

@@ -7,7 +7,7 @@ let
text = builtins.readFile ./entrypoint.sh;
};
runnerConfig = pkgs.writeTextDir "/etc/gitea-act-runner/config.yaml" (
config = pkgs.writeTextDir "/etc/gitea-act-runner/config.yaml" (
builtins.readFile (
(pkgs.formats.yaml { }).generate "config.yaml" {
runner.file = "/var/lib/gitea-act-runner/registration";
@@ -27,7 +27,7 @@ pkgs.dockerTools.buildImage {
paths = with pkgs; [
entrypoint
gitea-actions-runner
runnerConfig
config
curl
jq
];

View File

@@ -0,0 +1,37 @@
# Container image for grafana-to-ntfy, layered on top of the shared base image.
{ pkgs, ... }:
let
  tomlFormat = pkgs.formats.toml { };

  # Generated Rocket.toml: listen on every interface, port 8080.
  rocketToml = tomlFormat.generate "Rocket.toml" {
    global = {
      address = "0.0.0.0";
      port = 8080;
    };
  };

  # Place the generated file at /etc/grafana-to-ntfy/Rocket.toml, the same
  # directory used as WorkingDir below (presumably so the server finds it in
  # its current directory — confirm against the Rocket configuration docs).
  config = pkgs.writeTextDir "/etc/grafana-to-ntfy/Rocket.toml" (builtins.readFile rocketToml);
in
pkgs.dockerTools.buildImage {
  name = "grafana-to-ntfy";
  fromImage = pkgs.docker-image-base;

  # Root filesystem: the grafana-to-ntfy package plus the config directory;
  # only /bin and /etc are linked into the image root.
  copyToRoot = pkgs.buildEnv {
    name = "root";
    pathsToLink = [
      "/bin"
      "/etc"
    ];
    paths = [
      pkgs.grafana-to-ntfy
      config
    ];
  };

  config = {
    Entrypoint = [ "grafana-to-ntfy" ];
    WorkingDir = "/etc/grafana-to-ntfy";
    ExposedPorts."8080/tcp" = { };
  };
}