Add grafana alerts
Signed-off-by: Nikolaos Karaolidis <nick@karaolidis.com>
This commit is contained in:
8
flake.lock
generated
8
flake.lock
generated
@@ -511,11 +511,11 @@
|
||||
"secrets": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1757873556,
|
||||
"narHash": "sha256-WYrV46if1XsiQKOQEMNtHdAPeFDeu7YBdcoNSXc3sf8=",
|
||||
"lastModified": 1757935242,
|
||||
"narHash": "sha256-es2Uy3QsHtQieTpqSGtU7AePvxkJdLuwla9DcRTbQn4=",
|
||||
"ref": "refs/heads/main",
|
||||
"rev": "21ab0b0a59264b1da501f90725bf2c03e07ae941",
|
||||
"revCount": 43,
|
||||
"rev": "4da39113b1b5496b4351a3594c0f6efdbf0a7acf",
|
||||
"revCount": 44,
|
||||
"type": "git",
|
||||
"url": "ssh://git@karaolidis.com/karaolidis/nix-secrets.git"
|
||||
},
|
||||
|
@@ -0,0 +1,10 @@
|
||||
# Grafana alerting notification policy (file provisioning format, apiVersion 1).
# A single root policy for org 1: every alert goes to the "ntfy.sh" receiver,
# grouped by folder and alert name.
apiVersion: 1
policies:
  - orgId: 1
    receiver: ntfy.sh
    group_by: [grafana_folder, alertname]
    # Timing: no initial wait, 1m between updates for a group, 1h between repeats.
    group_wait: 0s
    group_interval: 1m
    repeat_interval: 1h
|
@@ -0,0 +1,406 @@
|
||||
apiVersion: 1
|
||||
groups:
|
||||
- orgId: 1
|
||||
name: Default
|
||||
folder: System
|
||||
interval: 10s
|
||||
rules:
|
||||
- uid: cpu-usage
|
||||
title: CPU Usage
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: prometheus
|
||||
model:
|
||||
editorMode: code
|
||||
expr: 1 - avg by(hostname) (rate(node_cpu_seconds_total{mode="idle"}[1h]))
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0.9
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 30m
|
||||
keepFiringFor: 5m
|
||||
isPaused: false
|
||||
notification_settings:
|
||||
receiver: ntfy.sh
|
||||
- uid: memory-usage
|
||||
title: Memory Usage
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: prometheus
|
||||
model:
|
||||
editorMode: code
|
||||
expr: 1 - (node_memory_MemAvailable_bytes{} / node_memory_MemTotal_bytes{})
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0.9
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
isPaused: false
|
||||
notification_settings:
|
||||
receiver: ntfy.sh
|
||||
- uid: cpu-temperature
|
||||
title: CPU Temperature
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: prometheus
|
||||
model:
|
||||
editorMode: code
|
||||
expr: node_hwmon_temp_celsius{chip="pci0000:00_0000:00:18_3", sensor="temp1"}
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 75
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 30m
|
||||
keepFiringFor: 5m
|
||||
isPaused: false
|
||||
notification_settings:
|
||||
receiver: ntfy.sh
|
||||
# Ambient temperature alert rule.
- uid: ambient-temperature
|
||||
title: Ambient Temperature
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: prometheus
|
||||
model:
|
||||
editorMode: code
|
||||
expr: avg(node_hwmon_temp_celsius{chip="thermal_thermal_zone0"})
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 70
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 15m
|
||||
keepFiringFor: 5m
|
||||
isPaused: false
|
||||
notification_settings:
|
||||
receiver: ntfy.sh
|
||||
- uid: smart-status
|
||||
title: SMART Status
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: prometheus
|
||||
model:
|
||||
editorMode: code
|
||||
expr: smartctl_device_smart_status
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 1
|
||||
type: lt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
keepFiringFor: 1h
|
||||
isPaused: false
|
||||
notification_settings:
|
||||
receiver: ntfy.sh
|
||||
- uid: smart-errors
|
||||
title: SMART Errors
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: prometheus
|
||||
model:
|
||||
editorMode: code
|
||||
# Each operand is guarded with "or vector(0)": if one side has no series
# (e.g. no device reports media errors), the addition would otherwise yield
# an empty result and the rule could never evaluate above the threshold.
expr: "(\n sum(\n increase(smartctl_device_attribute{attribute_value_type=\"raw\", attribute_name=~\"Raw_Read_Error_Rate|Seek_Error_Rate|Offline_Uncorrectable\"}[1h])\n ) or vector(0)\n) + \n(\n sum(\n increase(smartctl_device_media_errors[1h])\n ) or vector(0)\n)"
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
keepFiringFor: 1h
|
||||
isPaused: false
|
||||
notification_settings:
|
||||
receiver: ntfy.sh
|
||||
- uid: smart-temperature
|
||||
title: SMART Temperature
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: prometheus
|
||||
model:
|
||||
editorMode: code
|
||||
expr: smartctl_device_temperature
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 50
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 1m
|
||||
keepFiringFor: 5m
|
||||
isPaused: false
|
||||
notification_settings:
|
||||
receiver: ntfy.sh
|
||||
- uid: btrfs-errors
|
||||
title: BTRFS Errors
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: prometheus
|
||||
model:
|
||||
editorMode: code
|
||||
expr: |-
|
||||
sum by (btrfs_dev_uuid) (
|
||||
increase(node_btrfs_device_errors_total[1h])
|
||||
)
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
keepFiringFor: 1h
|
||||
isPaused: false
|
||||
notification_settings:
|
||||
receiver: ntfy.sh
|
@@ -18,6 +18,7 @@ in
|
||||
"grafana/authelia/password".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
|
||||
"grafana/authelia/digest".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
|
||||
"grafana/smtp".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
|
||||
"ntfy/tokens/jupiter/grafana".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
|
||||
};
|
||||
|
||||
templates = {
|
||||
@@ -114,6 +115,37 @@ in
|
||||
};
|
||||
}
|
||||
);
|
||||
|
||||
grafana-to-ntfy-env.content = ''
|
||||
BAUTH_PASS=${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"}
|
||||
NTFY_BAUTH_PASS=${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"}
|
||||
'';
|
||||
|
||||
grafana-contact-points.content = builtins.readFile (
|
||||
(pkgs.formats.yaml { }).generate "contact-points.yaml" {
|
||||
apiVersion = 1;
|
||||
contactPoints = [
|
||||
{
|
||||
orgId = 1;
|
||||
name = "ntfy.sh";
|
||||
receivers = [
|
||||
{
|
||||
uid = "ntfy";
|
||||
type = "webhook";
|
||||
settings = {
|
||||
httpMethod = "POST";
|
||||
url = "http://grafana-to-ntfy:8080";
|
||||
username = "jupiter";
|
||||
password = hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana";
|
||||
headers = { };
|
||||
};
|
||||
disableResolveMessage = false;
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
}
|
||||
);
|
||||
};
|
||||
};
|
||||
|
||||
@@ -136,7 +168,7 @@ in
|
||||
providers = [
|
||||
{
|
||||
name = "Default";
|
||||
folder = "";
|
||||
folder = "System";
|
||||
type = "file";
|
||||
url = "http://prometheus:9090";
|
||||
options.path = "/var/lib/grafana/dashboards";
|
||||
@@ -148,6 +180,9 @@ in
|
||||
"${hmConfig.sops.templates.grafana.path}:/etc/grafana/grafana.ini:ro"
|
||||
"${dashboards}:/etc/grafana/conf/provisioning/dashboards/default.yaml:ro"
|
||||
"${./dashboards}:/var/lib/grafana/dashboards:ro"
|
||||
"${./alerting/policies.yaml}:/etc/grafana/conf/provisioning/alerting/policies.yaml:ro"
|
||||
"${./alerting/rules.yaml}:/etc/grafana/conf/provisioning/alerting/rules.yaml:ro"
|
||||
"${hmConfig.sops.templates.grafana-contact-points.path}:/etc/grafana/conf/provisioning/alerting/contact-points.yaml:ro"
|
||||
];
|
||||
labels = [
|
||||
"traefik.enable=true"
|
||||
@@ -163,6 +198,17 @@ in
|
||||
networks = [ networks.grafana.ref ];
|
||||
};
|
||||
|
||||
grafana-to-ntfy.containerConfig = {
|
||||
image = "docker-archive:${pkgs.dockerImages.grafana-to-ntfy}";
|
||||
networks = [ networks.grafana.ref ];
|
||||
environments = {
|
||||
"NTFY_URL" = "https://ntfy.karaolidis.com/grafana";
|
||||
"NTFY_BAUTH_USER" = "jupiter";
|
||||
"BAUTH_USER" = "jupiter";
|
||||
};
|
||||
environmentFiles = [ hmConfig.sops.templates.grafana-to-ntfy-env.path ];
|
||||
};
|
||||
|
||||
authelia.containerConfig.volumes = [
|
||||
"${hmConfig.sops.templates.authelia-grafana.path}:/etc/authelia/conf.d/grafana.yaml:ro"
|
||||
];
|
||||
|
@@ -17,6 +17,8 @@ in
|
||||
"ntfy/webPush/publicKey".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
|
||||
"ntfy/webPush/privateKey".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
|
||||
"ntfy/users/karaolidis".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
|
||||
"ntfy/users/jupiter".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
|
||||
"ntfy/tokens/jupiter/grafana".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml";
|
||||
};
|
||||
|
||||
templates = {
|
||||
@@ -43,7 +45,14 @@ in
|
||||
auth-default-access = "deny-all";
|
||||
auth-startup-queries = dbStartupQueries;
|
||||
|
||||
auth-users = [ "karaolidis:${hmConfig.sops.placeholder."ntfy/users/karaolidis"}:admin" ];
|
||||
auth-users = [
|
||||
"jupiter:${hmConfig.sops.placeholder."ntfy/users/jupiter"}:user"
|
||||
"karaolidis:${hmConfig.sops.placeholder."ntfy/users/karaolidis"}:admin"
|
||||
];
|
||||
|
||||
auth-access = [ "jupiter:grafana:wo" ];
|
||||
|
||||
auth-tokens = [ "jupiter:${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"}" ];
|
||||
|
||||
behind-proxy = true;
|
||||
|
||||
|
@@ -21,8 +21,9 @@ final: prev:
|
||||
gitea = final.docker-image-gitea;
|
||||
gitea-act-runner = final.docker-image-gitea-act-runner;
|
||||
gitea-act-runner-worker = final.docker-image-gitea-act-runner-worker;
|
||||
grafana = final.docker-image-grafana;
|
||||
grafana-image-renderer = final.docker-image-grafana-image-renderer;
|
||||
grafana-to-ntfy = final.docker-image-grafana-to-ntfy;
|
||||
grafana = final.docker-image-grafana;
|
||||
jellyfin = final.docker-image-jellyfin;
|
||||
jellyseerr = final.docker-image-jellyseerr;
|
||||
littlelink-server = final.docker-image-littlelink-server;
|
||||
|
@@ -14,8 +14,9 @@
|
||||
docker-image-gitea = import ./docker/gitea { inherit pkgs; };
|
||||
docker-image-gitea-act-runner = import ./docker/gitea-act-runner { inherit pkgs; };
|
||||
docker-image-gitea-act-runner-worker = import ./docker/gitea-act-runner-worker { inherit pkgs; };
|
||||
docker-image-grafana = import ./docker/grafana { inherit pkgs; };
|
||||
docker-image-grafana-image-renderer = import ./docker/grafana-image-renderer { inherit pkgs; };
|
||||
docker-image-grafana-to-ntfy = import ./docker/grafana-to-ntfy { inherit pkgs; };
|
||||
docker-image-grafana = import ./docker/grafana { inherit pkgs; };
|
||||
docker-image-jellyfin = import ./docker/jellyfin { inherit pkgs; };
|
||||
docker-image-jellyseerr = import ./docker/jellyseerr { inherit pkgs; };
|
||||
docker-image-littlelink-server = import ./docker/littlelink-server { inherit pkgs; };
|
||||
|
@@ -7,7 +7,7 @@ let
|
||||
text = builtins.readFile ./entrypoint.sh;
|
||||
};
|
||||
|
||||
runnerConfig = pkgs.writeTextDir "/etc/gitea-act-runner/config.yaml" (
|
||||
config = pkgs.writeTextDir "/etc/gitea-act-runner/config.yaml" (
|
||||
builtins.readFile (
|
||||
(pkgs.formats.yaml { }).generate "config.yaml" {
|
||||
runner.file = "/var/lib/gitea-act-runner/registration";
|
||||
@@ -27,7 +27,7 @@ pkgs.dockerTools.buildImage {
|
||||
paths = with pkgs; [
|
||||
entrypoint
|
||||
gitea-actions-runner
|
||||
runnerConfig
|
||||
config
|
||||
curl
|
||||
jq
|
||||
];
|
||||
|
37
packages/docker/grafana-to-ntfy/default.nix
Normal file
37
packages/docker/grafana-to-ntfy/default.nix
Normal file
@@ -0,0 +1,37 @@
|
||||
# Docker image for grafana-to-ntfy: the package's /bin plus a generated
# Rocket.toml under /etc/grafana-to-ntfy, layered on pkgs.docker-image-base.
{ pkgs, ... }:
let
  tomlFormat = pkgs.formats.toml { };

  # TOML file with a single [global] table setting address 0.0.0.0 and port 8080.
  rocketToml = tomlFormat.generate "Rocket.toml" {
    global = {
      address = "0.0.0.0";
      port = 8080;
    };
  };

  # Re-emits the generated TOML at /etc/grafana-to-ntfy/Rocket.toml so that
  # buildEnv can link it into the image root.
  rocketConfig = pkgs.writeTextDir "/etc/grafana-to-ntfy/Rocket.toml" (
    builtins.readFile rocketToml
  );

  # Image root filesystem; only /bin and /etc of the listed paths are linked.
  rootEnv = pkgs.buildEnv {
    name = "root";
    paths = [
      pkgs.grafana-to-ntfy
      rocketConfig
    ];
    pathsToLink = [
      "/bin"
      "/etc"
    ];
  };
in
pkgs.dockerTools.buildImage {
  name = "grafana-to-ntfy";
  fromImage = pkgs.docker-image-base;
  copyToRoot = rootEnv;

  config = {
    Entrypoint = [ "grafana-to-ntfy" ];
    # The working directory is the directory that holds Rocket.toml.
    WorkingDir = "/etc/grafana-to-ntfy";
    ExposedPorts."8080/tcp" = { };
  };
}
|
Submodule submodules/secrets updated: 21ab0b0a59...4da39113b1
Reference in New Issue
Block a user