diff --git a/flake.lock b/flake.lock index b4f7f49..0f371d5 100644 --- a/flake.lock +++ b/flake.lock @@ -511,11 +511,11 @@ "secrets": { "flake": false, "locked": { - "lastModified": 1757873556, - "narHash": "sha256-WYrV46if1XsiQKOQEMNtHdAPeFDeu7YBdcoNSXc3sf8=", + "lastModified": 1757935242, + "narHash": "sha256-es2Uy3QsHtQieTpqSGtU7AePvxkJdLuwla9DcRTbQn4=", "ref": "refs/heads/main", - "rev": "21ab0b0a59264b1da501f90725bf2c03e07ae941", - "revCount": 43, + "rev": "4da39113b1b5496b4351a3594c0f6efdbf0a7acf", + "revCount": 44, "type": "git", "url": "ssh://git@karaolidis.com/karaolidis/nix-secrets.git" }, diff --git a/hosts/jupiter/users/storm/configs/console/podman/grafana/alerting/policies.yaml b/hosts/jupiter/users/storm/configs/console/podman/grafana/alerting/policies.yaml new file mode 100644 index 0000000..2e69ed6 --- /dev/null +++ b/hosts/jupiter/users/storm/configs/console/podman/grafana/alerting/policies.yaml @@ -0,0 +1,10 @@ +apiVersion: 1 +policies: + - orgId: 1 + receiver: ntfy.sh + group_by: + - grafana_folder + - alertname + group_wait: 0s + group_interval: 1m + repeat_interval: 1h diff --git a/hosts/jupiter/users/storm/configs/console/podman/grafana/alerting/rules.yaml b/hosts/jupiter/users/storm/configs/console/podman/grafana/alerting/rules.yaml new file mode 100644 index 0000000..12d40a1 --- /dev/null +++ b/hosts/jupiter/users/storm/configs/console/podman/grafana/alerting/rules.yaml @@ -0,0 +1,406 @@ +apiVersion: 1 +groups: + - orgId: 1 + name: Default + folder: System + interval: 10s + rules: + - uid: cpu-usage + title: CPU Usage + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: prometheus + model: + editorMode: code + expr: 1 - avg by(hostname) (rate(node_cpu_seconds_total{mode="idle"}[1h])) + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0.9 + type: gt + 
operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 30m + keepFiringFor: 5m + isPaused: false + notification_settings: + receiver: ntfy.sh + - uid: memory-usage + title: Memory Usage + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: prometheus + model: + editorMode: code + expr: 1 - (node_memory_MemAvailable_bytes{} / node_memory_MemTotal_bytes{}) + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0.9 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + isPaused: false + notification_settings: + receiver: ntfy.sh + - uid: cpu-temperature + title: CPU Temperature + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: prometheus + model: + editorMode: code + expr: node_hwmon_temp_celsius{chip="pci0000:00_0000:00:18_3", sensor="temp1"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 75 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 30m + keepFiringFor: 5m + isPaused: false + 
notification_settings: + receiver: ntfy.sh + - uid: ambient-temperature + title: Ambient Temperature + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: prometheus + model: + editorMode: code + expr: avg(node_hwmon_temp_celsius{chip="thermal_thermal_zone0"}) + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 70 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 15m + keepFiringFor: 5m + isPaused: false + notification_settings: + receiver: ntfy.sh + - uid: smart-status + title: SMART Status + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: prometheus + model: + editorMode: code + expr: smartctl_device_smart_status + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 1 + type: lt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + keepFiringFor: 1h + isPaused: false + notification_settings: + receiver: ntfy.sh + - uid: smart-errors + title: SMART Errors + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: prometheus + model: + editorMode: code + expr: "sum(\n increase(smartctl_device_attribute{attribute_value_type=\"raw\", 
attribute_name=~\"Raw_Read_Error_Rate|Seek_Error_Rate|Offline_Uncorrectable\"}[1h])\n) + \nsum(\n increase(smartctl_device_media_errors[1h])\n)" + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + keepFiringFor: 1h + isPaused: false + notification_settings: + receiver: ntfy.sh + - uid: smart-temperature + title: SMART Temperature + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: prometheus + model: + editorMode: code + expr: smartctl_device_temperature + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 50 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 1m + keepFiringFor: 5m + isPaused: false + notification_settings: + receiver: ntfy.sh + - uid: btrfs-errors + title: BTRFS Errors + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: prometheus + model: + editorMode: code + expr: |- + sum by (btrfs_dev_uuid) ( + increase(node_btrfs_device_errors_total[1h]) + ) + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: 
gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + keepFiringFor: 1h + isPaused: false + notification_settings: + receiver: ntfy.sh diff --git a/hosts/jupiter/users/storm/configs/console/podman/grafana/default.nix b/hosts/jupiter/users/storm/configs/console/podman/grafana/default.nix index 457c613..d767143 100644 --- a/hosts/jupiter/users/storm/configs/console/podman/grafana/default.nix +++ b/hosts/jupiter/users/storm/configs/console/podman/grafana/default.nix @@ -18,6 +18,7 @@ in "grafana/authelia/password".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml"; "grafana/authelia/digest".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml"; "grafana/smtp".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml"; + "ntfy/tokens/jupiter/grafana".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml"; }; templates = { @@ -114,6 +115,37 @@ in }; } ); + + grafana-to-ntfy-env.content = '' + BAUTH_PASS=${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"} + NTFY_BAUTH_PASS=${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"} + ''; + + grafana-contact-points.content = builtins.readFile ( + (pkgs.formats.yaml { }).generate "contact-points.yaml" { + apiVersion = 1; + contactPoints = [ + { + orgId = 1; + name = "ntfy.sh"; + receivers = [ + { + uid = "ntfy"; + type = "webhook"; + settings = { + httpMethod = "POST"; + url = "http://grafana-to-ntfy:8080"; + username = "jupiter"; + password = hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"; + headers = { }; + }; + disableResolveMessage = false; + } + ]; + } + ]; + } + ); }; }; @@ -136,7 +168,7 @@ in providers = [ { name = "Default"; - folder = ""; + folder = "System"; type = "file"; url = "http://prometheus:9090"; options.path = "/var/lib/grafana/dashboards"; @@ -148,6 
+180,9 @@ in "${hmConfig.sops.templates.grafana.path}:/etc/grafana/grafana.ini:ro" "${dashboards}:/etc/grafana/conf/provisioning/dashboards/default.yaml:ro" "${./dashboards}:/var/lib/grafana/dashboards:ro" + "${./alerting/policies.yaml}:/etc/grafana/conf/provisioning/alerting/policies.yaml:ro" + "${./alerting/rules.yaml}:/etc/grafana/conf/provisioning/alerting/rules.yaml:ro" + "${hmConfig.sops.templates.grafana-contact-points.path}:/etc/grafana/conf/provisioning/alerting/contact-points.yaml:ro" ]; labels = [ "traefik.enable=true" @@ -163,6 +198,17 @@ in networks = [ networks.grafana.ref ]; }; + grafana-to-ntfy.containerConfig = { + image = "docker-archive:${pkgs.dockerImages.grafana-to-ntfy}"; + networks = [ networks.grafana.ref ]; + environments = { + "NTFY_URL" = "https://ntfy.karaolidis.com/grafana"; + "NTFY_BAUTH_USER" = "jupiter"; + "BAUTH_USER" = "jupiter"; + }; + environmentFiles = [ hmConfig.sops.templates.grafana-to-ntfy-env.path ]; + }; + authelia.containerConfig.volumes = [ "${hmConfig.sops.templates.authelia-grafana.path}:/etc/authelia/conf.d/grafana.yaml:ro" ]; diff --git a/hosts/jupiter/users/storm/configs/console/podman/ntfy/default.nix b/hosts/jupiter/users/storm/configs/console/podman/ntfy/default.nix index fbf3404..8ae6d54 100644 --- a/hosts/jupiter/users/storm/configs/console/podman/ntfy/default.nix +++ b/hosts/jupiter/users/storm/configs/console/podman/ntfy/default.nix @@ -17,6 +17,8 @@ in "ntfy/webPush/publicKey".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml"; "ntfy/webPush/privateKey".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml"; "ntfy/users/karaolidis".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml"; + "ntfy/users/jupiter".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml"; + "ntfy/tokens/jupiter/grafana".sopsFile = "${inputs.secrets}/hosts/jupiter/secrets.yaml"; }; templates = { @@ -43,7 +45,14 @@ in auth-default-access = "deny-all"; auth-startup-queries = dbStartupQueries; - auth-users = [ 
"karaolidis:${hmConfig.sops.placeholder."ntfy/users/karaolidis"}:admin" ]; + auth-users = [ + "jupiter:${hmConfig.sops.placeholder."ntfy/users/jupiter"}:user" + "karaolidis:${hmConfig.sops.placeholder."ntfy/users/karaolidis"}:admin" + ]; + + auth-access = [ "jupiter:grafana:wo" ]; + + auth-tokens = [ "jupiter:${hmConfig.sops.placeholder."ntfy/tokens/jupiter/grafana"}" ]; behind-proxy = true; diff --git a/overlays/default.nix b/overlays/default.nix index b20b238..bb4a981 100644 --- a/overlays/default.nix +++ b/overlays/default.nix @@ -21,8 +21,9 @@ final: prev: gitea = final.docker-image-gitea; gitea-act-runner = final.docker-image-gitea-act-runner; gitea-act-runner-worker = final.docker-image-gitea-act-runner-worker; - grafana = final.docker-image-grafana; grafana-image-renderer = final.docker-image-grafana-image-renderer; + grafana-to-ntfy = final.docker-image-grafana-to-ntfy; + grafana = final.docker-image-grafana; jellyfin = final.docker-image-jellyfin; jellyseerr = final.docker-image-jellyseerr; littlelink-server = final.docker-image-littlelink-server; diff --git a/packages/default.nix b/packages/default.nix index 04ded9f..6207872 100644 --- a/packages/default.nix +++ b/packages/default.nix @@ -14,8 +14,9 @@ docker-image-gitea = import ./docker/gitea { inherit pkgs; }; docker-image-gitea-act-runner = import ./docker/gitea-act-runner { inherit pkgs; }; docker-image-gitea-act-runner-worker = import ./docker/gitea-act-runner-worker { inherit pkgs; }; - docker-image-grafana = import ./docker/grafana { inherit pkgs; }; docker-image-grafana-image-renderer = import ./docker/grafana-image-renderer { inherit pkgs; }; + docker-image-grafana-to-ntfy = import ./docker/grafana-to-ntfy { inherit pkgs; }; + docker-image-grafana = import ./docker/grafana { inherit pkgs; }; docker-image-jellyfin = import ./docker/jellyfin { inherit pkgs; }; docker-image-jellyseerr = import ./docker/jellyseerr { inherit pkgs; }; docker-image-littlelink-server = import ./docker/littlelink-server 
{ inherit pkgs; }; diff --git a/packages/docker/gitea-act-runner/default.nix b/packages/docker/gitea-act-runner/default.nix index 1376728..caf69e6 100644 --- a/packages/docker/gitea-act-runner/default.nix +++ b/packages/docker/gitea-act-runner/default.nix @@ -7,7 +7,7 @@ let text = builtins.readFile ./entrypoint.sh; }; - runnerConfig = pkgs.writeTextDir "/etc/gitea-act-runner/config.yaml" ( + config = pkgs.writeTextDir "/etc/gitea-act-runner/config.yaml" ( builtins.readFile ( (pkgs.formats.yaml { }).generate "config.yaml" { runner.file = "/var/lib/gitea-act-runner/registration"; @@ -27,7 +27,7 @@ pkgs.dockerTools.buildImage { paths = with pkgs; [ entrypoint gitea-actions-runner - runnerConfig + config curl jq ]; diff --git a/packages/docker/grafana-to-ntfy/default.nix b/packages/docker/grafana-to-ntfy/default.nix new file mode 100644 index 0000000..74c5b41 --- /dev/null +++ b/packages/docker/grafana-to-ntfy/default.nix @@ -0,0 +1,37 @@ +{ pkgs, ... }: +let + config = pkgs.writeTextDir "/etc/grafana-to-ntfy/Rocket.toml" ( + builtins.readFile ( + (pkgs.formats.toml { }).generate "Rocket.toml" { + global = { + port = 8080; + address = "0.0.0.0"; + }; + } + ) + ); +in +pkgs.dockerTools.buildImage { + name = "grafana-to-ntfy"; + fromImage = pkgs.docker-image-base; + + copyToRoot = pkgs.buildEnv { + name = "root"; + paths = with pkgs; [ + grafana-to-ntfy + config + ]; + pathsToLink = [ + "/bin" + "/etc" + ]; + }; + + config = { + Entrypoint = [ "grafana-to-ntfy" ]; + WorkingDir = "/etc/grafana-to-ntfy"; + ExposedPorts = { + "8080/tcp" = { }; + }; + }; +} diff --git a/submodules/secrets b/submodules/secrets index 21ab0b0..4da3911 160000 --- a/submodules/secrets +++ b/submodules/secrets @@ -1 +1 @@ -Subproject commit 21ab0b0a59264b1da501f90725bf2c03e07ae941 +Subproject commit 4da39113b1b5496b4351a3594c0f6efdbf0a7acf