# roles/prometheus/handlers/main.yml
---
# Triggered by the configuration tasks; uses the systemd module for
# consistency with tasks/main.yml (which also manages the unit via systemd).
- name: Restart Prometheus
  systemd:
    name: prometheus
    state: restarted
# roles/prometheus/tasks/main.yml
---
# Retry the install: apt may transiently fail (lock held, mirror hiccup).
- name: Install Prometheus
  apt:
    update_cache: true
    name: prometheus
  register: apt_result
  retries: 3
  until: apt_result is succeeded

# File modes are quoted: an unquoted 0644 is parsed by YAML as the
# integer 420 (octal), which Ansible then misinterprets.
- name: Configure Prometheus
  template:
    src: prometheus/prometheus.yml.j2
    dest: /etc/prometheus/prometheus.yml
    mode: "0644"
  notify: Restart Prometheus

- name: Configure Prometheus alert rules
  template:
    src: prometheus/alert.rules.yml.j2
    dest: /etc/prometheus/alert.rules.yml
    mode: "0644"
  notify: Restart Prometheus

# We don't need to restart Prometheus when updating nodes
# (Prometheus re-reads file_sd target files on its own).
- name: Configure Prometheus targets
  copy:
    content: "{{ [{'targets': item.value.targets}] | to_nice_json }}\n"
    dest: "/etc/prometheus/{{ item.value.file }}"
    mode: "0644"
  loop: "{{ prometheus | dict2items }}"

- name: Activate prometheus service
  systemd:
    name: prometheus
    enabled: true
    state: started

- name: Indicate role in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-prometheus
    mode: "0755"
{{ ansible_header | comment }}
{# This file is itself a Jinja2 template, so the Prometheus "{{ ... }}" syntax below would conflict without a raw block #}
{# Depending on the Prometheus Node exporter version, these rule expressions may need to change #}
{% raw %}
groups:
  - name: alert.rules
    rules:

      # Alert for any instance that is unreachable for >3 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !"

      # Alert for out of memory
      # Do not take into account memory not used by apps
      - alert: OutOfMemory
        expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."

      # Alert for out of disk space
      - alert: OutOfDiskSpace
        expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."

      # Alert for out of inode space on disk
      - alert: OutOfInodes
        expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."

      # Alert for high CPU usage
      - alert: CpuBusy
        expr: node_load5 > 9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Charge sur {{ $labels.instance }} à {{ $value }}."

      # Check mdadm software RAID
      - alert: SoftwareRAIDDegraded
        expr: node_md_disks-node_md_disks_active > 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."

      # Check systemd unit (> buster)
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"

      # Check UPS
      - alert: UpsOutputSourceChanged
        expr: upsOutputSource != 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsBatteryStatusChanged
        expr: upsBatteryStatus != 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'état de la batterie de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsTemperatureWarning
        expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsTemperatureCritical
        expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsHighHumidity
        expr: xupsEnvRemoteHumidity > 65
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsVeryHighHumidity
        expr: xupsEnvRemoteHumidity > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsHighLoad
        expr: upsOutputPercentLoad > 70
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsWrongInputVoltage
        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsWrongOutputVoltage
        expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: AptAutoremovePending
        expr: apt_autoremove_pending > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."

      - alert: MailqNotEmpty
        expr: postfix_mailq_length > 25
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."

      - alert: NoRadiusLogin
        expr: rate(radiusd_access_ok[3m]) == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Personne ne vient taper le RADIUS."

      - alert: TooManyReallocatedSectors
        expr: smartmon_reallocated_sector_ct_raw_value > 1e3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."

{% endraw %}
{{ ansible_header | comment }}

global:
  # scrape_interval is set to the global default (60s)
  # evaluation_interval is set to the global default (60s)
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'example'

# Alertmanager configuration
# Use prometheus alertmanager installed on the same machine
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alert.rules.yml"  # Monitoring alerts, this is the file you may be searching!

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
{{
  {
    "scrape_configs":
      [
        {
          "job_name": "prometheus",
          "static_configs": [
            {
              "targets": [
                "localhost:9090"
              ]
            }
          ]
        }
      ] + (prometheus | json_query("*.config[0]"))
  } | to_nice_yaml(indent=2)
}}
#!/usr/bin/tail +14
{{ ansible_header | comment }}
[0m> [38;5;82mprometheus[0m a été déployé sur cette machine. Voir [38;5;6m/etc/prometheus/[0m.
Reference in New Issue
Block a user