# Monitoring stack, part 1: inventory, variables, playbook, Grafana and
# NinjaBot roles.
#
# Review notes (this text precedes the first diff header and belongs to no hunk):
#   * Reformatted to one diff line per text line; indentation was rebuilt
#     from a whitespace-collapsed copy -- TODO confirm context-line indentation
#     against the target tree before applying.
#   * "index" blob hashes are those of the original patch.
#   * File modes are quoted so YAML keeps them as octal strings.
#   * The Grafana signing key is fetched with certificate validation enabled.
#   * The node scrape job relabels via "instance" (like the nginx job) instead
#     of "__param_target", which would add a "target" URL parameter to scrapes.
#   * ldap.toml.j2 derives search_base_dns from grafana.ldap_user_tree.
diff --git a/group_vars/all/prometheus_node_exporter.yaml b/group_vars/all/prometheus_node_exporter.yaml
new file mode 100644
index 0000000..39212fa
--- /dev/null
+++ b/group_vars/all/prometheus_node_exporter.yaml
@@ -0,0 +1,3 @@
+---
+glob_prometheus_node_exporter:
+  listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"
diff --git a/group_vars/grafana.yml b/group_vars/grafana.yml
new file mode 100644
index 0000000..ab15f6a
--- /dev/null
+++ b/group_vars/grafana.yml
@@ -0,0 +1,7 @@
+---
+glob_grafana:
+  root_url: https://grafana.ynerant.fr
+  icon: crans_icon_white.svg
+  ldap_base: "{{ glob_ldap.base }}"
+  ldap_master_ipv4: "{{ glob_ldap.servers[0] }}"
+  ldap_user_tree: "ou=passwd,{{ glob_ldap.base }}"
diff --git a/group_vars/nginx.yml b/group_vars/nginx.yml
index b883780..a038952 100644
--- a/group_vars/nginx.yml
+++ b/group_vars/nginx.yml
@@ -30,3 +30,6 @@ glob_nginx:
     - "172.16.0.0/16"
     - "fd00:0:0:42::/64"
   deploy_robots_file: false
+
+glob_prometheus_nginx_exporter:
+  listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"
diff --git a/group_vars/prometheus.yml b/group_vars/prometheus.yml
new file mode 100644
index 0000000..ac686ca
--- /dev/null
+++ b/group_vars/prometheus.yml
@@ -0,0 +1,9 @@
+---
+glob_prometheus: {}
+
+glob_ninjabot:
+  config:
+    nick: templier
+    server: irc.crans.org
+    port: 6667
+    channel: "#/dev/null"
diff --git a/host_vars/monitoring.adm.ynerant.fr.yml b/host_vars/monitoring.adm.ynerant.fr.yml
index 2eb6f99..e393cdd 100644
--- a/host_vars/monitoring.adm.ynerant.fr.yml
+++ b/host_vars/monitoring.adm.ynerant.fr.yml
@@ -2,3 +2,68 @@
 interfaces:
   adm: eth0
   srv_nat: eth1
+
+loc_prometheus:
+  node:
+    file: targets_node.json
+    targets: "{{ groups['server'] | select('match', '^.*\\.adm\\.ynerant\\.fr$') | list | sort }}"
+    config:
+      - job_name: servers
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_node.json'
+        relabel_configs:
+          # Keep the bare host name as "instance", then scrape it on port 9100.
+          - source_labels: [__address__]
+            target_label: instance
+          - source_labels: [instance]
+            target_label: __address__
+            replacement: '$1:9100'
+
+  nginx:
+    file: targets_nginx.json
+    targets:
+      - proxy.adm.ynerant.fr
+    config:
+      - job_name: nginx
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_nginx.json'
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: instance
+          - source_labels: [instance]
+            target_label: __address__
+            replacement: '$1:9117'
+
+  blackbox:
+    file: targets_blackbox.json
+    targets:
+      - https://ynerant.fr/
+      - https://bibliogram.ynerant.fr/
+      - https://element.ynerant.fr/
+      - https://gitea.ynerant.fr/
+      - https://grafana.ynerant.fr/
+      - https://hydrogen.ynerant.fr/
+      - https://nextcloud.ynerant.fr/
+      - https://mailu.ynerant.fr/
+      - http://notls.ynerant.fr/
+      - https://reddit.ynerant.fr/
+      - https://thelounge.ynerant.fr/
+      - https://translate.ynerant.fr/
+      - https://kfet.saperlistpopette.fr/
+    config:
+      - job_name: blackbox
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_blackbox.json'
+        metrics_path: /probe
+        params:
+          module: [http_2xx]  # Look for a HTTP 200 response.
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: 127.0.0.1:9115
diff --git a/hosts b/hosts
index 802cb3d..c6060b6 100644
--- a/hosts
+++ b/hosts
@@ -10,12 +10,18 @@ babel4.adm.ynerant.fr
 babel5.adm.ynerant.fr
 babel6.adm.ynerant.fr
 
+[blackbox]
+monitoring.adm.ynerant.fr
+
 [certbot]
 proxy.adm.ynerant.fr
 
 [debian:children]
 server
 
+[grafana]
+monitoring.adm.ynerant.fr
+
 [nginx:children]
 reverseproxy
 
@@ -30,6 +36,9 @@ localhost
 [postfix]
 mailu.adm.ynerant.fr
 
+[prometheus]
+monitoring.adm.ynerant.fr
+
 [reverseproxy]
 proxy.adm.ynerant.fr
 
diff --git a/plays/monitoring.yml b/plays/monitoring.yml
new file mode 100755
index 0000000..18b08fd
--- /dev/null
+++ b/plays/monitoring.yml
@@ -0,0 +1,38 @@
+#!/usr/bin/env ansible-playbook
+---
+
+# Deploy Prometheus on monitoring server
+- hosts: prometheus
+  vars:
+    prometheus: "{{ glob_prometheus | default({}) | combine(loc_prometheus | default({})) }}"
+    alertmanager: "{{ glob_alertmanager | default({}) | combine(loc_alertmanager | default({})) }}"
+    ninjabot: "{{ glob_ninjabot | default({}) | combine(loc_ninjabot | default({})) }}"
+  roles:
+    - prometheus
+    - prometheus-alertmanager
+    - ninjabot
+
+# Deploy Grafana on monitoring server
+- hosts: grafana
+  vars:
+    grafana: "{{ glob_grafana | default({}) | combine(loc_grafana | default({})) }}"
+  roles:
+    - grafana
+
+- hosts: blackbox
+  roles:
+    - prometheus-blackbox-exporter
+
+# Monitor all hosts
+- hosts: server
+  vars:
+    prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
+  roles:
+    - prometheus-node-exporter
+
+# Export nginx metrics
+- hosts: nginx
+  vars:
+    prometheus_nginx_exporter: "{{ glob_prometheus_nginx_exporter | default({}) | combine(loc_prometheus_nginx_exporter | default({})) }}"
+  roles:
+    - prometheus-nginx-exporter
diff --git a/roles/grafana/handlers/main.yml b/roles/grafana/handlers/main.yml
new file mode 100644
index 0000000..cbd4ffd
--- /dev/null
+++ b/roles/grafana/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart grafana
+  service:
+    name: grafana-server
+    state: restarted
diff --git a/roles/grafana/tasks/main.yml b/roles/grafana/tasks/main.yml
new file mode 100644
index 0000000..eb3a091
--- /dev/null
+++ b/roles/grafana/tasks/main.yml
@@ -0,0 +1,100 @@
+---
+- name: Install GPG
+  apt:
+    name: gnupg
+    state: present
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Import Grafana GPG signing key
+  apt_key:
+    url: https://packages.grafana.com/gpg.key
+    state: present
+    validate_certs: true  # a signing key must not be fetched over unverified TLS
+  register: apt_key_result
+  retries: 3
+  until: apt_key_result is succeeded
+
+- name: Add Grafana repository
+  apt_repository:
+    repo: deb http://mirror.adm.ynerant.fr/grafana/oss/deb stable main
+    state: present
+    update_cache: true
+
+- name: Install Grafana
+  apt:
+    name: grafana
+    state: present
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Configure Grafana
+  ini_file:
+    path: /etc/grafana/grafana.ini
+    section: "{{ item.section }}"
+    option: "{{ item.option }}"
+    value: "{{ item.value }}"
+    mode: "0640"
+  loop:
+    - section: server
+      option: root_url
+      value: "{{ grafana.root_url }}"
+    - section: analytics
+      option: reporting_enabled
+      value: "false"
+    - section: analytics
+      option: check_for_updates
+      value: "false"
+    - section: security
+      option: disable_initial_admin_creation
+      value: "true"
+    - section: security
+      option: cookie_secure
+      value: "true"
+    - section: snapshots
+      option: external_enabled
+      value: "false"
+    - section: users
+      option: allow_sign_up
+      value: "false"
+    - section: users
+      option: allow_org_create
+      value: "false"
+    - section: auth.anonymous
+      option: enabled
+      value: "true"
+    - section: auth.anonymous
+      option: hide_version
+      value: "true"
+    - section: auth.basic  # Only LDAP auth
+      option: enabled
+      value: "false"
+    - section: auth.ldap
+      option: enabled
+      value: "true"
+    - section: alerting
+      option: enabled
+      value: "false"
+  notify: Restart grafana
+
+- name: Configure Grafana LDAP
+  template:
+    src: ldap.toml.j2
+    dest: /etc/grafana/ldap.toml
+    mode: "0640"
+  notify: Restart grafana
+
+- name: Enable and start Grafana
+  systemd:
+    name: grafana-server
+    enabled: true
+    state: started
+    daemon_reload: true
+
+- name: Indicate role in motd
+  template:
+    src: update-motd.d/05-service.j2
+    dest: /etc/update-motd.d/05-grafana
+    mode: "0755"
diff --git a/roles/grafana/templates/ldap.toml.j2 b/roles/grafana/templates/ldap.toml.j2
new file mode 100644
index 0000000..5d3e9aa
--- /dev/null
+++ b/roles/grafana/templates/ldap.toml.j2
@@ -0,0 +1,47 @@
+{{ ansible_header | comment }}
+# To troubleshoot and get more log info enable ldap debug logging in grafana.ini
+# [log]
+# filters = ldap:debug
+
+[[servers]]
+# Ldap server host (specify multiple hosts space separated)
+host = "{{ grafana.ldap_master_ipv4 }}"
+# Default port is 389 or 636 if use_ssl = true
+port = 636
+# Set to true if ldap server supports TLS
+use_ssl = true
+# Set to true if connect ldap server with STARTTLS pattern (create connection in insecure, then upgrade to secure connection with TLS)
+start_tls = false
+# set to true if you want to skip ssl cert validation
+ssl_skip_verify = true
+# set to the path to your root CA certificate or leave unset to use system defaults
+# root_ca_cert = "/path/to/certificate.crt"
+# Authentication against LDAP servers requiring client certificates
+# client_cert = "/path/to/client.crt"
+# client_key = "/path/to/client.key"
+
+# Use direct bind
+bind_dn = "uid=%s,{{ grafana.ldap_user_tree }}"
+
+# Not needed for a direct bind,
+# but LDAP auth hangs without it
+search_filter = "(uid=%s)"
+search_base_dns = ["{{ grafana.ldap_user_tree }}"]
+
+## For Posix or LDAP setups that does not support member_of attribute you can define the below settings
+## Please check grafana LDAP docs for examples
+group_search_filter = "(&(objectClass=posixGroup)(memberUid=%s))"
+group_search_base_dns = ["ou=group,{{ grafana.ldap_base }}"]
+group_search_filter_user_attribute = "cn"
+
+# Specify names of the ldap attributes your ldap uses
+[servers.attributes]
+name = "givenName"
+surname = "sn"
+username = "uid"
+email = "mail"
+
+# All LDAP members are mapped to the Admin role
+[[servers.group_mappings]]
+group_dn = "*"
+org_role = "Admin"
diff --git a/roles/grafana/templates/update-motd.d/05-service.j2 b/roles/grafana/templates/update-motd.d/05-service.j2
new file mode 100755
index 0000000..d38cb7f
--- /dev/null
+++ b/roles/grafana/templates/update-motd.d/05-service.j2
@@ -0,0 +1,3 @@
+#!/usr/bin/tail +14
+{{ ansible_header | comment }}
+> grafana a été déployé sur cette machine. Voir /etc/grafana/.
diff --git a/roles/ninjabot/tasks/main.yml b/roles/ninjabot/tasks/main.yml
new file mode 100644
index 0000000..83c0870
--- /dev/null
+++ b/roles/ninjabot/tasks/main.yml
@@ -0,0 +1,48 @@
+---
+- name: Install python3 IRC library
+  apt:
+    name: python3-irc
+    state: present
+    update_cache: true
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Install Flask for python3
+  apt:
+    name: python3-flask
+    state: present
+    update_cache: true
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Clone NinjaBot code
+  git:
+    repo: https://gitlab.crans.org/nounous/NinjaBot.git
+    dest: /var/local/ninjabot
+    version: master
+
+- name: Deploy NinjaBot configuration
+  template:
+    src: ninjabot/ninjabot.json.j2
+    dest: /var/local/ninjabot/ninjabot.json
+
+- name: Deploy NinjaBot systemd unit
+  template:
+    src: systemd/system/ninjabot.service.j2
+    dest: /etc/systemd/system/ninjabot.service
+    mode: "0644"
+
+- name: Load and activate NinjaBot service
+  systemd:
+    name: ninjabot
+    daemon_reload: true
+    enabled: true
+    state: started
+
+- name: Indicate NinjaBot in motd
+  template:
+    src: update-motd.d/05-service.j2
+    dest: /etc/update-motd.d/05-ninjabot
+    mode: "0755"
diff --git a/roles/ninjabot/templates/ninjabot/ninjabot.json.j2 b/roles/ninjabot/templates/ninjabot/ninjabot.json.j2
new file mode 100644
index 0000000..d0296ae
--- /dev/null
+++ b/roles/ninjabot/templates/ninjabot/ninjabot.json.j2
@@ -0,0 +1 @@
+{{ ninjabot.config | to_nice_json(indent=2) }}
diff --git a/roles/ninjabot/templates/systemd/system/ninjabot.service.j2 b/roles/ninjabot/templates/systemd/system/ninjabot.service.j2
new file mode 100644
index 0000000..526a296
--- /dev/null
+++ b/roles/ninjabot/templates/systemd/system/ninjabot.service.j2
@@ -0,0 +1,15 @@
+{{ ansible_header | comment }}
+[Unit]
+Description=NinjaBot IRC bot
+After=network.target
+
+[Service]
+Type=simple
+WorkingDirectory=/var/local/ninjabot
+User=nobody
+Group=nogroup
+ExecStart=/usr/bin/python3 /var/local/ninjabot/ninjabot.py
+Restart=always
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/ninjabot/templates/update-motd.d/05-service.j2 b/roles/ninjabot/templates/update-motd.d/05-service.j2
new file mode 100755
index 0000000..0ac4bcd
--- /dev/null
+++ b/roles/ninjabot/templates/update-motd.d/05-service.j2
@@ -0,0 +1,3 @@
+#!/usr/bin/tail +14
+{{ ansible_header | comment }}
+> NinjaBot a été déployé sur cette machine. Voir /var/local/ninjabot/.
# Monitoring stack, part 2: Alertmanager, exporters and Prometheus roles.
#
# Review notes (this text sits between file diffs and belongs to no hunk):
#   * Reformatted to one diff line per text line; indentation was rebuilt
#     from a whitespace-collapsed copy. "index" blob hashes are those of the
#     original patch.
#   * prometheus-node-exporter: the ARGS= task now notifies the role's restart
#     handler, so a changed listen address is actually applied.
#   * prometheus-node-exporter: both release checks use
#     ansible_distribution_release (ansible_lsb needs lsb_release installed).
#   * File modes are quoted so YAML keeps them as octal strings.
diff --git a/roles/prometheus-alertmanager/handlers/main.yml b/roles/prometheus-alertmanager/handlers/main.yml
new file mode 100644
index 0000000..3ddbf93
--- /dev/null
+++ b/roles/prometheus-alertmanager/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart Prometheus Alertmanager
+  service:
+    name: prometheus-alertmanager
+    state: restarted
diff --git a/roles/prometheus-alertmanager/tasks/main.yml b/roles/prometheus-alertmanager/tasks/main.yml
new file mode 100644
index 0000000..b65a295
--- /dev/null
+++ b/roles/prometheus-alertmanager/tasks/main.yml
@@ -0,0 +1,14 @@
+---
+- name: Install Prometheus Alertmanager
+  apt:
+    update_cache: true
+    name: prometheus-alertmanager
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Configure Prometheus Alertmanager
+  template:
+    src: prometheus/alertmanager.yml.j2
+    dest: /etc/prometheus/alertmanager.yml
+  notify: Restart Prometheus Alertmanager
diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
new file mode 100644
index 0000000..4c10974
--- /dev/null
+++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
@@ -0,0 +1,60 @@
+{{ ansible_header | comment }}
+# See https://prometheus.io/docs/alerting/configuration/ for documentation.
+
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'localhost:25'
+  smtp_from: 'alertmanager@example.org'
+  #smtp_auth_username: 'alertmanager'
+  #smtp_auth_password: 'password'
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['instance'] # group per instance
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This way ensures that you get multiple alerts for the same group that start
+  # firing shortly after another are batched together on the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to
+  # resend them.
+  repeat_interval: 24h
+
+  # A default receiver
+  receiver: webhook-ninjabot
+
+
+# Inhibition rules allow to mute a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition if the alertname is the same.
+  equal: ['alertname', 'cluster', 'service']
+
+
+receivers:
+- name: 'webhook-ninjabot'
+  webhook_configs:
+  - url: 'http://localhost:5000/'
+    send_resolved: true
+  - url: 'http://localhost:8000/'
+    send_resolved: true
diff --git a/roles/prometheus-blackbox-exporter/handlers/main.yml b/roles/prometheus-blackbox-exporter/handlers/main.yml
new file mode 100644
index 0000000..72a6fc5
--- /dev/null
+++ b/roles/prometheus-blackbox-exporter/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart prometheus-blackbox-exporter
+  service:
+    name: prometheus-blackbox-exporter
+    state: restarted
diff --git a/roles/prometheus-blackbox-exporter/tasks/main.yml b/roles/prometheus-blackbox-exporter/tasks/main.yml
new file mode 100644
index 0000000..a571e90
--- /dev/null
+++ b/roles/prometheus-blackbox-exporter/tasks/main.yml
@@ -0,0 +1,23 @@
+---
+- name: Install Prometheus Blackbox exporter
+  apt:
+    update_cache: true
+    name: prometheus-blackbox-exporter
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Make Prometheus Blackbox exporter listen on localhost only
+  lineinfile:
+    path: /etc/default/prometheus-blackbox-exporter
+    regexp: '^ARGS='
+    line: >
+      ARGS='--config.file /etc/prometheus/blackbox.yml
+      --web.listen-address="localhost:9115"'
+  notify: Restart prometheus-blackbox-exporter
+
+- name: Activate prometheus Blackbox exporter service
+  systemd:
+    name: prometheus-blackbox-exporter
+    enabled: true
+    state: started
diff --git a/roles/prometheus-nginx-exporter/handlers/main.yml b/roles/prometheus-nginx-exporter/handlers/main.yml
new file mode 100644
index 0000000..23b542b
--- /dev/null
+++ b/roles/prometheus-nginx-exporter/handlers/main.yml
@@ -0,0 +1,10 @@
+---
+- name: Restart nginx
+  service:
+    name: nginx
+    state: restarted
+
+- name: Restart prometheus-nginx-exporter
+  service:
+    name: prometheus-nginx-exporter
+    state: restarted
diff --git a/roles/prometheus-nginx-exporter/tasks/main.yml b/roles/prometheus-nginx-exporter/tasks/main.yml
new file mode 100644
index 0000000..0fb8001
--- /dev/null
+++ b/roles/prometheus-nginx-exporter/tasks/main.yml
@@ -0,0 +1,33 @@
+---
+- name: Install prometheus-nginx-exporter
+  apt:
+    update_cache: true
+    name:
+      - nginx  # Nginx may be not already installed
+      - prometheus-nginx-exporter
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Make prometheus-nginx-exporter listen on adm only
+  lineinfile:
+    path: /etc/default/prometheus-nginx-exporter
+    regexp: '^ARGS='
+    line: |
+      ARGS="-web.listen-address={{ prometheus_nginx_exporter.listen_addr }}:9117 -nginx.scrape-uri=http://[::1]:6424/stub_status"
+  notify:
+    - Restart nginx
+    - Restart prometheus-nginx-exporter
+
+- name: Configure nginx
+  template:
+    src: nginx/status.j2
+    dest: /etc/nginx/sites-available/status
+  notify: Restart nginx
+
+- name: Activate nginx site
+  file:
+    src: /etc/nginx/sites-available/status
+    dest: /etc/nginx/sites-enabled/status
+    state: link
+  notify: Restart nginx
diff --git a/roles/prometheus-nginx-exporter/templates/nginx/status.j2 b/roles/prometheus-nginx-exporter/templates/nginx/status.j2
new file mode 100644
index 0000000..b9ed48c
--- /dev/null
+++ b/roles/prometheus-nginx-exporter/templates/nginx/status.j2
@@ -0,0 +1,8 @@
+{{ ansible_header | comment }}
+
+server {
+    listen [::1]:6424;
+    location = /stub_status {
+        stub_status;
+    }
+}
diff --git a/roles/prometheus-node-exporter/files/apt.sh b/roles/prometheus-node-exporter/files/apt.sh
new file mode 100755
index 0000000..7333a67
--- /dev/null
+++ b/roles/prometheus-node-exporter/files/apt.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Description: Expose metrics from apt updates.
+#
+# Author: Ben Kochie
+
+upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \
+  | /usr/bin/awk -F'[()]' \
+      '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2);
+                 sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
+  | /usr/bin/sort \
+  | /usr/bin/uniq -c \
+  | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/\"/, "\\\"", $2);
+           gsub(/\[/, "", $3); gsub(/\]/, "", $3);
+           print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}'
+)"
+
+autoremove="$(/usr/bin/apt-get --just-print autoremove \
+  | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}'
+)"
+
+orphans="$(comm -23 \
+  <(dpkg-query -W -f '${db:Status-Abbrev}\t${Package}\n' \
+    | grep '^.[^nc]' | cut -f2 | sort) \
+  <(apt-cache dumpavail | sed -rn 's/^Package: (.*)/\1/p' | sort -u) \
+  | awk 'END{printf "apt_orphans %d", NR}'
+)"
+
+echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
+echo '# TYPE apt_upgrades_pending gauge'
+if [[ -n "${upgrades}" ]] ; then
+  echo "${upgrades}"
+else
+  echo 'apt_upgrades_pending{origin="",arch=""} 0'
+fi
+
+echo '# HELP apt_autoremove_pending Apt package pending autoremove.'
+echo '# TYPE apt_autoremove_pending gauge'
+echo "${autoremove}"
+
+echo '# HELP apt_orphans Orphan apt package.'
+echo '# TYPE apt_orphans gauge'
+echo "${orphans}"
+
+echo '# HELP node_reboot_required Node reboot is required for software updates.'
+echo '# TYPE node_reboot_required gauge'
+if [[ -f '/run/reboot-required' ]] ; then
+  echo 'node_reboot_required 1'
+else
+  echo 'node_reboot_required 0'
+fi
diff --git a/roles/prometheus-node-exporter/handlers/main.yml b/roles/prometheus-node-exporter/handlers/main.yml
new file mode 100644
index 0000000..b4b64a4
--- /dev/null
+++ b/roles/prometheus-node-exporter/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart prometheus-node-exporter
+  service:
+    name: prometheus-node-exporter
+    state: restarted
diff --git a/roles/prometheus-node-exporter/tasks/main.yml b/roles/prometheus-node-exporter/tasks/main.yml
new file mode 100644
index 0000000..bdb43fc
--- /dev/null
+++ b/roles/prometheus-node-exporter/tasks/main.yml
@@ -0,0 +1,46 @@
+---
+- name: Install Prometheus node-exporter
+  apt:
+    update_cache: true
+    name: prometheus-node-exporter
+    install_recommends: false  # Do not install smartmontools
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Install Prometheus node-exporter-collectors (bullseye)
+  apt:
+    update_cache: true
+    name: prometheus-node-exporter-collectors
+    install_recommends: false
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+  when:
+    - ansible_distribution_release == 'bullseye'
+
+- name: Make Prometheus node-exporter listen on adm only
+  lineinfile:
+    path: /etc/default/prometheus-node-exporter
+    regexp: '^ARGS='
+    line: |
+      ARGS="--web.listen-address={{ prometheus_node_exporter.listen_addr }}:9100"
+  notify: Restart prometheus-node-exporter
+  tags: restart-node-exporter
+
+- name: Activate prometheus-node-exporter service
+  systemd:
+    name: prometheus-node-exporter
+    enabled: true
+    state: started
+
+# Install new APT textfile collector, it might be upstreamed one day
+# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/pull/35
+- name: Patch APT textfile collector
+  copy:
+    src: apt.sh
+    dest: /usr/share/prometheus-node-exporter/apt.sh
+    owner: root
+    group: root
+    mode: "0755"
+  when: ansible_distribution_release != "bullseye"
diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
new file mode 100644
index 0000000..4214def
--- /dev/null
+++ b/roles/prometheus/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart Prometheus
+  service:
+    name: prometheus
+    state: restarted
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
new file mode 100644
index 0000000..2a9f54f
--- /dev/null
+++ b/roles/prometheus/tasks/main.yml
@@ -0,0 +1,42 @@
+---
+- name: Install Prometheus
+  apt:
+    update_cache: true
+    name: prometheus
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Configure Prometheus
+  template:
+    src: prometheus/prometheus.yml.j2
+    dest: /etc/prometheus/prometheus.yml
+    mode: "0644"
+  notify: Restart Prometheus
+
+- name: Configure Prometheus alert rules
+  template:
+    src: prometheus/alert.rules.yml.j2
+    dest: /etc/prometheus/alert.rules.yml
+    mode: "0644"
+  notify: Restart Prometheus
+
+# We don't need to restart Prometheus when updating nodes
+- name: Configure Prometheus targets
+  copy:
+    content: "{{ [{'targets': item.value.targets}] | to_nice_json }}\n"
+    dest: "/etc/prometheus/{{ item.value.file }}"
+    mode: "0644"
+  loop: "{{ prometheus | dict2items }}"
+
+- name: Activate prometheus service
+  systemd:
+    name: prometheus
+    enabled: true
+    state: started
+
+- name: Indicate role in motd
+  template:
+    src: update-motd.d/05-service.j2
+    dest: /etc/update-motd.d/05-prometheus
+    mode: "0755"
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
new file mode 100644
index 0000000..afc37b6
--- /dev/null
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -0,0 +1,187 @@
+{{ ansible_header | comment }}
+{# As this is also Jinja2 it will conflict without a raw block #}
+{# Depending of Prometheus Node exporter version, rules can change depending of version #}
+{% raw %}
+groups:
+- name: alert.rules
+  rules:
+
+  # Alert for any instance that is unreachable for >3 minutes.
+  - alert: InstanceDown
+    expr: up == 0
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !"
+
+  # Alert for out of memory
+  # Do not take into account memory not used by apps
+  - alert: OutOfMemory
+    expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."
+
+  # Alert for out of disk space
+  - alert: OutOfDiskSpace
+    expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."
+
+  # Alert for out of inode space on disk
+  - alert: OutOfInodes
+    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."
+
+  # Alert for high CPU usage
+  - alert: CpuBusy
+    expr: node_load5 > 9
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Charge sur {{ $labels.instance }} à {{ $value }}."
+
+  # Check mdadm software RAID
+  - alert: SoftwareRAIDDegraded
+    expr: node_md_disks-node_md_disks_active > 0
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."
+
+  # Check systemd unit (> buster)
+  - alert: SystemdServiceFailed
+    expr: node_systemd_unit_state{state="failed"} == 1
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
+
+  # Check UPS
+  - alert: UpsOutputSourceChanged
+    expr: upsOutputSource != 3
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
+      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+
+  - alert: UpsBatteryStatusChanged
+    expr: upsBatteryStatus != 2
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "L'état de la batterie de {{ $labels.instance }} a changé !"
+      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+
+  - alert: UpsTemperatureWarning
+    expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C."
+      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+
+  - alert: UpsTemperatureCritical
+    expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !"
+      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+
+  - alert: UpsHighHumidity
+    expr: xupsEnvRemoteHumidity > 65
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%."
+      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+
+  - alert: UpsVeryHighHumidity
+    expr: xupsEnvRemoteHumidity > 85
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !"
+      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+
+  - alert: UpsHighLoad
+    expr: upsOutputPercentLoad > 70
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
+      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+
+  - alert: UpsWrongInputVoltage
+    expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
+      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+
+  - alert: UpsWrongOutputVoltage
+    expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
+      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+
+  - alert: AptAutoremovePending
+    expr: apt_autoremove_pending > 0
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."
+
+  - alert: MailqNotEmpty
+    expr: postfix_mailq_length > 25
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."
+
+  - alert: NoRadiusLogin
+    expr: rate(radiusd_access_ok[3m]) == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Personne ne vient taper le RADIUS."
+
+  - alert: TooManyReallocatedSectors
+    expr: smartmon_reallocated_sector_ct_raw_value > 1e3
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."
+
+{% endraw %}
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2
new file mode 100644
index 0000000..daa136c
--- /dev/null
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -0,0 +1,42 @@
+{{ ansible_header | comment }}
+
+global:
+  # scrape_interval is set to the global default (60s)
+  # evaluation_interval is set to the global default (60s)
+  # scrape_timeout is set to the global default (10s).
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: 'example'
+
+# Alertmanager configuration
+# Use prometheus alertmanager installed on the same machine
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets: ['localhost:9093']
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+rule_files:
+  - "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
+
+# A scrape configuration containing exactly one endpoint to scrape:
+# Here it's Prometheus itself.
+{{
+  {
+    "scrape_configs":
+      [
+        {
+          "job_name": "prometheus",
+          "static_configs" : [
+            {
+              "targets": [
+                "localhost:9090"
+              ]
+            }
+          ]
+        }
+      ] + (prometheus | json_query("*.config[0]"))
+  } | to_nice_yaml(indent=2)
+}}
diff --git a/roles/prometheus/templates/update-motd.d/05-service.j2 b/roles/prometheus/templates/update-motd.d/05-service.j2
new file mode 100755
index 0000000..41969b9
--- /dev/null
+++ b/roles/prometheus/templates/update-motd.d/05-service.j2
@@ -0,0 +1,3 @@
+#!/usr/bin/tail +14
+{{ ansible_header | comment }}
+> prometheus a été déployé sur cette machine. Voir /etc/prometheus/.