Add monitoring

Signed-off-by: Yohann D'ANELLO <ynerant@crans.org>

parent f7de61b6e2
commit 2a5f6621b6

group_vars/all/prometheus_node_exporter.yaml (new file, +3)
@@ -0,0 +1,3 @@
---
glob_prometheus_node_exporter:
  listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"
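
Note: the ldap lookup used for listen_addr is this repository's custom lookup plugin; it resolves the host's address on the adm network so the node exporter only binds there. Assuming a host whose adm record were 172.16.10.1 (a purely illustrative value), the variable would render roughly as:

  glob_prometheus_node_exporter:
    listen_addr: "172.16.10.1"

The port (9100) is appended later by the prometheus-node-exporter role.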

group_vars/grafana.yml (new file, +7)
@@ -0,0 +1,7 @@
---
glob_grafana:
  root_url: https://grafana.ynerant.fr
  icon: crans_icon_white.svg
  ldap_base: "{{ glob_ldap.base }}"
  ldap_master_ipv4: "{{ glob_ldap.servers[0] }}"
  ldap_user_tree: "ou=passwd,{{ glob_ldap.base }}"

@@ -30,3 +30,6 @@ glob_nginx:
     - "172.16.0.0/16"
     - "fd00:0:0:42::/64"
   deploy_robots_file: false
+
+glob_prometheus_nginx_exporter:
+  listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"

group_vars/prometheus.yml (new file, +9)
@@ -0,0 +1,9 @@
---
glob_prometheus: {}

glob_ninjabot:
  config:
    nick: templier
    server: irc.crans.org
    port: 6667
    channel: "#/dev/null"

@@ -2,3 +2,69 @@
 interfaces:
   adm: eth0
   srv_nat: eth1
+
+loc_prometheus:
+  node:
+    file: targets_node.json
+    targets: "{{ groups['server'] | select('match', '^.*\\.adm\\.ynerant\\.fr$') | list | sort }}"
+    config:
+      - job_name: servers
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_node.json'
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - source_labels: [__param_target]
+            target_label: __address__
+            replacement: '$1:9100'
+
+  nginx:
+    file: targets_nginx.json
+    targets:
+      - proxy.adm.ynerant.fr
+    config:
+      - job_name: nginx
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_nginx.json'
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: instance
+          - source_labels: [instance]
+            target_label: __address__
+            replacement: '$1:9117'
+
+  blackbox:
+    file: targets_blackbox.json
+    targets:
+      - https://ynerant.fr/
+      - https://bibliogram.ynerant.fr/
+      - https://element.ynerant.fr/
+      - https://gitea.ynerant.fr/
+      - https://grafana.ynerant.fr/
+      - https://hydrogen.ynerant.fr/
+      - https://nextcloud.ynerant.fr/
+      - https://mailu.ynerant.fr/
+      - http://notls.ynerant.fr/
+      - https://reddit.ynerant.fr/
+      - https://thelounge.ynerant.fr/
+      - https://translate.ynerant.fr/
+      - https://kfet.saperlistpopette.fr/
+    config:
+      - job_name: blackbox
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_blackbox.json'
+        metrics_path: /probe
+        params:
+          module: [http_2xx]  # Look for a HTTP 200 response.
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: 127.0.0.1:9115
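
Note on the relabel chains above: for the node job, each entry of targets_node.json is copied from __address__ into __param_target and instance, then __address__ is rewritten with the exporter port, so Prometheus scrapes host:9100 while the instance label keeps the bare hostname. A rough illustration for one target (hostname chosen as an example, not taken from the generated file):

  # before relabeling
  __address__: babel5.adm.ynerant.fr
  # after relabeling
  instance: babel5.adm.ynerant.fr
  __address__: babel5.adm.ynerant.fr:9100   # what actually gets scraped

The nginx job applies the same idea with port 9117, and the blackbox job instead sends the original URL as the target parameter of /probe and points __address__ at the local blackbox exporter on 127.0.0.1:9115.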

hosts (+9)
@@ -10,12 +10,18 @@ babel4.adm.ynerant.fr
 babel5.adm.ynerant.fr
 babel6.adm.ynerant.fr
 
+[blackbox]
+monitoring.adm.ynerant.fr
+
 [certbot]
 proxy.adm.ynerant.fr
 
 [debian:children]
 server
 
+[grafana]
+monitoring.adm.ynerant.fr
+
 [nginx:children]
 reverseproxy
 
@@ -30,6 +36,9 @@ localhost
 [postfix]
 mailu.adm.ynerant.fr
 
+[prometheus]
+monitoring.adm.ynerant.fr
+
 [reverseproxy]
 proxy.adm.ynerant.fr
 

plays/monitoring.yml (new executable file, +38)
@@ -0,0 +1,38 @@
#!/usr/bin/env ansible-playbook
---

# Deploy Prometheus on monitoring server
- hosts: prometheus
  vars:
    prometheus: "{{ glob_prometheus | default({}) | combine(loc_prometheus | default({})) }}"
    alertmanager: "{{ glob_alertmanager | default({}) | combine(loc_alertmanager | default({})) }}"
    ninjabot: "{{ glob_ninjabot | default({}) | combine(loc_ninjabot | default({})) }}"
  roles:
    - prometheus
    - prometheus-alertmanager
    - ninjabot

# Deploy Grafana on monitoring server
- hosts: grafana
  vars:
    grafana: "{{ glob_grafana | default({}) | combine(loc_grafana | default({})) }}"
  roles:
    - grafana

- hosts: blackbox
  roles:
    - prometheus-blackbox-exporter

# Monitor all hosts
- hosts: server
  vars:
    prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
  roles:
    - prometheus-node-exporter

# Export nginx metrics
- hosts: nginx
  vars:
    prometheus_nginx_exporter: "{{ glob_prometheus_nginx_exporter | default({}) | combine(loc_prometheus_nginx_exporter | default({})) }}"
  roles:
    - prometheus-nginx-exporter
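
Note on the glob_*/loc_* pattern used in this playbook: each play builds its working variable by merging the group-level default (glob_*) with an optional per-host override (loc_*), the loc_ side winning key by key (combine does a shallow merge by default). A hypothetical illustration with a loc_grafana override defined in host_vars:

  glob_grafana:
    root_url: https://grafana.ynerant.fr
    icon: crans_icon_white.svg

  # hypothetical host_vars override
  loc_grafana:
    root_url: https://grafana.example.org

  # result of glob_grafana | combine(loc_grafana)
  grafana:
    root_url: https://grafana.example.org
    icon: crans_icon_white.svg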

roles/grafana/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart grafana
  service:
    name: grafana-server
    state: restarted

roles/grafana/tasks/main.yml (new file, +100)
@@ -0,0 +1,100 @@
---
- name: Install GPG
  apt:
    name: gnupg
    state: present
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Import Grafana GPG signing key
  apt_key:
    url: https://packages.grafana.com/gpg.key
    state: present
    validate_certs: false
  register: apt_key_result
  retries: 3
  until: apt_key_result is succeeded

- name: Add Grafana repository
  apt_repository:
    repo: deb http://mirror.adm.ynerant.fr/grafana/oss/deb stable main
    state: present
    update_cache: true

- name: Install Grafana
  apt:
    name: grafana
    state: present
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Grafana
  ini_file:
    path: /etc/grafana/grafana.ini
    section: "{{ item.section }}"
    option: "{{ item.option }}"
    value: "{{ item.value }}"
    mode: 0640
  loop:
    - section: server
      option: root_url
      value: "{{ grafana.root_url }}"
    - section: analytics
      option: reporting_enabled
      value: "false"
    - section: analytics
      option: check_for_updates
      value: "false"
    - section: security
      option: disable_initial_admin_creation
      value: "true"
    - section: security
      option: cookie_secure
      value: "true"
    - section: snapshots
      option: external_enabled
      value: "false"
    - section: users
      option: allow_sign_up
      value: "false"
    - section: users
      option: allow_org_create
      value: "false"
    - section: auth.anonymous
      option: enabled
      value: "true"
    - section: auth.anonymous
      option: hide_version
      value: "true"
    - section: auth.basic  # Only LDAP auth
      option: enabled
      value: "false"
    - section: auth.ldap
      option: enabled
      value: "true"
    - section: alerting
      option: enabled
      value: "false"
  notify: Restart grafana

- name: Configure Grafana LDAP
  template:
    src: ldap.toml.j2
    dest: /etc/grafana/ldap.toml
    mode: 0640
  notify: Restart grafana

- name: Enable and start Grafana
  systemd:
    name: grafana-server
    enabled: true
    state: started
    daemon_reload: true

- name: Indicate role in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-grafana
    mode: 0755

roles/grafana/templates/ldap.toml.j2 (new file, +47)
@@ -0,0 +1,47 @@
{{ ansible_header | comment }}
# To troubleshoot and get more log info enable ldap debug logging in grafana.ini
# [log]
# filters = ldap:debug

[[servers]]
# Ldap server host (specify multiple hosts space separated)
host = "{{ grafana.ldap_master_ipv4 }}"
# Default port is 389 or 636 if use_ssl = true
port = 636
# Set to true if ldap server supports TLS
use_ssl = true
# Set to true if connect ldap server with STARTTLS pattern (create connection in insecure, then upgrade to secure connection with TLS)
start_tls = false
# set to true if you want to skip ssl cert validation
ssl_skip_verify = true
# set to the path to your root CA certificate or leave unset to use system defaults
# root_ca_cert = "/path/to/certificate.crt"
# Authentication against LDAP servers requiring client certificates
# client_cert = "/path/to/client.crt"
# client_key = "/path/to/client.key"

# Use direct bind
bind_dn = "uid=%s,{{ grafana.ldap_user_tree }}"

# Useless as we are doing direct bind,
# but without LDAP auth hang
search_filter = "(uid=%s)"
search_base_dns = ["ou=passwd,dc=ynerant,dc=fr"]

## For Posix or LDAP setups that does not support member_of attribute you can define the below settings
## Please check grafana LDAP docs for examples
group_search_filter = "(&(objectClass=posixGroup)(memberUid=%s))"
group_search_base_dns = ["ou=group,{{ grafana.ldap_base }}"]
group_search_filter_user_attribute = "cn"

# Specify names of the ldap attributes your ldap uses
[servers.attributes]
name = "givenName"
surname = "sn"
username = "uid"
email = "mail"

# All LDAP members can edit
[[servers.group_mappings]]
group_dn = "*"
org_role = "Admin"

roles/grafana/templates/update-motd.d/05-service.j2 (new executable file, +3)
@@ -0,0 +1,3 @@
#!/usr/bin/tail +14
{{ ansible_header | comment }}
[0m> [38;5;82mgrafana[0m a été déployé sur cette machine. Voir [38;5;6m/etc/grafana/[0m.

roles/ninjabot/tasks/main.yml (new file, +48)
@@ -0,0 +1,48 @@
---
- name: Install python3 IRC library
  apt:
    name: python3-irc
    state: present
    update_cache: true
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Install Flask for python3
  apt:
    name: python3-flask
    state: present
    update_cache: true
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Clone NinjaBot code
  git:
    repo: https://gitlab.crans.org/nounous/NinjaBot.git
    dest: /var/local/ninjabot
    version: master

- name: Deploy NinjaBot configuration
  template:
    src: ninjabot/ninjabot.json.j2
    dest: /var/local/ninjabot/ninjabot.json

- name: Deploy NinjaBot systemd unit
  template:
    src: systemd/system/ninjabot.service.j2
    dest: /etc/systemd/system/ninjabot.service
    mode: 0644

- name: Load and activate NinjaBot service
  systemd:
    name: ninjabot
    daemon_reload: true
    enabled: true
    state: started

- name: Indicate NinjaBot in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-ninjabot
    mode: 0755

roles/ninjabot/templates/ninjabot/ninjabot.json.j2 (new file, +1)
@@ -0,0 +1 @@
{{ ninjabot.config | to_nice_json(indent=2) }}
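
Note: with glob_ninjabot.config from group_vars/prometheus.yml, this template renders /var/local/ninjabot/ninjabot.json roughly as below (to_nice_json sorts keys, so the ordering may differ from the YAML source):

  {
    "channel": "#/dev/null",
    "nick": "templier",
    "port": 6667,
    "server": "irc.crans.org"
  }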

roles/ninjabot/templates/systemd/system/ninjabot.service.j2 (new file, +15)
@@ -0,0 +1,15 @@
{{ ansible_header | comment }}
[Unit]
Description=NinjaBot IRC bot
After=network.target

[Service]
Type=simple
WorkingDirectory=/var/local/ninjabot
User=nobody
Group=nogroup
ExecStart=/usr/bin/python3 /var/local/ninjabot/ninjabot.py
Restart=always

[Install]
WantedBy=multi-user.target

roles/ninjabot/templates/update-motd.d/05-service.j2 (new executable file, +3)
@@ -0,0 +1,3 @@
#!/usr/bin/tail +14
{{ ansible_header | comment }}
[0m> [38;5;82mNinjaBot[0m a été déployé sur cette machine. Voir [38;5;6m/var/local/ninjabot/[0m.

roles/prometheus-alertmanager/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart Prometheus Alertmanager
  service:
    name: prometheus-alertmanager
    state: restarted

roles/prometheus-alertmanager/tasks/main.yml (new file, +14)
@@ -0,0 +1,14 @@
---
- name: Install Prometheus Alertmanager
  apt:
    update_cache: true
    name: prometheus-alertmanager
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Prometheus Alertmanager
  template:
    src: prometheus/alertmanager.yml.j2
    dest: /etc/prometheus/alertmanager.yml
  notify: Restart Prometheus Alertmanager

@@ -0,0 +1,60 @@
{{ ansible_header | comment }}
# See https://prometheus.io/docs/alerting/configuration/ for documentation.

global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'localhost:25'
  smtp_from: 'alertmanager@example.org'
  #smtp_auth_username: 'alertmanager'
  #smtp_auth_password: 'password'

# The directory from which notification templates are read.
templates:
  - '/etc/prometheus/alertmanager_templates/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['instance']  # group per instance

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This way ensures that you get multiple alerts for the same group that start
  # firing shortly after another are batched together on the first
  # notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 24h

  # A default receiver
  receiver: webhook-ninjabot


# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    # Apply inhibition if the alertname is the same.
    equal: ['alertname', 'cluster', 'service']


receivers:
  - name: 'webhook-ninjabot'
    webhook_configs:
      - url: 'http://localhost:5000/'
        send_resolved: true
      - url: 'http://localhost:8000/'
        send_resolved: true

roles/prometheus-blackbox-exporter/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart prometheus-blackbox-exporter
  service:
    name: prometheus-blackbox-exporter
    state: restarted

roles/prometheus-blackbox-exporter/tasks/main.yml (new file, +23)
@@ -0,0 +1,23 @@
---
- name: Install Prometheus Blackbox exporter
  apt:
    update_cache: true
    name: prometheus-blackbox-exporter
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Make Prometheus Blackbox exporter listen on localhost only
  lineinfile:
    path: /etc/default/prometheus-blackbox-exporter
    regexp: '^ARGS='
    line: >
      ARGS='--config.file /etc/prometheus/blackbox.yml
      --web.listen-address="localhost:9115"'
  notify: Restart prometheus-blackbox-exporter

- name: Activate prometheus Blackbox exporter service
  systemd:
    name: prometheus-blackbox-exporter
    enabled: true
    state: started

roles/prometheus-nginx-exporter/handlers/main.yml (new file, +10)
@@ -0,0 +1,10 @@
---
- name: Restart nginx
  service:
    name: nginx
    state: restarted

- name: Restart prometheus-nginx-exporter
  service:
    name: prometheus-nginx-exporter
    state: restarted

roles/prometheus-nginx-exporter/tasks/main.yml (new file, +33)
@@ -0,0 +1,33 @@
---
- name: Install prometheus-nginx-exporter
  apt:
    update_cache: true
    name:
      - nginx  # Nginx may be not already installed
      - prometheus-nginx-exporter
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Make prometheus-nginx-exporter listen on adm only
  lineinfile:
    path: /etc/default/prometheus-nginx-exporter
    regexp: '^ARGS='
    line: |
      ARGS="-web.listen-address={{ prometheus_nginx_exporter.listen_addr }}:9117 -nginx.scrape-uri=http://[::1]:6424/stub_status"
  notify:
    - Restart nginx
    - Restart prometheus-nginx-exporter

- name: Configure nginx
  template:
    src: nginx/status.j2
    dest: /etc/nginx/sites-available/status
  notify: Restart nginx

- name: Activate nginx site
  file:
    src: /etc/nginx/sites-available/status
    dest: /etc/nginx/sites-enabled/status
    state: link
  notify: Restart nginx

@@ -0,0 +1,8 @@
{{ ansible_header | comment }}

server {
    listen [::1]:6424;
    location = /stub_status {
        stub_status;
    }
}

roles/prometheus-node-exporter/files/apt.sh (new executable file, +51)
@@ -0,0 +1,51 @@
#!/bin/bash
#
# Description: Expose metrics from apt updates.
#
# Author: Ben Kochie <superq@gmail.com>

upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \
  | /usr/bin/awk -F'[()]' \
      '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2);
                 sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
  | /usr/bin/sort \
  | /usr/bin/uniq -c \
  | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/\"/, "\\\"", $2);
           gsub(/\[/, "", $3); gsub(/\]/, "", $3);
           print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}'
)"

autoremove="$(/usr/bin/apt-get --just-print autoremove \
  | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}'
)"

orphans="$(comm -23 \
  <(dpkg-query -W -f '${db:Status-Abbrev}\t${Package}\n' \
    | grep '^.[^nc]' | cut -f2 | sort) \
  <(apt-cache dumpavail | sed -rn 's/^Package: (.*)/\1/p' | sort -u) \
  | awk 'END{printf "apt_orphans %d", NR}'
)"

echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
echo '# TYPE apt_upgrades_pending gauge'
if [[ -n "${upgrades}" ]] ; then
  echo "${upgrades}"
else
  echo 'apt_upgrades_pending{origin="",arch=""} 0'
fi

echo '# HELP apt_autoremove_pending Apt package pending autoremove.'
echo '# TYPE apt_autoremove_pending gauge'
echo "${autoremove}"

echo '# HELP apt_orphans Orphan apt package.'
echo '# TYPE apt_orphans gauge'
echo "${orphans}"

echo '# HELP node_reboot_required Node reboot is required for software updates.'
echo '# TYPE node_reboot_required gauge'
if [[ -f '/run/reboot-required' ]] ; then
  echo 'node_reboot_required 1'
else
  echo 'node_reboot_required 0'
fi
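
Note: the script prints metrics in the Prometheus text exposition format so the node exporter's textfile collector can pick them up. Illustrative output (values and origin string invented):

  # HELP apt_upgrades_pending Apt package pending updates by origin.
  # TYPE apt_upgrades_pending gauge
  apt_upgrades_pending{origin="Debian:buster/updates",arch="amd64"} 4
  # HELP apt_autoremove_pending Apt package pending autoremove.
  # TYPE apt_autoremove_pending gauge
  apt_autoremove_pending 2
  # HELP apt_orphans Orphan apt package.
  # TYPE apt_orphans gauge
  apt_orphans 0
  # HELP node_reboot_required Node reboot is required for software updates.
  # TYPE node_reboot_required gauge
  node_reboot_required 1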

roles/prometheus-node-exporter/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart prometheus-node-exporter
  service:
    name: prometheus-node-exporter
    state: restarted

roles/prometheus-node-exporter/tasks/main.yml (new file, +45)
@@ -0,0 +1,45 @@
---
- name: Install Prometheus node-exporter
  apt:
    update_cache: true
    name: prometheus-node-exporter
    install_recommends: false  # Do not install smartmontools
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Install Prometheus node-exporter-collectors (bullseye)
  apt:
    update_cache: true
    name: prometheus-node-exporter-collectors
    install_recommends: false
  register: apt_result
  retries: 3
  until: apt_result is succeeded
  when:
    - ansible_lsb.codename == 'bullseye'

- name: Make Prometheus node-exporter listen on adm only
  lineinfile:
    path: /etc/default/prometheus-node-exporter
    regexp: '^ARGS='
    line: |
      ARGS="--web.listen-address={{ prometheus_node_exporter.listen_addr }}:9100"
  tags: restart-node-exporter

- name: Activate prometheus-node-exporter service
  systemd:
    name: prometheus-node-exporter
    enabled: true
    state: started

# Install new APT textfile collector, it might be upstreamed one day
# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/pull/35
- name: Patch APT textfile collector
  copy:
    src: apt.sh
    dest: /usr/share/prometheus-node-exporter/apt.sh
    owner: root
    group: root
    mode: 0755
  when: ansible_distribution_release != "bullseye"

roles/prometheus/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart Prometheus
  service:
    name: prometheus
    state: restarted

roles/prometheus/tasks/main.yml (new file, +42)
@@ -0,0 +1,42 @@
---
- name: Install Prometheus
  apt:
    update_cache: true
    name: prometheus
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Prometheus
  template:
    src: prometheus/prometheus.yml.j2
    dest: /etc/prometheus/prometheus.yml
    mode: 0644
  notify: Restart Prometheus

- name: Configure Prometheus alert rules
  template:
    src: prometheus/alert.rules.yml.j2
    dest: /etc/prometheus/alert.rules.yml
    mode: 0644
  notify: Restart Prometheus

# We don't need to restart Prometheus when updating nodes
- name: Configure Prometheus targets
  copy:
    content: "{{ [{'targets': item.value.targets}] | to_nice_json }}\n"
    dest: "/etc/prometheus/{{ item.value.file }}"
    mode: 0644
  loop: "{{ prometheus | dict2items }}"

- name: Activate prometheus service
  systemd:
    name: prometheus
    enabled: true
    state: started

- name: Indicate role in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-prometheus
    mode: 0755
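
Note: the "Configure Prometheus targets" task serializes each entry's targets list into the file_sd format Prometheus expects. For the nginx entry of loc_prometheus, for instance, /etc/prometheus/targets_nginx.json would come out roughly as (to_nice_json uses a 4-space indent by default):

  [
      {
          "targets": [
              "proxy.adm.ynerant.fr"
          ]
      }
  ]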

roles/prometheus/templates/prometheus/alert.rules.yml.j2 (new file, +187)
@@ -0,0 +1,187 @@
{{ ansible_header | comment }}
{# As this is also Jinja2 it will conflict without a raw block #}
{# Depending of Prometheus Node exporter version, rules can change depending of version #}
{% raw %}
groups:
  - name: alert.rules
    rules:

      # Alert for any instance that is unreachable for >3 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !"

      # Alert for out of memory
      # Do not take into account memory not used by apps
      - alert: OutOfMemory
        expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."

      # Alert for out of disk space
      - alert: OutOfDiskSpace
        expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."

      # Alert for out of inode space on disk
      - alert: OutOfInodes
        expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."

      # Alert for high CPU usage
      - alert: CpuBusy
        expr: node_load5 > 9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Charge sur {{ $labels.instance }} à {{ $value }}."

      # Check mdadm software RAID
      - alert: SoftwareRAIDDegraded
        expr: node_md_disks-node_md_disks_active > 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."

      # Check systemd unit (> buster)
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"

      # Check UPS
      - alert: UpsOutputSourceChanged
        expr: upsOutputSource != 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsBatteryStatusChanged
        expr: upsBatteryStatus != 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'état de la batterie de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsTemperatureWarning
        expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsTemperatureCritical
        expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsHighHumidity
        expr: xupsEnvRemoteHumidity > 65
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsVeryHighHumidity
        expr: xupsEnvRemoteHumidity > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsHighLoad
        expr: upsOutputPercentLoad > 70
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsWrongInputVoltage
        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsWrongOutputVoltage
        expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: AptAutoremovePending
        expr: apt_autoremove_pending > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."

      - alert: MailqNotEmpty
        expr: postfix_mailq_length > 25
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."

      - alert: NoRadiusLogin
        expr: rate(radiusd_access_ok[3m]) == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Personne ne vient taper le RADIUS."

      - alert: TooManyReallocatedSectors
        expr: smartmon_reallocated_sector_ct_raw_value > 1e3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."

{% endraw %}

roles/prometheus/templates/prometheus/prometheus.yml.j2 (new file, +42)
@@ -0,0 +1,42 @@
{{ ansible_header | comment }}

global:
  # scrape_interval is set to the global default (60s)
  # evaluation_interval is set to the global default (60s)
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'example'

# Alertmanager configuration
# Use prometheus alertmanager installed on the same machine
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alert.rules.yml"  # Monitoring alerts, this is the file you may be searching!

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
{{
  {
    "scrape_configs":
    [
      {
        "job_name": "prometheus",
        "static_configs" : [
          {
            "targets": [
              "localhost:9090"
            ]
          }
        ]
      }
    ] + (prometheus | json_query("*.config[0]"))
  } | to_nice_yaml(indent=2)
}}
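
Note: the closing Jinja2 expression builds scrape_configs as a data structure: it starts with Prometheus' own self-scrape job, appends config[0] of every entry in the merged prometheus dict (json_query("*.config[0]")), and dumps the result with to_nice_yaml. With the nginx entry from loc_prometheus, for instance, the rendered file would contain something like the following (quoting and key order depend on to_nice_yaml, so treat this as an illustration):

  scrape_configs:
  - job_name: prometheus
    static_configs:
    - targets:
      - localhost:9090
  - job_name: nginx
    file_sd_configs:
    - files:
      - /etc/prometheus/targets_nginx.json
    relabel_configs:
    - source_labels:
      - __address__
      target_label: instance
    - source_labels:
      - instance
      target_label: __address__
      replacement: $1:9117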

roles/prometheus/templates/update-motd.d/05-service.j2 (new executable file, +3)
@@ -0,0 +1,3 @@
#!/usr/bin/tail +14
{{ ansible_header | comment }}
[0m> [38;5;82mprometheus[0m a été déployé sur cette machine. Voir [38;5;6m/etc/prometheus/[0m.