Add monitoring

Signed-off-by: Yohann D'ANELLO <ynerant@crans.org>

parent f7de61b6e2
commit 2a5f6621b6

group_vars/all/prometheus_node_exporter.yaml (new file, +3)
@@ -0,0 +1,3 @@
---
glob_prometheus_node_exporter:
  listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"
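
Note: the ldap lookup used for listen_addr is this repository's custom lookup plugin; it resolves the host's address on the adm network so the node exporter only binds there. Assuming a host whose adm record were 172.16.10.1 (a purely illustrative value), the variable would render roughly as:

  glob_prometheus_node_exporter:
    listen_addr: "172.16.10.1"

The port (9100) is appended later by the prometheus-node-exporter role.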

group_vars/grafana.yml (new file, +7)
@@ -0,0 +1,7 @@
---
glob_grafana:
  root_url: https://grafana.ynerant.fr
  icon: crans_icon_white.svg
  ldap_base: "{{ glob_ldap.base }}"
  ldap_master_ipv4: "{{ glob_ldap.servers[0] }}"
  ldap_user_tree: "ou=passwd,{{ glob_ldap.base }}"

@@ -30,3 +30,6 @@ glob_nginx:
     - "172.16.0.0/16"
     - "fd00:0:0:42::/64"
   deploy_robots_file: false
+
+glob_prometheus_nginx_exporter:
+  listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}"

group_vars/prometheus.yml (new file, +9)
@@ -0,0 +1,9 @@
---
glob_prometheus: {}

glob_ninjabot:
  config:
    nick: templier
    server: irc.crans.org
    port: 6667
    channel: "#/dev/null"

@@ -2,3 +2,69 @@
 interfaces:
   adm: eth0
   srv_nat: eth1
+
+loc_prometheus:
+  node:
+    file: targets_node.json
+    targets: "{{ groups['server'] | select('match', '^.*\\.adm\\.ynerant\\.fr$') | list | sort }}"
+    config:
+      - job_name: servers
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_node.json'
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - source_labels: [__param_target]
+            target_label: __address__
+            replacement: '$1:9100'
+
+  nginx:
+    file: targets_nginx.json
+    targets:
+      - proxy.adm.ynerant.fr
+    config:
+      - job_name: nginx
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_nginx.json'
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: instance
+          - source_labels: [instance]
+            target_label: __address__
+            replacement: '$1:9117'
+
+  blackbox:
+    file: targets_blackbox.json
+    targets:
+      - https://ynerant.fr/
+      - https://bibliogram.ynerant.fr/
+      - https://element.ynerant.fr/
+      - https://gitea.ynerant.fr/
+      - https://grafana.ynerant.fr/
+      - https://hydrogen.ynerant.fr/
+      - https://nextcloud.ynerant.fr/
+      - https://mailu.ynerant.fr/
+      - http://notls.ynerant.fr/
+      - https://reddit.ynerant.fr/
+      - https://thelounge.ynerant.fr/
+      - https://translate.ynerant.fr/
+      - https://kfet.saperlistpopette.fr/
+    config:
+      - job_name: blackbox
+        file_sd_configs:
+          - files:
+              - '/etc/prometheus/targets_blackbox.json'
+        metrics_path: /probe
+        params:
+          module: [http_2xx]  # Look for a HTTP 200 response.
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: 127.0.0.1:9115
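
Note on the relabel chains above: for the node job, each entry of targets_node.json is copied from __address__ into __param_target and instance, then __address__ is rewritten with the exporter port, so Prometheus scrapes host:9100 while the instance label keeps the bare hostname. A rough illustration for one target (hostname chosen as an example, not taken from the generated file):

  # before relabeling
  __address__: babel5.adm.ynerant.fr
  # after relabeling
  instance: babel5.adm.ynerant.fr
  __address__: babel5.adm.ynerant.fr:9100   # what actually gets scraped

The nginx job applies the same idea with port 9117, and the blackbox job instead sends the original URL as the target parameter of /probe and points __address__ at the local blackbox exporter on 127.0.0.1:9115.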

hosts (+9)
@@ -10,12 +10,18 @@ babel4.adm.ynerant.fr
 babel5.adm.ynerant.fr
 babel6.adm.ynerant.fr
 
+[blackbox]
+monitoring.adm.ynerant.fr
+
 [certbot]
 proxy.adm.ynerant.fr
 
 [debian:children]
 server
 
+[grafana]
+monitoring.adm.ynerant.fr
+
 [nginx:children]
 reverseproxy
 
@@ -30,6 +36,9 @@ localhost
 [postfix]
 mailu.adm.ynerant.fr
 
+[prometheus]
+monitoring.adm.ynerant.fr
+
 [reverseproxy]
 proxy.adm.ynerant.fr
 

plays/monitoring.yml (new executable file, +38)
@@ -0,0 +1,38 @@
#!/usr/bin/env ansible-playbook
---

# Deploy Prometheus on monitoring server
- hosts: prometheus
  vars:
    prometheus: "{{ glob_prometheus | default({}) | combine(loc_prometheus | default({})) }}"
    alertmanager: "{{ glob_alertmanager | default({}) | combine(loc_alertmanager | default({})) }}"
    ninjabot: "{{ glob_ninjabot | default({}) | combine(loc_ninjabot | default({})) }}"
  roles:
    - prometheus
    - prometheus-alertmanager
    - ninjabot

# Deploy Grafana on monitoring server
- hosts: grafana
  vars:
    grafana: "{{ glob_grafana | default({}) | combine(loc_grafana | default({})) }}"
  roles:
    - grafana

- hosts: blackbox
  roles:
    - prometheus-blackbox-exporter

# Monitor all hosts
- hosts: server
  vars:
    prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
  roles:
    - prometheus-node-exporter

# Export nginx metrics
- hosts: nginx
  vars:
    prometheus_nginx_exporter: "{{ glob_prometheus_nginx_exporter | default({}) | combine(loc_prometheus_nginx_exporter | default({})) }}"
  roles:
    - prometheus-nginx-exporter
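
Note on the glob_*/loc_* pattern used in this playbook: each play builds its working variable by merging the group-level default (glob_*) with an optional per-host override (loc_*), the loc_ side winning key by key (combine does a shallow merge by default). A hypothetical illustration with a loc_grafana override defined in host_vars:

  glob_grafana:
    root_url: https://grafana.ynerant.fr
    icon: crans_icon_white.svg

  # hypothetical host_vars override
  loc_grafana:
    root_url: https://grafana.example.org

  # result of glob_grafana | combine(loc_grafana)
  grafana:
    root_url: https://grafana.example.org
    icon: crans_icon_white.svg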

roles/grafana/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart grafana
  service:
    name: grafana-server
    state: restarted

roles/grafana/tasks/main.yml (new file, +100)
@@ -0,0 +1,100 @@
---
- name: Install GPG
  apt:
    name: gnupg
    state: present
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Import Grafana GPG signing key
  apt_key:
    url: https://packages.grafana.com/gpg.key
    state: present
    validate_certs: false
  register: apt_key_result
  retries: 3
  until: apt_key_result is succeeded

- name: Add Grafana repository
  apt_repository:
    repo: deb http://mirror.adm.ynerant.fr/grafana/oss/deb stable main
    state: present
    update_cache: true

- name: Install Grafana
  apt:
    name: grafana
    state: present
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Grafana
  ini_file:
    path: /etc/grafana/grafana.ini
    section: "{{ item.section }}"
    option: "{{ item.option }}"
    value: "{{ item.value }}"
    mode: 0640
  loop:
    - section: server
      option: root_url
      value: "{{ grafana.root_url }}"
    - section: analytics
      option: reporting_enabled
      value: "false"
    - section: analytics
      option: check_for_updates
      value: "false"
    - section: security
      option: disable_initial_admin_creation
      value: "true"
    - section: security
      option: cookie_secure
      value: "true"
    - section: snapshots
      option: external_enabled
      value: "false"
    - section: users
      option: allow_sign_up
      value: "false"
    - section: users
      option: allow_org_create
      value: "false"
    - section: auth.anonymous
      option: enabled
      value: "true"
    - section: auth.anonymous
      option: hide_version
      value: "true"
    - section: auth.basic  # Only LDAP auth
      option: enabled
      value: "false"
    - section: auth.ldap
      option: enabled
      value: "true"
    - section: alerting
      option: enabled
      value: "false"
  notify: Restart grafana

- name: Configure Grafana LDAP
  template:
    src: ldap.toml.j2
    dest: /etc/grafana/ldap.toml
    mode: 0640
  notify: Restart grafana

- name: Enable and start Grafana
  systemd:
    name: grafana-server
    enabled: true
    state: started
    daemon_reload: true

- name: Indicate role in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-grafana
    mode: 0755

roles/grafana/templates/ldap.toml.j2 (new file, +47)
@@ -0,0 +1,47 @@
{{ ansible_header | comment }}
# To troubleshoot and get more log info enable ldap debug logging in grafana.ini
# [log]
# filters = ldap:debug

[[servers]]
# Ldap server host (specify multiple hosts space separated)
host = "{{ grafana.ldap_master_ipv4 }}"
# Default port is 389 or 636 if use_ssl = true
port = 636
# Set to true if ldap server supports TLS
use_ssl = true
# Set to true if connect ldap server with STARTTLS pattern (create connection in insecure, then upgrade to secure connection with TLS)
start_tls = false
# set to true if you want to skip ssl cert validation
ssl_skip_verify = true
# set to the path to your root CA certificate or leave unset to use system defaults
# root_ca_cert = "/path/to/certificate.crt"
# Authentication against LDAP servers requiring client certificates
# client_cert = "/path/to/client.crt"
# client_key = "/path/to/client.key"

# Use direct bind
bind_dn = "uid=%s,{{ grafana.ldap_user_tree }}"

# Useless as we are doing direct bind,
# but without LDAP auth hang
search_filter = "(uid=%s)"
search_base_dns = ["ou=passwd,dc=ynerant,dc=fr"]

## For Posix or LDAP setups that does not support member_of attribute you can define the below settings
## Please check grafana LDAP docs for examples
group_search_filter = "(&(objectClass=posixGroup)(memberUid=%s))"
group_search_base_dns = ["ou=group,{{ grafana.ldap_base }}"]
group_search_filter_user_attribute = "cn"

# Specify names of the ldap attributes your ldap uses
[servers.attributes]
name = "givenName"
surname = "sn"
username = "uid"
email = "mail"

# All LDAP members can edit
[[servers.group_mappings]]
group_dn = "*"
org_role = "Admin"

roles/grafana/templates/update-motd.d/05-service.j2 (new executable file, +3)
@@ -0,0 +1,3 @@
#!/usr/bin/tail +14
{{ ansible_header | comment }}
[0m> [38;5;82mgrafana[0m a été déployé sur cette machine. Voir [38;5;6m/etc/grafana/[0m.

roles/ninjabot/tasks/main.yml (new file, +48)
@@ -0,0 +1,48 @@
---
- name: Install python3 IRC library
  apt:
    name: python3-irc
    state: present
    update_cache: true
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Install Flask for python3
  apt:
    name: python3-flask
    state: present
    update_cache: true
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Clone NinjaBot code
  git:
    repo: https://gitlab.crans.org/nounous/NinjaBot.git
    dest: /var/local/ninjabot
    version: master

- name: Deploy NinjaBot configuration
  template:
    src: ninjabot/ninjabot.json.j2
    dest: /var/local/ninjabot/ninjabot.json

- name: Deploy NinjaBot systemd unit
  template:
    src: systemd/system/ninjabot.service.j2
    dest: /etc/systemd/system/ninjabot.service
    mode: 0644

- name: Load and activate NinjaBot service
  systemd:
    name: ninjabot
    daemon_reload: true
    enabled: true
    state: started

- name: Indicate NinjaBot in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-ninjabot
    mode: 0755

roles/ninjabot/templates/ninjabot/ninjabot.json.j2 (new file, +1)
@@ -0,0 +1 @@
{{ ninjabot.config | to_nice_json(indent=2) }}
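
Note: with glob_ninjabot.config from group_vars/prometheus.yml, this template renders /var/local/ninjabot/ninjabot.json roughly as below (to_nice_json sorts keys, so the ordering may differ from the YAML source):

  {
    "channel": "#/dev/null",
    "nick": "templier",
    "port": 6667,
    "server": "irc.crans.org"
  }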

roles/ninjabot/templates/systemd/system/ninjabot.service.j2 (new file, +15)
@@ -0,0 +1,15 @@
{{ ansible_header | comment }}
[Unit]
Description=NinjaBot IRC bot
After=network.target

[Service]
Type=simple
WorkingDirectory=/var/local/ninjabot
User=nobody
Group=nogroup
ExecStart=/usr/bin/python3 /var/local/ninjabot/ninjabot.py
Restart=always

[Install]
WantedBy=multi-user.target

roles/ninjabot/templates/update-motd.d/05-service.j2 (new executable file, +3)
@@ -0,0 +1,3 @@
#!/usr/bin/tail +14
{{ ansible_header | comment }}
[0m> [38;5;82mNinjaBot[0m a été déployé sur cette machine. Voir [38;5;6m/var/local/ninjabot/[0m.

roles/prometheus-alertmanager/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart Prometheus Alertmanager
  service:
    name: prometheus-alertmanager
    state: restarted

roles/prometheus-alertmanager/tasks/main.yml (new file, +14)
@@ -0,0 +1,14 @@
---
- name: Install Prometheus Alertmanager
  apt:
    update_cache: true
    name: prometheus-alertmanager
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Prometheus Alertmanager
  template:
    src: prometheus/alertmanager.yml.j2
    dest: /etc/prometheus/alertmanager.yml
  notify: Restart Prometheus Alertmanager

@@ -0,0 +1,60 @@
{{ ansible_header | comment }}
# See https://prometheus.io/docs/alerting/configuration/ for documentation.

global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'localhost:25'
  smtp_from: 'alertmanager@example.org'
  #smtp_auth_username: 'alertmanager'
  #smtp_auth_password: 'password'

# The directory from which notification templates are read.
templates:
  - '/etc/prometheus/alertmanager_templates/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['instance']  # group per instance

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This way ensures that you get multiple alerts for the same group that start
  # firing shortly after another are batched together on the first
  # notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 24h

  # A default receiver
  receiver: webhook-ninjabot


# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    # Apply inhibition if the alertname is the same.
    equal: ['alertname', 'cluster', 'service']


receivers:
  - name: 'webhook-ninjabot'
    webhook_configs:
      - url: 'http://localhost:5000/'
        send_resolved: true
      - url: 'http://localhost:8000/'
        send_resolved: true

roles/prometheus-blackbox-exporter/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart prometheus-blackbox-exporter
  service:
    name: prometheus-blackbox-exporter
    state: restarted

roles/prometheus-blackbox-exporter/tasks/main.yml (new file, +23)
@@ -0,0 +1,23 @@
---
- name: Install Prometheus Blackbox exporter
  apt:
    update_cache: true
    name: prometheus-blackbox-exporter
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Make Prometheus Blackbox exporter listen on localhost only
  lineinfile:
    path: /etc/default/prometheus-blackbox-exporter
    regexp: '^ARGS='
    line: >
      ARGS='--config.file /etc/prometheus/blackbox.yml
      --web.listen-address="localhost:9115"'
  notify: Restart prometheus-blackbox-exporter

- name: Activate prometheus Blackbox exporter service
  systemd:
    name: prometheus-blackbox-exporter
    enabled: true
    state: started

roles/prometheus-nginx-exporter/handlers/main.yml (new file, +10)
@@ -0,0 +1,10 @@
---
- name: Restart nginx
  service:
    name: nginx
    state: restarted

- name: Restart prometheus-nginx-exporter
  service:
    name: prometheus-nginx-exporter
    state: restarted

roles/prometheus-nginx-exporter/tasks/main.yml (new file, +33)
@@ -0,0 +1,33 @@
---
- name: Install prometheus-nginx-exporter
  apt:
    update_cache: true
    name:
      - nginx  # Nginx may be not already installed
      - prometheus-nginx-exporter
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Make prometheus-nginx-exporter listen on adm only
  lineinfile:
    path: /etc/default/prometheus-nginx-exporter
    regexp: '^ARGS='
    line: |
      ARGS="-web.listen-address={{ prometheus_nginx_exporter.listen_addr }}:9117 -nginx.scrape-uri=http://[::1]:6424/stub_status"
  notify:
    - Restart nginx
    - Restart prometheus-nginx-exporter

- name: Configure nginx
  template:
    src: nginx/status.j2
    dest: /etc/nginx/sites-available/status
  notify: Restart nginx

- name: Activate nginx site
  file:
    src: /etc/nginx/sites-available/status
    dest: /etc/nginx/sites-enabled/status
    state: link
  notify: Restart nginx

@@ -0,0 +1,8 @@
{{ ansible_header | comment }}

server {
    listen [::1]:6424;
    location = /stub_status {
        stub_status;
    }
}

roles/prometheus-node-exporter/files/apt.sh (new executable file, +51)
@@ -0,0 +1,51 @@
#!/bin/bash
#
# Description: Expose metrics from apt updates.
#
# Author: Ben Kochie <superq@gmail.com>

upgrades="$(/usr/bin/apt-get --just-print dist-upgrade \
  | /usr/bin/awk -F'[()]' \
      '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2);
                 sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
  | /usr/bin/sort \
  | /usr/bin/uniq -c \
  | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/\"/, "\\\"", $2);
           gsub(/\[/, "", $3); gsub(/\]/, "", $3);
           print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $NF "\"} " $1}'
)"

autoremove="$(/usr/bin/apt-get --just-print autoremove \
  | /usr/bin/awk '/^Remv/{a++}END{printf "apt_autoremove_pending %d", a}'
)"

orphans="$(comm -23 \
  <(dpkg-query -W -f '${db:Status-Abbrev}\t${Package}\n' \
    | grep '^.[^nc]' | cut -f2 | sort) \
  <(apt-cache dumpavail | sed -rn 's/^Package: (.*)/\1/p' | sort -u) \
  | awk 'END{printf "apt_orphans %d", NR}'
)"

echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
echo '# TYPE apt_upgrades_pending gauge'
if [[ -n "${upgrades}" ]] ; then
  echo "${upgrades}"
else
  echo 'apt_upgrades_pending{origin="",arch=""} 0'
fi

echo '# HELP apt_autoremove_pending Apt package pending autoremove.'
echo '# TYPE apt_autoremove_pending gauge'
echo "${autoremove}"

echo '# HELP apt_orphans Orphan apt package.'
echo '# TYPE apt_orphans gauge'
echo "${orphans}"

echo '# HELP node_reboot_required Node reboot is required for software updates.'
echo '# TYPE node_reboot_required gauge'
if [[ -f '/run/reboot-required' ]] ; then
  echo 'node_reboot_required 1'
else
  echo 'node_reboot_required 0'
fi
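
Note: the script prints metrics in the Prometheus text exposition format so the node exporter's textfile collector can pick them up. Illustrative output (values and origin string invented):

  # HELP apt_upgrades_pending Apt package pending updates by origin.
  # TYPE apt_upgrades_pending gauge
  apt_upgrades_pending{origin="Debian:buster/updates",arch="amd64"} 4
  # HELP apt_autoremove_pending Apt package pending autoremove.
  # TYPE apt_autoremove_pending gauge
  apt_autoremove_pending 2
  # HELP apt_orphans Orphan apt package.
  # TYPE apt_orphans gauge
  apt_orphans 0
  # HELP node_reboot_required Node reboot is required for software updates.
  # TYPE node_reboot_required gauge
  node_reboot_required 1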

roles/prometheus-node-exporter/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart prometheus-node-exporter
  service:
    name: prometheus-node-exporter
    state: restarted

roles/prometheus-node-exporter/tasks/main.yml (new file, +45)
@@ -0,0 +1,45 @@
---
- name: Install Prometheus node-exporter
  apt:
    update_cache: true
    name: prometheus-node-exporter
    install_recommends: false  # Do not install smartmontools
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Install Prometheus node-exporter-collectors (bullseye)
  apt:
    update_cache: true
    name: prometheus-node-exporter-collectors
    install_recommends: false
  register: apt_result
  retries: 3
  until: apt_result is succeeded
  when:
    - ansible_lsb.codename == 'bullseye'

- name: Make Prometheus node-exporter listen on adm only
  lineinfile:
    path: /etc/default/prometheus-node-exporter
    regexp: '^ARGS='
    line: |
      ARGS="--web.listen-address={{ prometheus_node_exporter.listen_addr }}:9100"
  tags: restart-node-exporter

- name: Activate prometheus-node-exporter service
  systemd:
    name: prometheus-node-exporter
    enabled: true
    state: started

# Install new APT textfile collector, it might be upstreamed one day
# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/pull/35
- name: Patch APT textfile collector
  copy:
    src: apt.sh
    dest: /usr/share/prometheus-node-exporter/apt.sh
    owner: root
    group: root
    mode: 0755
  when: ansible_distribution_release != "bullseye"

roles/prometheus/handlers/main.yml (new file, +5)
@@ -0,0 +1,5 @@
---
- name: Restart Prometheus
  service:
    name: prometheus
    state: restarted

roles/prometheus/tasks/main.yml (new file, +42)
@@ -0,0 +1,42 @@
---
- name: Install Prometheus
  apt:
    update_cache: true
    name: prometheus
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Prometheus
  template:
    src: prometheus/prometheus.yml.j2
    dest: /etc/prometheus/prometheus.yml
    mode: 0644
  notify: Restart Prometheus

- name: Configure Prometheus alert rules
  template:
    src: prometheus/alert.rules.yml.j2
    dest: /etc/prometheus/alert.rules.yml
    mode: 0644
  notify: Restart Prometheus

# We don't need to restart Prometheus when updating nodes
- name: Configure Prometheus targets
  copy:
    content: "{{ [{'targets': item.value.targets}] | to_nice_json }}\n"
    dest: "/etc/prometheus/{{ item.value.file }}"
    mode: 0644
  loop: "{{ prometheus | dict2items }}"

- name: Activate prometheus service
  systemd:
    name: prometheus
    enabled: true
    state: started

- name: Indicate role in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-prometheus
    mode: 0755
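
Note: the "Configure Prometheus targets" task serializes each entry's targets list into the file_sd format Prometheus expects. For the nginx entry of loc_prometheus, for instance, /etc/prometheus/targets_nginx.json would come out roughly as (to_nice_json uses a 4-space indent by default):

  [
      {
          "targets": [
              "proxy.adm.ynerant.fr"
          ]
      }
  ]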

roles/prometheus/templates/prometheus/alert.rules.yml.j2 (new file, +187)
@@ -0,0 +1,187 @@
{{ ansible_header | comment }}
{# As this is also Jinja2 it will conflict without a raw block #}
{# Depending of Prometheus Node exporter version, rules can change depending of version #}
{% raw %}
groups:
  - name: alert.rules
    rules:

      # Alert for any instance that is unreachable for >3 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !"

      # Alert for out of memory
      # Do not take into account memory not used by apps
      - alert: OutOfMemory
        expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."

      # Alert for out of disk space
      - alert: OutOfDiskSpace
        expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."

      # Alert for out of inode space on disk
      - alert: OutOfInodes
        expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."

      # Alert for high CPU usage
      - alert: CpuBusy
        expr: node_load5 > 9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Charge sur {{ $labels.instance }} à {{ $value }}."

      # Check mdadm software RAID
      - alert: SoftwareRAIDDegraded
        expr: node_md_disks-node_md_disks_active > 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."

      # Check systemd unit (> buster)
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"

      # Check UPS
      - alert: UpsOutputSourceChanged
        expr: upsOutputSource != 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsBatteryStatusChanged
        expr: upsBatteryStatus != 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'état de la batterie de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsTemperatureWarning
        expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsTemperatureCritical
        expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsHighHumidity
        expr: xupsEnvRemoteHumidity > 65
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsVeryHighHumidity
        expr: xupsEnvRemoteHumidity > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsHighLoad
        expr: upsOutputPercentLoad > 70
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsWrongInputVoltage
        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsWrongOutputVoltage
        expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: AptAutoremovePending
        expr: apt_autoremove_pending > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."

      - alert: MailqNotEmpty
        expr: postfix_mailq_length > 25
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."

      - alert: NoRadiusLogin
        expr: rate(radiusd_access_ok[3m]) == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Personne ne vient taper le RADIUS."

      - alert: TooManyReallocatedSectors
        expr: smartmon_reallocated_sector_ct_raw_value > 1e3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."

{% endraw %}

roles/prometheus/templates/prometheus/prometheus.yml.j2 (new file, +42)
@@ -0,0 +1,42 @@
{{ ansible_header | comment }}

global:
  # scrape_interval is set to the global default (60s)
  # evaluation_interval is set to the global default (60s)
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'example'

# Alertmanager configuration
# Use prometheus alertmanager installed on the same machine
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alert.rules.yml"  # Monitoring alerts, this is the file you may be searching!

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
{{
  {
    "scrape_configs":
    [
      {
        "job_name": "prometheus",
        "static_configs" : [
          {
            "targets": [
              "localhost:9090"
            ]
          }
        ]
      }
    ] + (prometheus | json_query("*.config[0]"))
  } | to_nice_yaml(indent=2)
}}
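
Note: the closing Jinja2 expression builds scrape_configs as a data structure: it starts with Prometheus' own self-scrape job, appends config[0] of every entry in the merged prometheus dict (json_query("*.config[0]")), and dumps the result with to_nice_yaml. With the nginx entry from loc_prometheus, for instance, the rendered file would contain something like the following (quoting and key order depend on to_nice_yaml, so treat this as an illustration):

  scrape_configs:
  - job_name: prometheus
    static_configs:
    - targets:
      - localhost:9090
  - job_name: nginx
    file_sd_configs:
    - files:
      - /etc/prometheus/targets_nginx.json
    relabel_configs:
    - source_labels:
      - __address__
      target_label: instance
    - source_labels:
      - instance
      target_label: __address__
      replacement: $1:9117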

roles/prometheus/templates/update-motd.d/05-service.j2 (new executable file, +3)
@@ -0,0 +1,3 @@
#!/usr/bin/tail +14
{{ ansible_header | comment }}
[0m> [38;5;82mprometheus[0m a été déployé sur cette machine. Voir [38;5;6m/etc/prometheus/[0m.