# roles/prometheus/handlers/main.yml
---
# Triggered by the configuration tasks; uses the systemd module for
# consistency with tasks/main.yml (which also manages the unit via systemd).
- name: Restart Prometheus
  systemd:
    name: prometheus
    state: restarted
# roles/prometheus/tasks/main.yml
---
# Retry the install: apt may transiently fail (lock held, mirror hiccup).
- name: Install Prometheus
  apt:
    update_cache: true
    name: prometheus
  register: apt_result
  retries: 3
  until: apt_result is succeeded

# File modes are quoted: an unquoted 0644 is parsed by YAML as the
# integer 420 (octal), which Ansible then misinterprets.
- name: Configure Prometheus
  template:
    src: prometheus/prometheus.yml.j2
    dest: /etc/prometheus/prometheus.yml
    mode: "0644"
  notify: Restart Prometheus

- name: Configure Prometheus alert rules
  template:
    src: prometheus/alert.rules.yml.j2
    dest: /etc/prometheus/alert.rules.yml
    mode: "0644"
  notify: Restart Prometheus

# We don't need to restart Prometheus when updating nodes
# (Prometheus re-reads file_sd target files on its own).
- name: Configure Prometheus targets
  copy:
    content: "{{ [{'targets': item.value.targets}] | to_nice_json }}\n"
    dest: "/etc/prometheus/{{ item.value.file }}"
    mode: "0644"
  loop: "{{ prometheus | dict2items }}"

- name: Activate prometheus service
  systemd:
    name: prometheus
    enabled: true
    state: started

- name: Indicate role in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-prometheus
    mode: "0755"
{{ ansible_header | comment }}
{# This file is itself a Jinja2 template, so the Prometheus "{{ ... }}" syntax below would conflict without a raw block #}
{# Depending on the Prometheus Node exporter version, these rule expressions may need to change #}
{% raw %}
groups:
  - name: alert.rules
    rules:

      # Alert for any instance that is unreachable for >3 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !"

      # Alert for out of memory
      # Do not take into account memory not used by apps
      - alert: OutOfMemory
        expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."

      # Alert for out of disk space
      - alert: OutOfDiskSpace
        expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."

      # Alert for out of inode space on disk
      - alert: OutOfInodes
        expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."

      # Alert for high CPU usage
      - alert: CpuBusy
        expr: node_load5 > 9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Charge sur {{ $labels.instance }} à {{ $value }}."

      # Check mdadm software RAID
      - alert: SoftwareRAIDDegraded
        expr: node_md_disks-node_md_disks_active > 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."

      # Check systemd unit (> buster)
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"

      # Check UPS
      - alert: UpsOutputSourceChanged
        expr: upsOutputSource != 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsBatteryStatusChanged
        expr: upsBatteryStatus != 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'état de la batterie de {{ $labels.instance }} a changé !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsTemperatureWarning
        expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsTemperatureCritical
        expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsHighHumidity
        expr: xupsEnvRemoteHumidity > 65
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsVeryHighHumidity
        expr: xupsEnvRemoteHumidity > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsHighLoad
        expr: upsOutputPercentLoad > 70
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsWrongInputVoltage
        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: UpsWrongOutputVoltage
        expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
          description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

      - alert: AptAutoremovePending
        expr: apt_autoremove_pending > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."

      - alert: MailqNotEmpty
        expr: postfix_mailq_length > 25
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."

      - alert: NoRadiusLogin
        expr: rate(radiusd_access_ok[3m]) == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Personne ne vient taper le RADIUS."

      - alert: TooManyReallocatedSectors
        expr: smartmon_reallocated_sector_ct_raw_value > 1e3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."

{% endraw %}
{{ ansible_header | comment }}

global:
  # scrape_interval is set to the global default (60s)
  # evaluation_interval is set to the global default (60s)
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'example'

# Alertmanager configuration
# Use prometheus alertmanager installed on the same machine
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alert.rules.yml"  # Monitoring alerts, this is the file you may be searching!

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
{{
  {
    "scrape_configs":
      [
        {
          "job_name": "prometheus",
          "static_configs": [
            {
              "targets": [
                "localhost:9090"
              ]
            }
          ]
        }
      ] + (prometheus | json_query("*.config[0]"))
  } | to_nice_yaml(indent=2)
}}
#!/usr/bin/tail +14
{{ ansible_header | comment }}
[0m> [38;5;82mprometheus[0m a été déployé sur cette machine. Voir [38;5;6m/etc/prometheus/[0m.
Reference in New Issue
Block a user