Alertmanager email告警
安装alertmanager
# 官网下载alertmanager
https://github.com/prometheus/alertmanager/releases/download/v0.22.0/alertmanager-0.22.0.linux-amd64.tar.gz
# 解压
tar zxf alertmanager-0.22.0.linux-amd64.tar.gz
# 移动到本地软件目录
mv alertmanager-0.22.0.linux-amd64 /usr/local/alertmanager
修改alertmanager配置
# https://blog.csdn.net/weixin_45880055/article/details/120585024
$ vim alertmanager.yml
# Alertmanager configuration: send every alert to a single e-mail receiver.
# Nesting is significant in YAML: each child key must stay indented under its
# parent section, otherwise Alertmanager rejects the file at load time.

# Global defaults shared by all receivers.
global:
  # Time after which an alert that is no longer updated is declared resolved
  # (default 5m).
  resolve_timeout: 5m
  # SMTP relay, written as host:port.
  smtp_smarthost: 'smtp.qq.com:25'
  # Sender address (fill in).
  smtp_from: ''
  # SMTP login name (fill in).
  smtp_auth_username: ''
  # SMTP authorization code for the mailbox (fill in).
  smtp_auth_password: ''

# Notification template files to load.
templates:
  - '/usr/local/alertmanager/template/test.tmpl'

# Root of the routing tree.
route:
  # Labels used to batch alerts into a single notification.
  group_by: ['alertname']
  # How long to wait before sending the first notification for a new group.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group.
  group_interval: 10s
  # How long to wait before repeating a notification that is still firing.
  # Do not set this too low for e-mail.
  repeat_interval: 5m
  # Default receiver; must match a `name` under `receivers` below.
  receiver: 'email'

# Receivers that notifications can be routed to.
receivers:
  - name: 'email'
    email_configs:
      # Recipient address(es); separate several with commas (fill in).
      - to: ''
        # HTML body rendered from the template defined in test.tmpl.
        html: '{{ template "email.to.html" . }}'
        headers:
          Subject: "[WARN] prometheus报警邮件"
        # Also send a notification when the alert recovers.
        send_resolved: true

# An inhibition rule mutes alerts matching the target matchers while an alert
# matching the source matchers is firing. Both alerts must carry identical
# values for every label listed under `equal`.
# NOTE(review): the alerting rules in this guide only emit severity `serious`
# and `warning`; confirm that `critical` is the intended source severity.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
# 在Prometheus目录下,创建一个rules目录
]# cd /usr/local/prometheus && mkdir rules
定义prometheus告警规则
]# cd rules/ && vim rules.yml
# Prometheus alerting rules: instance liveness, memory, disk, CPU and
# promtail container liveness. Each group holds a single alert.
groups:
  - name: 实例存活告警规则
    rules:
      # Alert name, exposed to Alertmanager as the `alertname` label.
      - alert: 实例存活告警
        # Fires for any scrape target whose `up` metric is 0.
        expr: up == 0
        # The condition must hold this long before the alert fires.
        for: 15s
        labels:
          severity: serious
        # Human-readable details attached to the alert.
        annotations:
          summary: "节点已停止运行超过15s!"
          description: "检测到异常停止!请重点关注!!!"

  - name: 内存告警规则
    rules:
      - alert: "内存使用率告警"
        # Used memory (total minus free, buffers and cache) as a percentage
        # of total; fires above 85%.
        expr: >-
          (node_memory_MemTotal_bytes
          - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes))
          / node_memory_MemTotal_bytes * 100 > 85
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "服务器 内存报警"
          description: "内存资源利用率大于85%! (当前值:{{ $value }}%)"

  - name: 磁盘告警规则
    rules:
      - alert: 磁盘使用率告警
        # Used space per filesystem as a percentage of its size; fires above
        # 90%. The metric name is node_filesystem_avail_bytes (the previous
        # spelling "butes" matched no series, so the alert never fired).
        expr: >-
          (node_filesystem_size_bytes - node_filesystem_avail_bytes)
          / node_filesystem_size_bytes * 100 > 90
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "服务器 磁盘报警"
          # Text states 90% so that it agrees with the threshold in `expr`.
          description: "服务器磁盘使用大于90%! (挂载点:{{ $labels.mountpoint }} 当前值:{{ $value }}%)"

  - name: cpu使用率告警规则
    rules:
      - alert: "cpu使用告警"
        # 100 minus the average idle percentage per instance over a 3m
        # window; fires above 80%. Only series with job="centos1" match.
        expr: >-
          100 - avg(irate(node_cpu_seconds_total{job="centos1",mode="idle"}[3m])) by (instance) * 100 > 80
        for: 20s
        labels:
          severity: serious
        annotations:
          summary: '服务器 cpu报警'
          description: '服务器cpu使用率大于80%!(节点:{{ $labels.instance }})'

  - name: 容器存活告警规则
    rules:
      - alert: "promtail存活告警"
        # absent() yields 1 when no series matches the selector.
        expr: absent(container_last_seen{name="promtail"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          summary: 'promtail容器告警'
          # Text states 10s so that it agrees with `for` above.
          description: 'promtail容器异常停止超过10s!'
修改prometheus配置文件
$ vim prometheus.yml
……
# Alertmanager instance(s) that Prometheus pushes firing alerts to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # host:port of the Alertmanager listener.
            - '192.168.8.204:9093'
# Alerting rule files to load; the glob matches every .yml file in the
# rules directory.
rule_files:
  - "/usr/local/prometheus/rules/*.yml"
……
# 创建模板位置
]# cd /usr/local/alertmanager/ && mkdir template
配置告警通知模版
$ vim template/test.tmpl
{{ define "email.to.html" }}
{{- /* HTML e-mail body: one section per firing alert, then one section per resolved alert. */ -}}
{{- /* Iterate .Alerts.Firing, not .Alerts, so resolved alerts are not listed under the firing heading. */ -}}
{{- if gt (len .Alerts.Firing) 0 -}}
{{ range .Alerts.Firing }}
<h2>@告警通知</h2>
==========start==========<br>
告警程序:prometheus_alert <br>
告警级别:{{ .Labels.severity }} 级 <br>
告警类型:{{ .Labels.alertname }} <br>
故障主机:{{ .Labels.instance }} <br>
告警主题:{{ .Annotations.summary }} <br>
告警详情:{{ .Annotations.description }} <br>
触发时间:{{ .StartsAt.Local.Format "2006-01-02 15:04:05" }} <br>
==========end==========<br>
{{ end }}{{ end -}}
{{- /* Iterate .Alerts.Resolved so that only recovered alerts get a recovery section. */ -}}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{ range .Alerts.Resolved }}
<h2>@恢复通知</h2>
==========start==========<br>
告警程序:prometheus_alert <br>
告警主机:{{ .Labels.instance }} <br>
告警主题:{{ .Annotations.summary }} <br>
告警详情:{{ .Annotations.description }} <br>
告警时间:{{ .StartsAt.Local.Format "2006-01-02 15:04:05" }} <br>
恢复时间:{{ .EndsAt.Local.Format "2006-01-02 15:04:05" }} <br>
==========end==========<br>
{{ end }}{{ end -}}
{{- end }}
启动alertmanager
./alertmanager --config.file=alertmanager.yml &