Alertmanager email告警

安装alertmanager

# Download the Alertmanager 0.22.0 release tarball from the GitHub releases page
wget https://github.com/prometheus/alertmanager/releases/download/v0.22.0/alertmanager-0.22.0.linux-amd64.tar.gz

# Extract the archive
tar zxf alertmanager-0.22.0.linux-amd64.tar.gz

# Move the extracted directory (it carries the tarball's full name) to the local software directory
mv alertmanager-0.22.0.linux-amd64 /usr/local/alertmanager

修改alertmanager配置

# https://blog.csdn.net/weixin_45880055/article/details/120585024

$ vim alertmanager.yml 
# Global settings shared by every notification
global:
  # Time after which an alert is treated as resolved (5m is the default)
  resolve_timeout: 5m
  # SMTP server (host:port) used to send mail
  smtp_smarthost: "smtp.qq.com:25"
  # Sender mailbox address
  smtp_from: ""
  # SMTP login user (the mailbox name)
  smtp_auth_username: ""
  # Authorization code of the mailbox
  smtp_auth_password: ""

# Notification template files
templates:
  - "/usr/local/alertmanager/template/test.tmpl"

# Routing tree
route:
  # Labels by which alerts are grouped
  group_by:
    - alertname
  # How long to wait before sending the first notification for a group
  group_wait: 10s
  # How long to wait before sending a notification about new alerts
  group_interval: 10s
  # Interval between repeated notifications; must not be too low for email
  repeat_interval: 5m
  # Receiver of the notifications; refers to a name under `receivers` below
  receiver: email

# Notification receivers
receivers:
  - name: email
    # Email settings
    email_configs:
      # `to` holds the recipient address(es); separate several with commas
      - to: ""
        html: '{{ template "email.to.html" . }}'
        headers:
          Subject: "[WARN] prometheus报警邮件"
        # Also notify once the problem is resolved
        send_resolved: true

# An inhibition rule mutes alerts matching one set of matchers (target) while
# an alert matching another set of matchers (source) exists. Both alerts must
# share the same set of labels listed under `equal`.
inhibit_rules:
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal:
      - alertname
      - dev
      - instance

# 在Prometheus目录下,创建一个rules目录
]# cd /usr/local/prometheus && mkdir rules

定义prometheus告警规则

]# cd rules/ && vim rules.yml
groups:
 - name: "实例存活告警规则"
   rules:
     # `alert` is the name of the alerting rule
     - alert: "实例存活告警"
       # `expr` is the expression that is evaluated for this rule
       expr: "up == 0"
       # How long the condition has to hold before the alert is sent
       for: 15s
       labels:
         severity: "serious"
       # Annotations carry the detailed, human-readable alert information
       annotations:
         summary: '节点已停止运行超过15s!'
         description: '检测到异常停止!请重点关注!!!'
 - name: 内存告警规则
   rules:
   - alert: "内存使用率告警"
     # Memory usage in percent: (total - (free + buffers + cached)) / total * 100.
     # The alert condition is a usage above 85%.
     # NOTE: `expr` must sit at the same indentation as `alert`/`for`; otherwise
     # the rule mapping is broken and the rules file cannot be parsed.
     expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 85
     # The condition has to hold for 30s before the alert is sent
     for: 30s
     labels:
       severity: warning
     annotations:
       summary: "服务器 内存报警"
       description: "内存资源利用率大于85%! (当前值:{{ $value }}%)"
 - name: 磁盘告警规则
   rules:
   - alert: 磁盘使用率告警
     # Disk usage in percent per filesystem: (size - available) / size * 100.
     # The alert condition is a mount point used by more than 90%.
     expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 90
     # The condition has to hold for 1m before the alert is sent
     for: 1m
     labels:
       severity: warning
     annotations:
       summary: "服务器 磁盘报警"
       # The percentage in the text matches the 90% threshold of `expr`
       description: "服务器磁盘使用大于90%! (挂载点:{{ $labels.mountpoint }} 当前值:{{ $value }}%)"
 - name: "cpu使用率告警规则"
   rules:
     - alert: 'cpu使用告警'
       # 100 minus the per-instance average idle rate (in percent) of job
       # "centos1" over a 3m window; the condition is a result above 80
       expr: '100 - avg(irate(node_cpu_seconds_total{job="centos1",mode="idle"}[3m]))by(instance) * 100 > 80'
       # The condition has to hold for 20s before the alert is sent
       for: 20s
       labels:
         severity: "serious"
       annotations:
         summary: "服务器 cpu报警"
         description: "服务器cpu使用率大于80%!(节点:{{ $labels.instance }})"

 - name: 容器存活告警规则
   rules:
   - alert: "promtail存活告警"
     # Condition: no `container_last_seen` series with name="promtail" exists
     expr: absent(container_last_seen{name="promtail"}) == 1
     # The condition has to hold for 10s before the alert is sent
     for: 10s
     labels:
       severity: serious
     annotations:
       summary: 'promtail容器告警'
       # The duration in the text matches the `for: 10s` setting above
       description:  'promtail容器异常停止超过10s!'

修改prometheus配置文件

$ vim prometheus.yml
……
# Alertmanager endpoint(s) configured for alerting
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "192.168.8.204:9093"
# Rule files to load; the glob covers every .yml file in the rules directory
rule_files:
  - "/usr/local/prometheus/rules/*.yml"
……

# 创建模板位置
]# cd /usr/local/alertmanager/  && mkdir template 

配置告警通知模版

$ vim template/test.tmpl
{{/*
  email.to.html: HTML body of the alert e-mail.
  The first section is rendered when the notification contains firing alerts
  and iterates over .Alerts.Firing only; the second section is rendered when it
  contains resolved alerts and iterates over .Alerts.Resolved only. Ranging
  over .Alerts in both sections would list every alert under both headings
  whenever a notification mixes firing and resolved alerts.
*/}}
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{ range .Alerts.Firing }}
<h2>@告警通知</h2>
==========start==========<br>
告警程序:prometheus_alert <br>
告警级别:{{ .Labels.severity }} 级 <br>
告警类型:{{ .Labels.alertname }} <br>
故障主机:{{ .Labels.instance }} <br>
告警主题:{{ .Annotations.summary }} <br>
告警详情:{{ .Annotations.description }} <br>
触发时间:{{ .StartsAt.Local.Format "2006-01-02 15:04:05" }} <br>
==========end==========<br>
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{ range .Alerts.Resolved }}
<h2>@恢复通知</h2>
==========start==========<br>
告警程序:prometheus_alert <br>
告警主机:{{ .Labels.instance }} <br>
告警主题:{{ .Annotations.summary }} <br>
告警详情:{{ .Annotations.description }} <br>
告警时间:{{ .StartsAt.Local.Format "2006-01-02 15:04:05" }} <br>
恢复时间:{{ .EndsAt.Local.Format "2006-01-02 15:04:05" }} <br>
==========end==========<br>
{{ end }}{{ end -}}
{{- end }}

启动alertmanager

# Start Alertmanager as a background job (`&`), reading alertmanager.yml from
# the current directory; run this from the directory holding both files.
./alertmanager --config.file=alertmanager.yml  &