Alertmanager 钉钉告警
安装alertmanager dingtalk
# Download the Alertmanager 0.22.0 release tarball from the official GitHub releases.
# Release assets live under /releases/download/<tag>/<file>.
wget https://github.com/prometheus/alertmanager/releases/download/v0.22.0/alertmanager-0.22.0.linux-amd64.tar.gz
# Download the DingTalk webhook bridge (prometheus-webhook-dingtalk) 1.4.0.
# Other versions are listed at https://github.com/timonwong/prometheus-webhook-dingtalk/releases
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
# Unpack both archives
tar zxf alertmanager-0.22.0.linux-amd64.tar.gz
tar zxf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
# Move them into the local software directory.
# The extracted directory name carries the platform suffix (.linux-amd64).
mv alertmanager-0.22.0.linux-amd64 /usr/local/alertmanager
mv prometheus-webhook-dingtalk-1.4.0.linux-amd64 /usr/local/alertmanager/prometheus-webhook-dingtalk
修改alertmanager配置
vim alertmanager.yml
# Global defaults shared by the rest of the configuration
global:
  # Time after which an alert that is no longer reported is treated as resolved
  resolve_timeout: 5m

# Routing tree
route:
  # Labels used to group alerts into one notification
  group_by: ['alertname']
  # How long to wait before sending the first notification for a new group
  group_wait: 20s
  # How long to wait before notifying about new alerts added to an existing group
  group_interval: 40s
  # How long to wait before re-sending a notification that is still firing.
  # Do not set this too low when an email receiver is used.
  repeat_interval: 20m
  # Default receiver
  receiver: 'ops_dingding'

# Notification receivers
receivers:
  - name: 'ops_dingding'
    webhook_configs:
      # Must point at the prometheus-webhook-dingtalk listener:
      # port 8061 is the --web.listen-address it is started with, and
      # "webhook1" is the target name defined in its config.yml.
      - url: http://150.158.39.86:8061/dingtalk/webhook1/send
        # Also notify when the alert is resolved
        send_resolved: true

# An inhibition rule mutes alerts matching the target matchers while an alert
# matching the source matchers is firing. Both alerts must have identical
# values for every label listed in `equal`.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
监控告警规则-服务器资源
# Prometheus alerting rules for host-level resources
groups:
  - name: 实例存活告警规则
    rules:
      # Alert name
      - alert: 实例存活告警
        # Firing condition (PromQL): `up` is 0 when the target cannot be scraped
        expr: up == 0
        # The condition must hold this long before the alert fires
        for: 15s
        # Extra labels attached to the alert
        labels:
          severity: serious
        # Human-readable details shown in the notification
        annotations:
          title: "实例告警"
          summary: "节点已停止运行超过15s!"
          description: "检测到服务异常停止!"

  - name: 内存告警规则
    rules:
      - alert: 内存使用率告警
        # Fires when memory usage (total - free - buffers - cached) exceeds 95%
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 95
        for: 30s
        labels:
          severity: warning
        annotations:
          title: "内存告警"
          summary: "服务器 内存报警"
          description: "内存资源利用率大于95%! (当前值:{{ $value }}%)"

  - name: 磁盘告警规则
    rules:
      - alert: 磁盘使用率告警
        # Fires when any mount point is more than 90% full
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 90
        for: 1m
        labels:
          severity: warning
        annotations:
          title: "磁盘告警"
          summary: "服务器 磁盘报警"
          # Text matches the 90% threshold used in expr
          description: "服务器磁盘使用大于90%! (挂载点:{{ $labels.mountpoint }} 当前值:{{ $value }}%)"

  - name: cpu使用率告警规则
    rules:
      - alert: cpu使用告警
        # Per-instance CPU usage = 100 - idle percentage; fires above 80%.
        # Only series from job "centos1" are evaluated.
        expr: 100 - avg(irate(node_cpu_seconds_total{job="centos1",mode="idle"}[3m])) by (instance) * 100 > 80
        for: 20s
        labels:
          severity: serious
        annotations:
          title: "CPU告警"
          summary: '服务器 cpu报警'
          description: '服务器cpu使用率大于80%!(节点:{{ $labels.instance }})'
监控告警规则-容器
# Prometheus alerting rules for container liveness.
# Each rule fires when no `container_last_seen` series exists for the named
# container for at least 10 seconds.
groups:
  - name: 容器存活告警规则
    rules:
      - alert: "kafka存活告警"
        expr: absent(container_last_seen{name="kafka"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: '容器告警'
          summary: 'kafka容器告警'
          description: 'kafka容器Exited了!'

      - alert: "zookeeper存活告警"
        expr: absent(container_last_seen{name="zookeeper"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: '容器告警'
          summary: 'zookeeper容器告警'
          description: 'zookeeper容器Exited了!'

      - alert: "promtail存活告警"
        expr: absent(container_last_seen{name="promtail"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: '容器告警'
          summary: 'promtail容器告警'
          description: 'promtail容器Exited了!'

      - alert: "loki存活告警"
        expr: absent(container_last_seen{name="loki"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: '容器告警'
          summary: 'loki容器告警'
          description: 'loki容器Exited了!'

      - alert: "emqx_test存活告警"
        expr: absent(container_last_seen{name="emqx_test"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: '容器告警'
          summary: 'emqx_test容器告警'
          description: 'emqx_test容器Exited了!'
dingtalk配置修改
vim config.yml
## Request timeout
# timeout: 5s

## Customizable templates path
## Must match the directory the webhook bridge was installed into.
templates:
  - '/usr/local/alertmanager/prometheus-webhook-dingtalk/default.tmpl'

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
# default_message:
#   title: '{{ template "legacy.title" . }}'
#   text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
## Each key is the target name used in the request path /dingtalk/<target>/send
targets:
  webhook1:
    # Replace the placeholder with your own robot access token.
    # Never publish a real token or secret; rotate any that were exposed.
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # secret for signature
    secret: SECxxxxxxxxxxxx
  webhook2:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # Customize template content
    message:
      # Use legacy template
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      mobiles: ['156xxxx8827', '189xxxx8325']
定义钉钉告警模版
cd /usr/local/alertmanager/prometheus-webhook-dingtalk
vim default.tmpl
{{/* __subject: "[STATUS]" title; when the status is "firing" it also appends ":<number of firing alerts>". */}}
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}
{{/* __alert_list: renders one markdown section per alert in the list passed in (title, severity, instance, job, summary, description, start time in Asia/Shanghai). */}}
{{/* An optional "owner" label is rendered as an @mention line. */}}
{{/* NOTE(review): DingTalk markdown may need a blank line between fields to break lines; confirm whether blank lines were lost from this listing. */}}
{{ define "__alert_list" }}{{ range . }}
---
{{ if .Labels.owner }}@{{ .Labels.owner }}{{ end }}
**告警名称**: {{ index .Annotations "title" }}
**告警级别**: {{ .Labels.severity }}
**告警主机**: {{ .Labels.instance }}
**告警服务**: {{ .Labels.job }}
**告警主题**: {{ .Annotations.summary }}
**告警信息**: {{ index .Annotations "description" }}
**告警时间**: {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
{{ end }}{{ end }}
{{/* __resolved_list: same fields as __alert_list plus the recovery time taken from .EndsAt. */}}
{{ define "__resolved_list" }}{{ range . }}
---
{{ if .Labels.owner }}@{{ .Labels.owner }}{{ end }}
**告警名称**: {{ index .Annotations "title" }}
**告警级别**: {{ .Labels.severity }}
**告警主机**: {{ .Labels.instance }}
**告警主题**: {{ .Annotations.summary }}
**告警服务**: {{ .Labels.job }}
**告警信息**: {{ index .Annotations "description" }}
**告警时间**: {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
**恢复时间**: {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}
{{ end }}{{ end }}
{{/* default.title: message title, delegates to __subject. */}}
{{ define "default.title" }}
{{ template "__subject" . }}
{{ end }}
{{/* default.content: message body; a firing section and a resolved section, each emitted only when its list is non-empty. */}}
{{ define "default.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**====侦测到{{ .Alerts.Firing | len }}个故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
**====恢复{{ .Alerts.Resolved | len }}个故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{/* ding.link.title / ding.link.content: reuse the default title and content. */}}
{{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
{{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}
{{/* The two lines below sit outside any define block, so they belong to the file-level template body. */}}
{{ template "default.title" . }}
{{ template "default.content" . }}
启动alertmanager webhook-dingtalk
# Start Alertmanager from its install directory.
# nohup keeps it running after the shell exits; output goes to alertmanager.log.
cd /usr/local/alertmanager
nohup ./alertmanager --config.file=alertmanager.yml > alertmanager.log 2>&1 &
# Start the DingTalk webhook bridge from its own directory, listening on port 8061.
# Output goes to webhook.log.
cd /usr/local/alertmanager/prometheus-webhook-dingtalk
nohup ./prometheus-webhook-dingtalk --config.file=config.yml --web.listen-address=":8061" > webhook.log 2>&1 &