Alertmanager 钉钉告警

安装alertmanager dingtalk

# 官网下载alertmanager
https://github.com/prometheus/alertmanager/releases/download/v0.22.0/alertmanager-0.22.0.linux-amd64.tar.gz

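# 下载钉钉告警所需的 webhook 转发插件 prometheus-webhook-dingtalk(独立程序,并非补丁)
https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz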

# Unpack both release tarballs.
tar zxf alertmanager-0.22.0.linux-amd64.tar.gz
tar zxf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz

# Move them into the local software directory.
# The Alertmanager tarball unpacks to a directory named after the archive
# (alertmanager-0.22.0.linux-amd64), so that is the directory to move.
mv alertmanager-0.22.0.linux-amd64 /usr/local/alertmanager
mv prometheus-webhook-dingtalk-1.4.0.linux-amd64 /usr/local/alertmanager/prometheus-webhook-dingtalk

修改alertmanager配置

vim alertmanager.yml 
# Global settings.
global:
  resolve_timeout: 5m  # resolve timeout (default 5m)

# Routing tree.
route:
  group_by: ['alertname']  # labels used to group alerts into one notification
  group_wait: 20s  # how long to wait before sending the first notification for a group
  group_interval: 40s  # how long to wait before sending a notification about new alerts
  repeat_interval: 20m  # interval between repeated notifications; should not be set too low
  receiver: 'ops_dingding'  # default receiver

# Alert receivers.
receivers:
  - name: 'ops_dingding'
    webhook_configs:
      # Points at prometheus-webhook-dingtalk. The port must match its
      # --web.listen-address (":8061" in the start command) and the path
      # segment after /dingtalk/ must be a target name defined in its
      # config.yml (webhook1 is the target carrying the real robot token).
      - url: 'http://150.158.39.86:8061/dingtalk/webhook1/send'
        send_resolved: true  # also notify when an alert is resolved

# An inhibition rule mutes alerts matching the target matchers while an alert
# matching the source matchers is firing. Both alerts must carry identical
# values for every label listed in `equal`.
# NOTE(review): the rule files in these notes only emit severity `serious`
# and `warning`, so the `critical` source matcher never matches any of them;
# confirm whether `serious` was intended.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

监控告警规则-服务器资源

# Host-level alerting rules: instance liveness, memory, disk and CPU.
# The title/summary/description annotations are the fields printed by the
# DingTalk message template.
groups:
  - name: 实例存活告警规则
    rules:
      - alert: 实例存活告警
        # Fires when a scrape target reports as down.
        expr: up == 0
        # The condition must hold this long before the alert fires.
        for: 15s
        labels:
          severity: serious
        annotations:
          title: "实例告警"
          summary: "节点已停止运行超过15s!"
          description: "检测到服务异常停止!"

  - name: 内存告警规则
    rules:
      - alert: 内存使用率告警
        # Used memory = total - (free + buffers + cached).
        # Fires when usage exceeds 95%.
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 95
        for: 30s
        labels:
          severity: warning
        annotations:
          title: "内存告警"
          summary: "服务器 内存报警"
          description: "内存资源利用率大于95%! (当前值:{{ $value }}%)"

  - name: 磁盘告警规则
    rules:
      - alert: 磁盘使用率告警
        # Fires when any mount point is more than 90% full.
        # NOTE(review): no fstype/mountpoint filter is applied, so every
        # filesystem series is evaluated - confirm that is intended.
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 90
        for: 1m
        labels:
          severity: warning
        annotations:
          title: "磁盘告警"
          summary: "服务器 磁盘报警"
          # Message text kept in step with the 90% threshold in expr.
          description: "服务器磁盘使用大于90%! (挂载点:{{ $labels.mountpoint }} 当前值:{{ $value }}%)"

  - name: cpu使用率告警规则
    rules:
      - alert: cpu使用告警
        # Per-instance CPU usage = 100 - average idle percentage.
        # Only series with job="centos1" are considered.
        expr: 100 - avg(irate(node_cpu_seconds_total{job="centos1",mode="idle"}[3m]))by(instance) * 100 > 80
        for: 20s
        labels:
          severity: serious
        annotations:
          title: "CPU告警"
          summary: '服务器 cpu报警'
          description: '服务器cpu使用率大于80%!(节点:{{ $labels.instance }})'

监控告警规则-容器

# Container liveness rules. Each alert fires when no container_last_seen
# series exists for the named container for at least 10 seconds.
groups:
  - name: 容器存活告警规则
    rules:
      # kafka
      - alert: kafka存活告警
        expr: absent(container_last_seen{name="kafka"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: "容器告警"
          summary: "kafka容器告警"
          description: "kafka容器Exited了!"

      # zookeeper
      - alert: zookeeper存活告警
        expr: absent(container_last_seen{name="zookeeper"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: "容器告警"
          summary: "zookeeper容器告警"
          description: "zookeeper容器Exited了!"

      # promtail
      - alert: promtail存活告警
        expr: absent(container_last_seen{name="promtail"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: "容器告警"
          summary: "promtail容器告警"
          description: "promtail容器Exited了!"

      # loki
      - alert: loki存活告警
        expr: absent(container_last_seen{name="loki"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: "容器告警"
          summary: "loki容器告警"
          description: "loki容器Exited了!"

      # emqx_test
      - alert: emqx_test存活告警
        expr: absent(container_last_seen{name="emqx_test"}) == 1
        for: 10s
        labels:
          severity: serious
        annotations:
          title: "容器告警"
          summary: "emqx_test容器告警"
          description: "emqx_test容器Exited了!"

dingtalk配置修改

vim config.yml
## Request timeout
# timeout: 5s

## Customizable templates path
## Must be the location of default.tmpl inside the install directory
## (/usr/local/alertmanager/prometheus-webhook-dingtalk).
templates:
  - '/usr/local/alertmanager/prometheus-webhook-dingtalk/default.tmpl'

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
# default_message:
#   title: '{{ template "legacy.title" . }}'
#   text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
## Each key is a target name; Alertmanager posts to /dingtalk/<name>/send.
targets:
  webhook1:
    # SECURITY: never commit a real robot access token or signing secret.
    # Replace the placeholders below with your own robot's values and keep
    # the real file out of version control.
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # secret for signature
    secret: SECxxxxxxxxxxxx
  webhook2:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # Customize template content
    message:
      # Use legacy template
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      mobiles: ['156xxxx8827', '189xxxx8325']

定义钉钉告警模版

cd /usr/local/alertmanager/prometheus-webhook-dingtalk
vim default.tmpl
{{/* Title fragment: upper-cased status, followed by ":<count of firing alerts>" when the status is "firing". */}}
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}

{{/* Renders one markdown section per alert in the list passed in: optional @owner mention, title annotation, severity / instance / job labels, summary and description annotations, and StartsAt formatted in the Asia/Shanghai zone. */}}
{{ define "__alert_list" }}{{ range . }}
---
{{ if .Labels.owner }}@{{ .Labels.owner }}{{ end }}

**告警名称**: {{ index .Annotations "title" }}

**告警级别**: {{ .Labels.severity }}

**告警主机**: {{ .Labels.instance }}

**告警服务**: {{ .Labels.job }}

**告警主题**: {{ .Annotations.summary }}

**告警信息**: {{ index .Annotations "description" }}

**告警时间**: {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
{{ end }}{{ end }}

{{/* Same per-alert section as __alert_list (summary and job appear in the opposite order), plus the recovery time taken from EndsAt. */}}
{{ define "__resolved_list" }}{{ range . }}
---
{{ if .Labels.owner }}@{{ .Labels.owner }}{{ end }}

**告警名称**: {{ index .Annotations "title" }}

**告警级别**: {{ .Labels.severity }}

**告警主机**: {{ .Labels.instance }}

**告警主题**: {{ .Annotations.summary }}

**告警服务**: {{ .Labels.job }}

**告警信息**: {{ index .Annotations "description" }}

**告警时间**: {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}

**恢复时间**: {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}
{{ end }}{{ end }}

{{/* Message title: delegates to __subject. */}}
{{ define "default.title" }}
{{ template "__subject" . }}
{{ end }}

{{/* Message body: a firing section and a resolved section, each emitted only when its list is non-empty. */}}
{{ define "default.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**====侦测到{{ .Alerts.Firing | len  }}个故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}

{{ if gt (len .Alerts.Resolved) 0 }}
**====恢复{{ .Alerts.Resolved | len  }}个故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}


{{/* ding.link.title / ding.link.content delegate to the default.* templates above. Presumably these are the names the webhook looks up when a target sets no custom message - TODO confirm. */}}
{{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
{{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}
{{/* NOTE(review): the two calls below sit outside any define block, so they belong to the file's top-level template rather than to a named one - confirm whether they are needed. */}}
{{ template "default.title" . }}
{{ template "default.content" . }}

启动alertmanager webhook-dingtalk

# Start Alertmanager in the background (run from the Alertmanager install
# directory so the relative alertmanager.yml path resolves).
# nohup plus log redirection keeps the process running and its output
# captured after the launching shell exits.
nohup ./alertmanager --config.file=alertmanager.yml > alertmanager.log 2>&1 &

# Start the DingTalk webhook adapter (run from its own directory, where
# config.yml lives). The listen port given here is the port the Alertmanager
# webhook url has to target.
nohup ./prometheus-webhook-dingtalk --config.file=config.yml --web.listen-address=":8061" > webhook.log 2>&1 &