Alerts

Abnormal prod busy android count (0 active)
# Fires when the summed android_states_by_cr series for namespace "prod" with
# status "busy" has stayed below 0 for 3 minutes.
# NOTE(review): if android_states_by_cr is a non-negative count, "< 0" can
# never be true and this rule is effectively disabled - confirm the threshold.
alert: Abnormal prod busy android count
expr: >-
  sum(android_states_by_cr{kubernetes_namespace="prod",status="busy"})
  < 0
for: 3m
labels:
  app: android
  severity: warning
  type: application
annotations:
  summary: 'prod busy android 数量异常'
  description: '如连续报警请检查 prepay instances'
Apiserver is down? (0 active)
# Fires when no apiserver_request_total series has been present for 3 minutes
# (absent() returns 1 only when the selector matches nothing).
alert: 'Apiserver is down?'
expr: 'absent(apiserver_request_total) == 1'
for: 3m
labels:
  app: android
  severity: warning
  type: application
annotations:
  summary: 'no apiserver_request_total metrics'
  description: 'apiserver may be unhealthy'
Evicted Pod (0 active)
# Fires when a kube_pod_container_status_terminated_reason series with
# reason "Evicted" has held the value 1 for 5 minutes. The description
# reports the instance, namespace and pod labels of the offending series.
alert: Evicted Pod
expr: >-
  kube_pod_container_status_terminated_reason{reason="Evicted"}
  == 1
for: 5m
labels:
  severity: warning
  type: source
annotations:
  summary: '发生节点被驱逐状况'
  description: '发生节点被驱逐状况,instance:{{$labels.instance}},namespace:{{$labels.namespace}},pod:{{$labels.pod}}'
HighRateOfCpuUsed (0 active)
# Per-instance CPU usage over the last hour, computed as
# 100 * (1 - idle seconds / total seconds) from node_cpu_seconds_total.
# Fires when that percentage has been above 95 for 5 minutes.
alert: HighRateOfCpuUsed
expr: >-
  100 * (1
  - sum by(instance) (increase(node_cpu_seconds_total{mode="idle"}[1h]))
  / sum by(instance) (increase(node_cpu_seconds_total[1h])))
  > 95
for: 5m
labels:
  severity: warning
  type: source
annotations:
  summary: '1小时中cpu使用超过95%'
  description: '1小时中cpu使用超过95%,instance:{{$labels.instance}}'
HighRateOfDiskUsed (0 active)
# Used-space percentage of the filesystem mounted at "/rootfs":
# (size - available) / size * 100.
# Fires when it has been above 85 for 5 minutes.
alert: HighRateOfDiskUsed
expr: >-
  (node_filesystem_size_bytes{mountpoint="/rootfs"}
  - node_filesystem_avail_bytes{mountpoint="/rootfs"})
  / node_filesystem_size_bytes{mountpoint="/rootfs"}
  * 100 > 85
for: 5m
labels:
  severity: warning
  type: source
annotations:
  summary: '硬盘占用率超过限制'
  description: '硬盘占用率超过限制85%,instance:{{$labels.instance}},device:{{$labels.device}},fstype:{{$labels.fstype}}'
HighRateOfMemoryUsed-Hour (0 active)
# Memory usage percentage from 1-hour averages:
# 100 * (1 - (MemFree + Cached + Buffers) / MemTotal).
# Fires when it has been above 90 for 5 minutes.
# NOTE(review): the description prints the kubernetes_pod_name label under
# "namespace:" and the name label under "pod:" - confirm these labels exist on
# the node_memory_* series and are the ones intended.
alert: HighRateOfMemoryUsed-Hour
expr: >-
  100 * (1 -
  ((avg_over_time(node_memory_MemFree_bytes[1h])
  + avg_over_time(node_memory_Cached_bytes[1h])
  + avg_over_time(node_memory_Buffers_bytes[1h]))
  / avg_over_time(node_memory_MemTotal_bytes[1h])))
  > 90
for: 5m
labels:
  severity: warning
  type: source
annotations:
  summary: '1小时中内存使用超过90%'
  description: '1小时中内存使用超过90%,instance:{{$labels.instance}},namespace:{{$labels.kubernetes_pod_name}},pod:{{$labels.name}}'
HighRateOfMemoryUsed-Minute (0 active)
# Memory usage percentage from 1-minute averages:
# 100 * (1 - MemAvailable / MemTotal).
# Fires when it has been above 90 for 1 minute.
# NOTE(review): the description prints the kubernetes_pod_name label under
# "namespace:" and the name label under "pod:" - confirm these labels exist on
# the node_memory_* series and are the ones intended.
alert: HighRateOfMemoryUsed-Minute
expr: >-
  100 * (1
  - avg_over_time(node_memory_MemAvailable_bytes[1m])
  / avg_over_time(node_memory_MemTotal_bytes[1m]))
  > 90
for: 1m
labels:
  severity: warning
  type: source
annotations:
  summary: '1分钟中内存使用超过90%'
  description: '1分钟中内存使用超过90%,instance:{{$labels.instance}},namespace:{{$labels.kubernetes_pod_name}},pod:{{$labels.name}}'
No Available Coordinator (0 active)
# Compares the current game_busy_count (source "no avail coordinator",
# trigger "user"), summed per appName and kubernetes_namespace, with the same
# sum 5 minutes earlier. Fires when the difference has been positive for
# 5 minutes; the description reports the difference as $value.
alert: No Available Coordinator
expr: >-
  sum by(appName, kubernetes_namespace)
  (game_busy_count{source="no avail coordinator",trigger="user"})
  - sum by(appName, kubernetes_namespace)
  (game_busy_count{source="no avail coordinator",trigger="user"} offset 5m)
  > 0
for: 5m
labels:
  app: android
  severity: warning
  type: application
annotations:
  summary: 'no avail coordinator 的次数'
  description: >-
    {{ $labels.kubernetes_namespace }} {{ $labels.appName }}
    5分钟内出现了 no avail coordinator, 当前值 {{ $value }}
Prepay instance not ready (0 active)
# Fires when not_ready_count has been higher than its value one minute
# earlier for 3 minutes; the description names the clusterName label.
# NOTE(review): with "for: 3m" the 1-minute delta must stay positive for the
# whole window, i.e. the count has to keep rising - confirm this is intended.
alert: Prepay instance not ready
expr: 'not_ready_count - (not_ready_count offset 1m) > 0'
for: 3m
labels:
  app: android
  severity: warning
  type: application
annotations:
  summary: 'prepay instance in not ready state'
  description: '请检查 prepay instances in cluster {{ $labels.clusterName}}'
Server Busy (0 active)
# Compares join_queue_count summed per appName with the same sum one minute
# earlier. Fires when the difference has been above 10 for 30 seconds; the
# description reports the difference as $value.
alert: Server Busy
expr: >-
  sum by(appName) (join_queue_count)
  - sum by(appName) (join_queue_count offset 1m)
  > 10
for: 30s
labels:
  app: android
  severity: warning
  type: application
annotations:
  summary: 'server busy的次数'
  description: >-
    {{ $labels.appName }} 1分钟内server busy的次数已经超过了10次, 当前值 {{ $value }}