Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
# Alert "Server Busy": per appName, compares the summed join_queue_count with
# its value one minute earlier and fires when the difference stays above 10
# for 30s.
# NOTE(review): if join_queue_count is a counter, resets or newly appearing
# series skew this subtraction; increase(join_queue_count[1m]) may be what is
# intended — confirm the metric type before changing.
alert: Server Busy
expr: >-
  sum by(appName) (join_queue_count)
  - sum by(appName) (join_queue_count offset 1m)
  > 10
for: 30s
labels:
  app: android
  severity: warning
  type: application
annotations:
  description: '{{ $labels.appName }} 1分钟内server busy的次数已经超过了10次, 当前值 {{ $value }}'
  summary: 'server busy的次数'
|
ok
|
|
9.13s ago
|
4.096ms |
# Alert "No Available Coordinator": per (appName, kubernetes_namespace),
# fires when the summed game_busy_count for source="no avail coordinator",
# trigger="user" is higher than it was five minutes earlier, for 5m.
alert: No Available Coordinator
expr: >-
  sum by(appName, kubernetes_namespace)
  (game_busy_count{source="no avail coordinator",trigger="user"})
  - sum by(appName, kubernetes_namespace)
  (game_busy_count{source="no avail coordinator",trigger="user"}
  offset 5m)
  > 0
for: 5m
labels:
  app: android
  severity: warning
  type: application
annotations:
  description: '{{ $labels.kubernetes_namespace }} {{ $labels.appName }} 5分钟内出现了 no avail coordinator, 当前值 {{ $value }}'
  summary: 'no avail coordinator 的次数'
|
ok
|
|
9.126s ago
|
157.3us |
# Alert "Abnormal prod busy android count": fires when the sum of
# android_states_by_cr (namespace "prod", status "busy") is below zero for 3m.
# NOTE(review): this can only fire if the metric is able to go negative;
# confirm whether "< 0" (rather than e.g. "== 0") is the intended condition.
alert: Abnormal prod busy android count
expr: >-
  sum(android_states_by_cr{kubernetes_namespace="prod",status="busy"})
  < 0
for: 3m
labels:
  app: android
  severity: warning
  type: application
annotations:
  description: '如连续报警请检查 prepay instances'
  summary: 'prod busy android 数量异常'
|
ok
|
|
9.126s ago
|
721us |
# Alert "Prepay instance not ready": fires when not_ready_count is greater
# than its own value one minute earlier, and that holds for 3m.
alert: Prepay instance not ready
expr: >-
  not_ready_count - (not_ready_count offset 1m) > 0
for: 3m
labels:
  app: android
  severity: warning
  type: application
annotations:
  description: '请检查 prepay instances in cluster {{ $labels.clusterName}}'
  summary: 'prepay instance in not ready state'
|
ok
|
|
9.125s ago
|
65.64us |
# Alert "Apiserver is down?": fires when no apiserver_request_total series
# exists at all (absent() yields 1 in that case) for 3m.
alert: 'Apiserver is down?'
expr: >-
  absent(apiserver_request_total) == 1
for: 3m
labels:
  app: android
  severity: warning
  type: application
annotations:
  description: 'apiserver may be unhealthy'
  summary: 'no apiserver_request_total metrics'
|
ok
|
|
9.125s ago
|
7.399ms |
|
4.17s ago |
4.718ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
# Alert HighRateOfDiskUsed: for the filesystem mounted at /rootfs, computes
# (size - avail) / size * 100 and fires when it stays above 85 for 5m.
alert: HighRateOfDiskUsed
expr: >-
  (node_filesystem_size_bytes{mountpoint="/rootfs"}
  - node_filesystem_avail_bytes{mountpoint="/rootfs"})
  / node_filesystem_size_bytes{mountpoint="/rootfs"}
  * 100 > 85
for: 5m
labels:
  severity: warning
  type: source
annotations:
  description: '硬盘占用率超过限制85%,instance:{{$labels.instance}},device:{{$labels.device}},fstype:{{$labels.fstype}}'
  summary: '硬盘占用率超过限制'
|
ok
|
|
4.17s ago
|
387.2us |
# Alert "Evicted Pod": fires when a
# kube_pod_container_status_terminated_reason series with reason="Evicted"
# has the value 1 for 5m.
alert: Evicted Pod
expr: >-
  kube_pod_container_status_terminated_reason{reason="Evicted"} == 1
for: 5m
labels:
  severity: warning
  type: source
annotations:
  description: '发生节点被驱逐状况,instance:{{$labels.instance}},namespace:{{$labels.namespace}},pod:{{$labels.pod}}'
  summary: '发生节点被驱逐状况'
|
ok
|
|
4.17s ago
|
53.48us |
# Alert HighRateOfMemoryUsed-Hour: using 1h averages, computes
# 100 * (1 - (MemFree + Cached + Buffers) / MemTotal) and fires when the
# result stays above 90 for 5m.
alert: HighRateOfMemoryUsed-Hour
expr: >-
  100 * (1 - ((avg_over_time(node_memory_MemFree_bytes[1h])
  + avg_over_time(node_memory_Cached_bytes[1h])
  + avg_over_time(node_memory_Buffers_bytes[1h]))
  / avg_over_time(node_memory_MemTotal_bytes[1h])))
  > 90
for: 5m
labels:
  severity: warning
  type: source
annotations:
  description: '1小时中内存使用超过90%,instance:{{$labels.instance}},namespace:{{$labels.kubernetes_pod_name}},pod:{{$labels.name}}'
  summary: '1小时中内存使用超过90%'
|
ok
|
|
4.17s ago
|
677.7us |
# Alert HighRateOfMemoryUsed-Minute: using 1m averages, computes
# 100 * (1 - MemAvailable / MemTotal) and fires when the result stays above
# 90 for 1m.
alert: HighRateOfMemoryUsed-Minute
expr: >-
  100 * (1 - avg_over_time(node_memory_MemAvailable_bytes[1m])
  / avg_over_time(node_memory_MemTotal_bytes[1m]))
  > 90
for: 1m
labels:
  severity: warning
  type: source
annotations:
  description: '1分钟中内存使用超过90%,instance:{{$labels.instance}},namespace:{{$labels.kubernetes_pod_name}},pod:{{$labels.name}}'
  summary: '1分钟中内存使用超过90%'
|
ok
|
|
4.169s ago
|
237us |
# Alert HighRateOfCpuUsed: per instance, takes the 1h increase of idle CPU
# seconds over the 1h increase of all CPU seconds, converts the remainder to a
# percentage, and fires when it stays above 95 for 5m.
alert: HighRateOfCpuUsed
expr: >-
  100 * (1 - sum by(instance)
  (increase(node_cpu_seconds_total{mode="idle"}[1h]))
  / sum by(instance) (increase(node_cpu_seconds_total[1h])))
  > 95
for: 5m
labels:
  severity: warning
  type: source
annotations:
  description: '1小时中cpu使用超过95%,instance:{{$labels.instance}}'
  summary: '1小时中cpu使用超过95%'
|
ok
|
|
4.169s ago
|
3.35ms |
|
6.393s ago |
297.6us |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
# Records the current evaluation time minus node_boot_time_seconds; label_join
# copies the `instance` label into a new `computed` label.
record: node:runtime:seconds
expr: >-
  label_join(time() - node_boot_time_seconds, "computed", ",", "instance")
|
ok
|
|
643ms ago
|
244.5us |
# Records, per instance, how many distinct `cpu` label values exist on
# node_cpu_seconds_total{mode="system"}; `instance` is copied into `computed`.
record: node:cpu_count
expr: >-
  label_join(count by(instance) (count by(cpu, instance)
  (node_cpu_seconds_total{mode="system"})), "computed", ",", "instance")
|
ok
|
|
643ms ago
|
279.4us |
# Records 1 minus the per-instance average irate of idle CPU seconds (1m
# window); `instance` is copied into `computed`.
record: node:cpu_usage_ratio:1m
expr: >-
  label_join(1 - (avg by(instance)
  (irate(node_cpu_seconds_total{mode="idle"}[1m]))), "computed",
  ",", "instance")
|
ok
|
|
643ms ago
|
292.4us |
# Records (MemTotal - MemFree - Buffers - Cached) / MemTotal; `instance` is
# copied into `computed`.
record: node:memory_usage_ratio
expr: >-
  label_join(((node_memory_MemTotal_bytes - node_memory_MemFree_bytes
  - node_memory_Buffers_bytes - node_memory_Cached_bytes)
  / (node_memory_MemTotal_bytes)), "computed", ",", "instance")
|
ok
|
|
643ms ago
|
359.1us |
# Records MemTotal - (Cached + Buffers + MemFree); `instance` is copied into
# `computed`.
record: node:memory_usage_size
expr: >-
  label_join(node_memory_MemTotal_bytes - (node_memory_Cached_bytes
  + node_memory_Buffers_bytes + node_memory_MemFree_bytes),
  "computed", ",", "instance")
|
ok
|
|
642ms ago
|
257.8us |
# Records the per-instance average irate of CPU seconds in mode="system"
# (1m window); `instance` is copied into `computed`.
record: node:cpu_usage_ratio_system:1m
expr: >-
  label_join(avg by(instance)
  (irate(node_cpu_seconds_total{mode="system"}[1m])),
  "computed", ",", "instance")
|
ok
|
|
642ms ago
|
290us |
# Records the per-instance average irate of CPU seconds in mode="user"
# (1m window); `instance` is copied into `computed`.
record: node:cpu_usage_ratio_user:1m
expr: >-
  label_join(avg by(instance)
  (irate(node_cpu_seconds_total{mode="user"}[1m])),
  "computed", ",", "instance")
|
ok
|
|
642ms ago
|
299.5us |
# Records the per-instance average irate of CPU seconds in mode="iowait"
# (1m window); `instance` is copied into `computed`.
record: node:cpu_usage_ratio_iowait:1m
expr: >-
  label_join(avg by(instance)
  (irate(node_cpu_seconds_total{mode="iowait"}[1m])),
  "computed", ",", "instance")
|
ok
|
|
642ms ago
|
228us |
# Records irate(node_network_receive_bytes_total[5m]) multiplied by 8
# (presumably bytes -> bits; confirm); `instance` is copied into `computed`.
record: node:network_receive:5m
expr: >-
  label_join(irate(node_network_receive_bytes_total[5m]) * 8,
  "computed", ",", "instance")
|
ok
|
|
641ms ago
|
1.887ms |
# Records irate(node_network_transmit_bytes_total[5m]) multiplied by 8
# (presumably bytes -> bits; confirm); `instance` is copied into `computed`.
record: node:network_transmit:5m
expr: >-
  label_join(irate(node_network_transmit_bytes_total[5m]) * 8,
  "computed", ",", "instance")
|
ok
|
|
640ms ago
|
1.818ms |
# Records irate(node_disk_reads_completed_total[1m]); `instance` is copied
# into `computed`.
record: node:disk_read_iops:1m
expr: >-
  label_join(irate(node_disk_reads_completed_total[1m]), "computed", ",",
  "instance")
|
ok
|
|
638ms ago
|
75.35us |
# Records irate(node_disk_writes_completed_total[1m]); `instance` is copied
# into `computed`.
record: node:disk_write_iops:1m
expr: >-
  label_join(irate(node_disk_writes_completed_total[1m]), "computed", ",",
  "instance")
|
ok
|
|
638ms ago
|
51.11us |
# Records irate(node_disk_read_bytes_total[1m]); `instance` is copied into
# `computed`.
record: node:disk_read:1m
expr: >-
  label_join(irate(node_disk_read_bytes_total[1m]), "computed", ",",
  "instance")
|
ok
|
|
638ms ago
|
42.37us |
# Records irate(node_disk_written_bytes_total[1m]); `instance` is copied into
# `computed`.
record: node:disk_write:1m
expr: >-
  label_join(irate(node_disk_written_bytes_total[1m]), "computed", ",",
  "instance")
|
ok
|
|
638ms ago
|
44.4us |
# Records irate(node_disk_read_time_seconds_total[1m]); `instance` is copied
# into `computed`.
record: node:read_disk_io_time:1m
expr: >-
  label_join(irate(node_disk_read_time_seconds_total[1m]), "computed", ",",
  "instance")
|
ok
|
|
638ms ago
|
40.1us |
# Records irate(node_disk_write_time_seconds_total[1m]); `instance` is copied
# into `computed`.
record: node:write_disk_io_time:1m
expr: >-
  label_join(irate(node_disk_write_time_seconds_total[1m]), "computed", ",",
  "instance")
|
ok
|
|
638ms ago
|
43.43us |
# Records irate(node_disk_io_time_seconds_total[1m]); `instance` is copied
# into `computed`.
record: node:disk_io_time:1m
expr: >-
  label_join(irate(node_disk_io_time_seconds_total[1m]), "computed", ",",
  "instance")
|
ok
|
|
638ms ago
|
39us |
# Records irate(node_netstat_Tcp_ActiveOpens[1m]); `instance` is copied into
# `computed`.
record: node:tcp_activeopens:1m
expr: >-
  label_join(irate(node_netstat_Tcp_ActiveOpens[1m]), "computed", ",",
  "instance")
|
ok
|
|
638ms ago
|
134.2us |
# Records irate(node_netstat_Tcp_PassiveOpens[1m]); `instance` is copied into
# `computed`.
record: node:tcp_passiveopens:1m
expr: >-
  label_join(irate(node_netstat_Tcp_PassiveOpens[1m]), "computed", ",",
  "instance")
|
ok
|
|
638ms ago
|
124.6us |
# Records size minus avail bytes for ext4/xfs filesystems; `instance` is
# copied into `computed`.
record: node:filesystem_usage_size
expr: >-
  label_join(node_filesystem_size_bytes{fstype=~"ext4|xfs"}
  - node_filesystem_avail_bytes{fstype=~"ext4|xfs"},
  "computed", ",", "instance")
|
ok
|
|
638ms ago
|
807.4us |
# Records 1 - free/size for ext4/xfs filesystems; `instance` is copied into
# `computed`. Note this uses node_filesystem_free_bytes, whereas
# node:filesystem_usage_size subtracts node_filesystem_avail_bytes.
record: node:filesystem_usage_ratio
expr: >-
  label_join(1 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"}
  / node_filesystem_size_bytes{fstype=~"ext4|xfs"}),
  "computed", ",", "instance")
|
ok
|
|
637ms ago
|
1.022ms |
# Records, per instance, the sum of container_accelerator_memory_used_bytes
# divided by the number of container_accelerator_duty_cycle series, both
# excluding the deepomatic-shared-gpu-nvidia-device-plugin-ctr container;
# `instance` is copied into `computed`.
record: node:gpu_used
expr: >-
  label_join(sum by(instance)
  (container_accelerator_memory_used_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"})
  / count by(instance)
  (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}),
  "computed", ",", "instance")
|
ok
|
|
636ms ago
|
113.4us |
# Records, per instance, the sum of container_accelerator_memory_total_bytes
# divided by the number of container_accelerator_duty_cycle series, both
# excluding the deepomatic-shared-gpu-nvidia-device-plugin-ctr container;
# `instance` is copied into `computed`.
record: node:gpu_allocate
expr: >-
  label_join(sum by(instance)
  (container_accelerator_memory_total_bytes{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"})
  / count by(instance)
  (container_accelerator_duty_cycle{container_name!="deepomatic-shared-gpu-nvidia-device-plugin-ctr"}),
  "computed", ",", "instance")
|
ok
|
|
636ms ago
|
74.98us |
# Records irate(node_disk_reads_completed_total[1m]); `instance` is copied
# into `computed`.
# NOTE(review): a rule with this same name and expression already appears
# earlier in this listing; one of the two looks redundant — confirm and drop.
record: node:disk_read_iops:1m
expr: >-
  label_join(irate(node_disk_reads_completed_total[1m]), "computed", ",",
  "instance")
|
ok
|
|
636ms ago
|
56.07us |
# Records 1 minus the per-instance average irate of idle CPU seconds (1m
# window); `instance` is copied into `computed`.
# NOTE(review): a rule with this same name and an equivalent expression
# already appears earlier in this listing; one looks redundant — confirm.
record: node:cpu_usage_ratio:1m
expr: >-
  label_join(1 - avg by(instance)
  (irate(node_cpu_seconds_total{mode="idle"}[1m])), "computed",
  ",", "instance")
|
ok
|
|
636ms ago
|
279us |
# Records node_load1 unchanged, with `instance` copied into `computed`.
record: node:load1
expr: >-
  label_join(node_load1, "computed", ",", "instance")
|
ok
|
|
636ms ago
|
140.1us |
# Records node_load5 unchanged, with `instance` copied into `computed`.
record: node:load5
expr: >-
  label_join(node_load5, "computed", ",", "instance")
|
ok
|
|
636ms ago
|
127.6us |
# Records node_load15 unchanged, with `instance` copied into `computed`.
record: node:load15
expr: >-
  label_join(node_load15, "computed", ",", "instance")
|
ok
|
|
636ms ago
|
123.9us |
# Records node_memory_MemTotal_bytes unchanged, with `instance` copied into
# `computed`.
record: node:mem_total
expr: >-
  label_join(node_memory_MemTotal_bytes, "computed", ",", "instance")
|
ok
|
|
635ms ago
|
97.7us |
# Records node_memory_MemFree_bytes unchanged, with `instance` copied into
# `computed`.
record: node:mem_free
expr: >-
  label_join(node_memory_MemFree_bytes, "computed", ",", "instance")
|
ok
|
|
635ms ago
|
115.8us |
# Records node_memory_MemAvailable_bytes unchanged, with `instance` copied
# into `computed`.
record: node:mem_available
expr: >-
  label_join(node_memory_MemAvailable_bytes, "computed", ",", "instance")
|
ok
|
|
635ms ago
|
122.8us |
# Records node_memory_Cached_bytes unchanged, with `instance` copied into
# `computed`.
record: node:mem_cache
expr: >-
  label_join(node_memory_Cached_bytes, "computed", ",", "instance")
|
ok
|
|
635ms ago
|
164.9us |
# Records node_memory_Buffers_bytes unchanged, with `instance` copied into
# `computed`.
record: node:mem_buffers
expr: >-
  label_join(node_memory_Buffers_bytes, "computed", ",", "instance")
|
ok
|
|
635ms ago
|
97.04us |
# Records container_accelerator_memory_used_bytes for the node-gpu-exporter
# container, with `instance` copied into `computed`.
# Fix: the matcher previously used the misspelled label `containenr_name`,
# which only matches series that carry that (non-existent) label, so the rule
# recorded nothing. The other GPU rules in this listing filter on
# `container_name`.
# NOTE(review): another rule earlier in this listing also records
# node:gpu_used with a per-instance aggregate; confirm both are meant to share
# one metric name.
record: node:gpu_used
expr: >-
  label_join(container_accelerator_memory_used_bytes{container_name="node-gpu-exporter"},
  "computed", ",", "instance")
|
ok
|
|
635ms ago
|
47.25us |
|
12.055s ago |
8.627ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
# Records the current evaluation time minus kube_pod_created; label_join
# copies the `pod` label into a new `computed` label.
record: pod:run_time:seconds
expr: >-
  label_join(time() - kube_pod_created, "computed", ",", "pod")
|
ok
|
|
12.055s ago
|
177.8us |
# Records container_fs_usage_bytes summed per (pod_name, namespace), skipping
# container_name="POD" and series with an empty image; `pod_name` is copied
# into `computed`.
record: pod:filesystem_usage:bytes
expr: >-
  label_join(sum by(pod_name, namespace)
  (container_fs_usage_bytes{container_name!="POD",image!=""}),
  "computed", ",", "pod_name")
|
ok
|
|
12.055s ago
|
2.387ms |
# Records kube_pod_container_resource_requests{resource="cpu"} summed per
# (pod, namespace); `pod` is copied into `computed`.
# NOTE(review): the record name is spelled "reuqest" (compare
# pod:memory_request). It is kept as-is because renaming would break any
# dashboard or query that uses the current name — fix together with consumers.
record: pod:cpu_reuqest
expr: >-
  label_join(sum by(pod, namespace)
  (kube_pod_container_resource_requests{resource="cpu"}),
  "computed", ",", "pod")
|
ok
|
|
12.053s ago
|
71.73us |
# Records kube_pod_container_resource_requests{resource="memory"} summed per
# (pod, namespace); `pod` is copied into `computed`.
record: pod:memory_request
expr: >-
  label_join(sum by(pod, namespace)
  (kube_pod_container_resource_requests{resource="memory"}),
  "computed", ",", "pod")
|
ok
|
|
12.053s ago
|
44.81us |
# Records the 1m rate of container_cpu_usage_seconds_total summed per
# (namespace, pod_name), skipping container_name="POD" and series with an
# empty image; `pod_name` is copied into `computed`.
record: pod:cpu_usage
expr: >-
  label_join(sum by(namespace, pod_name)
  (rate(container_cpu_usage_seconds_total{container_name!="POD",image!=""}[1m])),
  "computed", ",", "pod_name")
|
ok
|
|
12.053s ago
|
3.408ms |
# Records container_memory_rss summed per pod_name, skipping
# container_name="POD" and series with an empty image; `pod_name` is copied
# into `computed`.
# NOTE(review): despite the name, nothing is divided by requests here, and
# unlike the sibling pod rules the sum is not grouped by namespace — confirm
# both are intended.
record: pod:memory_usage_of_requests
expr: >-
  label_join(sum by(pod_name)
  (container_memory_rss{container_name!="POD",image!=""}),
  "computed", ",", "pod_name")
|
ok
|
|
12.05s ago
|
2.273ms |
# Records nvidia_gpu_process_smutil summed across the `container` label;
# `pod_name` is copied into `computed`.
record: pod:gpu_smutil
expr: >-
  label_join(sum without(container) (nvidia_gpu_process_smutil),
  "computed", ",", "pod_name")
|
ok
|
|
12.047s ago
|
68.21us |
# Records nvidia_gpu_process_memutil summed across the `container` label;
# `pod_name` is copied into `computed`.
record: pod:gpu_memutil
expr: >-
  label_join(sum without(container) (nvidia_gpu_process_memutil),
  "computed", ",", "pod_name")
|
ok
|
|
12.047s ago
|
45us |
# Records nvidia_gpu_process_decutil summed across the `container` label;
# `pod_name` is copied into `computed`.
record: pod:gpu_decutil
expr: >-
  label_join(sum without(container) (nvidia_gpu_process_decutil),
  "computed", ",", "pod_name")
|
ok
|
|
12.047s ago
|
43.33us |
# Records nvidia_gpu_process_encutil summed across the `container` label;
# `pod_name` is copied into `computed`.
record: pod:gpu_encutil
expr: >-
  label_join(sum without(container) (nvidia_gpu_process_encutil),
  "computed", ",", "pod_name")
|
ok
|
|
12.047s ago
|
49.93us |
# Records nvidia_gpu_process_graph summed across the `container` label;
# `pod_name` is copied into `computed`.
record: pod:gpu_graph
expr: >-
  label_join(sum without(container) (nvidia_gpu_process_graph),
  "computed", ",", "pod_name")
|
ok
|
|
12.047s ago
|
40.08us |