prometheus query

# PromQL(Prometheus Query Language) 사용 예시

# https://prometheus.io/docs/prometheus/latest/querying/operators

# https://prometheus.io/docs/prometheus/latest/querying/functions

# 버전 조회

prometheus_build_info

# my_requests 메트릭 모두 조회

my_requests

# my_requests 중 abc label 값이 lemon 인 경우 조회

my_requests{abc="lemon"}

# my_requests 중 abc label 값이 lemon 이 아닌 경우 조회

my_requests{abc!="lemon"}

# my_requests 중 abc label 값이 lemon으로 시작하는 경우 조회

# =~ 로 regex match 한다.

my_requests{abc=~"^lemon.*"}

# 특정 시간에서의 조회

my_requests{abc=~"^lemon.*"} @유닉스타임

# 30분 전의 값 조회

my_requests{abc=~"^lemon.*"} offset 30m

# [1m] : 최근 1분 동안 수집된 n 개의 값들(range vector)

my_requests{abc=~"^lemon.*"}[1m]

# 처음과 끝의 차이로 초당 평균 변화율(rate)을 계산한다.

# 처음과 끝 값 외 중간의 값들은 사용되지 않기 때문에

# range vector 를 너무 크게하면 정확한 값이 도출되지 않는다.

# abc=lemon 요청이 1분동안 60번의 요청이 있었다면 rate 로 1tps가 된다.

rate(my_requests{abc="lemon"}[1m])

# tps 같은 변화율이 아닌 실제 카운트 값을 취할때는 increase 를 사용하면 된다.

# abc=lemon 요청이 1분동안 60번의 요청이 있었다면 increase 는 60(개)가 된다.

increase(my_requests{abc="lemon"}[1m])

# rate 로 나오는 n 개의 값을 더한다.

sum(rate(my_requests{abc="lemon"}[1m]))

# rate 로 나오는 n 개의 값 평균을 계산한다.

avg(rate(my_requests{abc="lemon"}[1m]))

# rate 로 나오는 n 개의 값중 최소값

min(rate(my_requests{abc="lemon"}[1m]))

# rate 로 나오는 n 개의 값중 최대값

max(rate(my_requests{abc="lemon"}[1m]))

# [1d:1h] : 최근 1일 동안 1시간 간격의 값들(subquery)

# 하루 중 1분단위로 rate 합산 결과들 중 최대 값

max_over_time(sum(rate(my_requests{abc="lemon"}[1m]))[1d:1m])

# 하루 중 1분단위로 rate 합산 결과들 중 평균 값

avg_over_time(sum(rate(my_requests{abc="lemon"}[1m]))[1d:1m])

# 하루 중 1분단위로 rate 합산 결과들 중 중간 값

quantile_over_time(0.5, sum(rate(my_requests{abc="lemon"}[1m]))[1d:1m])

# container_cpu_usage_seconds_total(cpu점유시간) 등의 메트릭은

# 시간이 지날수록 계속 증가되는 값들로 rate()로

# 시작과 끝 값을 통해 변화율을 계산하면 cpu 사용률을 파악할 수 있다.

# 1m 간 수집된 n 개의 range vector 값들로 cpu 증감률(사용률)을 알아낸다.

rate(container_cpu_usage_seconds_total{namespace=~"ysoftman.*"}[1m])

# pod 이 같은 것들을 하나로 더해서 취합

sum by (pod) (rate(container_cpu_usage_seconds_total{namespace=~"ysoftman.*"}[1m]))

# pod 이 같은 것들 중 가장 큰 값만 취합

max by (pod) (rate(container_cpu_usage_seconds_total{namespace=~"ysoftman.*"}[1m]))

# pod,container 별 CPU 사용량 백분율

sum(rate(container_cpu_usage_seconds_total{name!~".*prometheus.*", image!="", container!="POD"}[5m])) by (pod, container) /

sum(container_spec_cpu_quota{name!~".*prometheus.*", image!="", container!="POD"} / container_spec_cpu_period{name!~".*prometheus.*", image!="", container!="POD"}) by (pod, container) * 100

# worker 노드의 pod 최대 개수 파악

# on(양쪽 메트릭에 존재하는 레이블로 결과들을 구분할 수 있는 레이블)

# group_left (왼쪽 메트릭 레이블 기준으로 병합, 병합 결과에 포함할 레이블을 명시할 수 있다.)

# sum (결과) by (node) 결과 중 node 레이블이 같은 것끼리 합치기

sum(kube_node_status_allocatable{resource="pods", unit="integer"} * on(node) group_left() kube_node_role{role="worker"}) by (node)

# worker 노드의 running pod 개수 파악

sum((kube_pod_info * on(pod, namespace) group_right(node) kube_pod_status_phase{phase="Running"}) * on (node) group_left() kube_node_role{role="worker"}) by (node)

#####

# prometheus > alerts 메뉴에서 등록된 rule 을 확인할 수 있다.

# 알람은 다음 3가지 상태가 있다.

# https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#alerting-rules

- inactive: 알람 (rule)조건이 해당하지 않는 경우(정상)

- pending: 조건에 맞아서 발송 대기중, rule 명세의 for 필드에 설정된 기간동안 검사하며 이 기간 동안 조건이 해제되면 inactive, 계속 조건이 맞으면 firing 상태로 변경된다.

- firing: firing 상태가 되면 Alertmanager 에게 알람 내용을 전송한다.

# 자주 사용되는 prometheus alert 을 모아둔 곳(웬만한 건 다 있음~ㅎ)

https://github.com/samber/awesome-prometheus-alerts

prometheus query

comments:

댓글 쓰기