使用 VictoriaMetrics 监控 K8s 集群
过去几年,Kubernetes 已经成为容器编排的标准,越来越多的公司开始在生产系统使用 Kubernetes。通常我们使用 Prometheus 对 K8s 集群进行监控,但由于 Prometheus 自身存在单点问题,不得不寻求联邦方案或分布式高可用方案,社区热度比较高的项目有 Thanos、Cortex、VictoriaMetrics。本文介绍使用 VictoriaMetrics 作为数据存储后端对 K8s 集群进行监控,k8s 部署过程不再具体描述。
环境版本
实验使用单节点 k8s,网络组件使用 cilium,VictoriaMetrics 存储使用 localpv。
[root@cilium-1 victoria-metrics-cluster]# cat /etc/redhat-release
CentOS Linux release 7.9.2009 (Core)
[root@cilium-bgp-1 victoria-metrics-cluster]# uname -r
4.19.110-300.el7.x86_64
[root@cilium-1 pvs]# kubectl get node
NAME STATUS ROLES AGE VERSION
cilium-1.novalocal Ready master 28m v1.19.4
主要监控目标
master,node 节点负载状态 k8s 组件状态 etcd 状态 k8s 集群资源状态 (deploy,sts,pod...) 用户自定义组件 (主要通过 pod 定义 prometheus.io/scrape 自动上报 target) ...
监控需要部署的组件
VictoriaMetrics(storage,insert,select,agent,vmalert) promxy kube-state-metrics node-exporter karma alertmanager grafana ...
部署 VictoriaMetrics
创建 localpv 为 storage 组件提供 StorageClass 也可以使用其他网络存储
---
# StorageClass for the local PVs used by vmstorage.
# no-provisioner + WaitForFirstConsumer is the standard setup for local volumes:
# binding is delayed until a consuming pod is scheduled, so the PV's node
# affinity can be honored.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: vm-disks
provisioner: kubernetes.io/no-provisioner
reclaimPolicy: Retain
volumeBindingMode: WaitForFirstConsumer
---
# One local PV per vmstorage replica (vm-1 shown; vm-2/vm-3 are analogous).
apiVersion: v1
kind: PersistentVolume
metadata:
  name: vm-1
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Delete
  storageClassName: vm-disks
  local:
    path: /mnt/vm
  # local PVs require nodeAffinity pinning the volume to the node that
  # actually hosts the backing directory
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - cilium-1.novalocal
---
...
[root@cilium-1 pvs]# kubectl get sc
NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
vm-disks kubernetes.io/no-provisioner Retain WaitForFirstConsumer false 6m5s
[root@cilium-1 pvs]# kubectl get pv
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE
vm-1 10Gi RWO Delete Available vm-disks 92s
vm-2 10Gi RWO Delete Available vm-disks 92s
vm-3 10Gi RWO Delete Available vm-disks 92s
使用 helm 进行安装
添加 helm repo 拉取 chart 包并解压
$ helm repo add vm https://victoriametrics.github.io/helm-charts/
$ helm repo update
$ helm fetch vm/victoria-metrics-cluster
$ tar -xf victoria-metrics-cluster-0.8.25.tgz
Chart.yaml README.md README.md.gotmpl templates values.yaml
根据自己的需求修改 values.yaml
这里我主要修改 vmstorage 组件配置 storageclass
# values.yaml
# Default values for victoria-metrics.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# -- k8s cluster domain suffix, used for building storage pods' FQDN. Ref: [https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/](https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/ "https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/")
clusterDomainSuffix: cluster.local
printNotes: true
rbac:
create: true
pspEnabled: true
namespaced: false
extraLabels: {}
# annotations: {}
serviceAccount:
create: true
# name:
extraLabels: {}
# annotations: {}
# mount API token to pod directly
automountToken: true
extraSecrets:
[]
# - name: secret-remote-storage-keys
# annotations: []
# labels: []
# data: |
# credentials: b64_encoded_str
vmselect:
# -- 为vmselect组件创建deployment. 如果有缓存数据的需要,也可以创建为
enabled: true
# -- Vmselect container name
name: vmselect
image:
# -- Image repository
repository: victoriametrics/vmselect
# -- Image tag
tag: v1.59.0-cluster
# -- Image pull policy
pullPolicy: IfNotPresent
# -- Name of Priority Class
priorityClassName: ""
# -- Overrides the full name of vmselect component
fullnameOverride: ""
# -- Suppress rendering `--storageNode` FQDNs based on `vmstorage.replicaCount` value. If true suppress rendering `--stroageNodes`, they can be re-defined in exrtaArgs
suppresStorageFQDNsRender: false
automountServiceAccountToken: true
# Extra command line arguments for vmselect component
extraArgs:
envflag.enable: "true"
envflag.prefix: VM_
loggerFormat: json
annotations: {}
extraLabels: {}
env: []
# Readiness & Liveness probes
probe:
readiness:
initialDelaySeconds: 5
periodSeconds: 15
timeoutSeconds: 5
failureThreshold: 3
liveness:
initialDelaySeconds: 5
periodSeconds: 15
timeoutSeconds: 5
failureThreshold: 3
# Additional hostPath mounts
extraHostPathMounts:
[]
# - name: certs-dir
# mountPath: /etc/kubernetes/certs
# subPath: ""
# hostPath: /etc/kubernetes/certs
# readOnly: true
# Extra Volumes for the pod
extraVolumes:
[]
# - name: example
# configMap:
# name: example
# Extra Volume Mounts for the container
extraVolumeMounts:
[]
# - name: example
# mountPath: /example
extraContainers:
[]
# - name: config-reloader
# image: reloader-image
initContainers:
[]
# - name: example
# image: example-image
podDisruptionBudget:
# -- See `kubectl explain poddisruptionbudget.spec` for more. Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
enabled: false
# minAvailable: 1
# maxUnavailable: 1
labels: {}
# -- Array of tolerations object. Ref: [https://kubernetes.io/docs/concepts/configuration/assign-pod-node/](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ "https://kubernetes.io/docs/concepts/configuration/assign-pod-node/")
tolerations: []
# - key: "key"
# operator: "Equal|Exists"
# value: "value"
# effect: "NoSchedule|PreferNoSchedule"
# -- Pod's node selector. Ref: [https://kubernetes.io/docs/user-guide/node-selection/](https://kubernetes.io/docs/user-guide/node-selection/ "https://kubernetes.io/docs/user-guide/node-selection/")
nodeSelector: {}
# -- Pod affinity
affinity: {}
# -- Pod's annotations
podAnnotations: {}
# -- Count of vmselect pods
replicaCount: 2
# -- Resource object
resources: {}
# limits:
# cpu: 50m
# memory: 64Mi
# requests:
# cpu: 50m
# memory: 64Mi
# -- Pod's security context. Ref: [https://kubernetes.io/docs/tasks/configure-pod-container/security-context/](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/
securityContext: {}
podSecurityContext: {}
# -- Cache root folder
cacheMountPath: /cache
service:
# -- Service annotations
annotations: {}
# -- Service labels
labels: {}
# -- Service ClusterIP
clusterIP: ""
# -- Service External IPs. Ref: [https://kubernetes.io/docs/user-guide/services/#external-ips](https://kubernetes.io/docs/user-guide/services/#external-ips "https://kubernetes.io/docs/user-guide/services/#external-ips")
externalIPs: []
# -- Service load balacner IP
loadBalancerIP: ""
# -- Load balancer source range
loadBalancerSourceRanges: []
# -- Service port
servicePort: 8481
# -- Service type
type: ClusterIP
ingress:
# -- Enable deployment of ingress for vmselect component
enabled: false
# -- Ingress annotations
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: 'true'
extraLabels: {}
# -- Array of host objects
hosts: []
# - name: vmselect.local
# path: /select
# port: http
# -- Array of TLS objects
tls: []
# - secretName: vmselect-ingress-tls
# hosts:
# - vmselect.local
statefulSet:
# -- Deploy StatefulSet instead of Deployment for vmselect. Useful if you want to keep cache data. Creates statefulset instead of deployment, useful when you want to keep the cache
enabled: false
# -- Deploy order policy for StatefulSet pods
podManagementPolicy: OrderedReady
## Headless service for statefulset
service:
# -- Headless service annotations
annotations: {}
# -- Headless service labels
labels: {}
# -- Headless service port
servicePort: 8481
persistentVolume:
# -- Create/use Persistent Volume Claim for vmselect component. Empty dir if false. If true, vmselect will create/use a Persistent Volume Claim
enabled: false
# -- Array of access mode. Must match those of existing PV or dynamic provisioner. Ref: [http://kubernetes.io/docs/user-guide/persistent-volumes/](http://kubernetes.io/docs/user-guide/persistent-volumes/ "http://kubernetes.io/docs/user-guide/persistent-volumes/")
accessModes:
- ReadWriteOnce
# -- Persistent volume annotations
annotations: {}
# -- Existing Claim name. Requires vmselect.persistentVolume.enabled: true. If defined, PVC must be created manually before volume will be bound
existingClaim: ""
## Vmselect data Persistent Volume mount root path
##
# -- Size of the volume. Better to set the same as resource limit memory property
size: 2Gi
# -- Mount subpath
subPath: ""
serviceMonitor:
# -- Enable deployment of Service Monitor for vmselect component. This is Prometheus operator object
enabled: false
# -- Target namespace of ServiceMonitor manifest
namespace: ""
# -- Service Monitor labels
extraLabels: {}
# -- Service Monitor annotations
annotations: {}
# Commented. Prometheus scrape interval for vmselect component
# interval: 15s
# Commented. Prometheus pre-scrape timeout for vmselect component
# scrapeTimeout: 5s
vminsert:
# -- Enable deployment of vminsert component. Deployment is used
enabled: true
# -- vminsert container name
name: vminsert
image:
# -- Image repository
repository: victoriametrics/vminsert
# -- Image tag
tag: v1.59.0-cluster
# -- Image pull policy
pullPolicy: IfNotPresent
# -- Name of Priority Class
priorityClassName: ""
# -- Overrides the full name of vminsert component
fullnameOverride: ""
# Extra command line arguments for vminsert component
extraArgs:
envflag.enable: "true"
envflag.prefix: VM_
loggerFormat: json
annotations: {}
extraLabels: {}
env: []
# -- Suppress rendering `--storageNode` FQDNs based on `vmstorage.replicaCount` value. If true suppress rendering `--stroageNodes`, they can be re-defined in exrtaArgs
suppresStorageFQDNsRender: false
automountServiceAccountToken: true
# Readiness & Liveness probes
probe:
readiness:
initialDelaySeconds: 5
periodSeconds: 15
timeoutSeconds: 5
failureThreshold: 3
liveness:
initialDelaySeconds: 5
periodSeconds: 15
timeoutSeconds: 5
failureThreshold: 3
initContainers:
[]
# - name: example
# image: example-image
podDisruptionBudget:
# -- See `kubectl explain poddisruptionbudget.spec` for more. Ref: [https://kubernetes.io/docs/tasks/run-application/configure-pdb/](https://kubernetes.io/docs/tasks/run-application/configure-pdb/ "https://kubernetes.io/docs/tasks/run-application/configure-pdb/")
enabled: false
# minAvailable: 1
# maxUnavailable: 1
labels: {}
# -- Array of tolerations object. Ref: [https://kubernetes.io/docs/concepts/configuration/assign-pod-node/](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ "https://kubernetes.io/docs/concepts/configuration/assign-pod-node/")
tolerations: []
# - key: "key"
# operator: "Equal|Exists"
# value: "value"
# effect: "NoSchedule|PreferNoSchedule"
# -- Pod's node selector. Ref: [https://kubernetes.io/docs/user-guide/node-selection/](https://kubernetes.io/docs/user-guide/node-selection/ "https://kubernetes.io/docs/user-guide/node-selection/")
nodeSelector: {}
# -- Pod affinity
affinity: {}
# -- Pod's annotations
podAnnotations: {}
# -- Count of vminsert pods
replicaCount: 2
# -- Resource object
resources: {}
# limits:
# cpu: 50m
# memory: 64Mi
# requests:
# cpu: 50m
# memory: 64Mi
# -- Pod's security context. Ref: [https://kubernetes.io/docs/tasks/configure-pod-container/security-context/](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ "https://kubernetes.io/docs/tasks/configure-pod-container/security-context/")
securityContext: {}
podSecurityContext: {}
service:
# -- Service annotations
annotations: {}
# -- Service labels
labels: {}
# -- Service ClusterIP
clusterIP: ""
# -- Service External IPs. Ref: [https://kubernetes.io/docs/user-guide/services/#external-ips](https://kubernetes.io/docs/user-guide/services/#external-ips "https://kubernetes.io/docs/user-guide/services/#external-ips")
externalIPs: []
# -- Service load balancer IP
loadBalancerIP: ""
# -- Load balancer source range
loadBalancerSourceRanges: []
# -- Service port
servicePort: 8480
# -- Service type
type: ClusterIP
ingress:
# -- Enable deployment of ingress for vminsert component
enabled: false
# -- Ingress annotations
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: 'true'
extraLabels: {}
# -- Array of host objects
hosts: []
# - name: vminsert.local
# path: /insert
# port: http
# -- Array of TLS objects
tls: []
# - secretName: vminsert-ingress-tls
# hosts:
# - vminsert.local
serviceMonitor:
# -- Enable deployment of Service Monitor for vminsert component. This is Prometheus operator object
enabled: false
# -- Target namespace of ServiceMonitor manifest
namespace: ""
# -- Service Monitor labels
extraLabels: {}
# -- Service Monitor annotations
annotations: {}
# Commented. Prometheus scrape interval for vminsert component
# interval: 15s
# Commented. Prometheus pre-scrape timeout for vminsert component
# scrapeTimeout: 5s
vmstorage:
# -- Enable deployment of vmstorage component. StatefulSet is used
enabled: true
# -- vmstorage container name
name: vmstorage
image:
# -- Image repository
repository: victoriametrics/vmstorage
# -- Image tag
tag: v1.59.0-cluster
# -- Image pull policy
pullPolicy: IfNotPresent
# -- Name of Priority Class
priorityClassName: ""
# -- Overrides the full name of vmstorage component
fullnameOverride:
automountServiceAccountToken: true
env: []
# -- Data retention period. Supported values 1w, 1d, number without measurement means month, e.g. 2 = 2month
retentionPeriod: 1
# Additional vmstorage container arguments. Extra command line arguments for vmstorage component
extraArgs:
envflag.enable: "true"
envflag.prefix: VM_
loggerFormat: json
# Additional hostPath mounts
extraHostPathMounts:
[]
# - name: certs-dir
# mountPath: /etc/kubernetes/certs
# subPath: ""
# hostPath: /etc/kubernetes/certs
# readOnly: true
# Extra Volumes for the pod
extraVolumes:
[]
# - name: example
# configMap:
# name: example
# Extra Volume Mounts for the container
extraVolumeMounts:
[]
# - name: example
# mountPath: /example
extraContainers:
[]
# - name: config-reloader
# image: reloader-image
initContainers:
[]
# - name: vmrestore
# image: victoriametrics/vmrestore:latest
# volumeMounts:
# - mountPath: /storage
# name: vmstorage-volume
# - mountPath: /etc/vm/creds
# name: secret-remote-storage-keys
# readOnly: true
# args:
# - -storageDataPath=/storage
# - -src=s3://your_bucket/folder/latest
# - -credsFilePath=/etc/vm/creds/credentials
# -- See `kubectl explain poddisruptionbudget.spec` for more. Ref: [https://kubernetes.io/docs/tasks/run-application/configure-pdb/](https://kubernetes.io/docs/tasks/run-application/configure-pdb/ "https://kubernetes.io/docs/tasks/run-application/configure-pdb/")
podDisruptionBudget:
enabled: false
# minAvailable: 1
# maxUnavailable: 1
labels: {}
# -- Array of tolerations object. Node tolerations for server scheduling to nodes with taints. Ref: [https://kubernetes.io/docs/concepts/configuration/assign-pod-node/](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ "https://kubernetes.io/docs/concepts/configuration/assign-pod-node/")
##
tolerations:
[]
# - key: "key"
# operator: "Equal|Exists"
# value: "value"
# effect: "NoSchedule|PreferNoSchedule"
# -- Pod's node selector. Ref: [https://kubernetes.io/docs/user-guide/node-selection/](https://kubernetes.io/docs/user-guide/node-selection/ "https://kubernetes.io/docs/user-guide/node-selection/")
nodeSelector: {}
# -- Pod affinity
affinity: {}
#
#
#
# schedulerName:
persistentVolume:
# -- Create/use Persistent Volume Claim for vmstorage component. Empty dir if false. If true, vmstorage will create/use a Persistent Volume Claim
enabled: true
# -- Array of access modes. Must match those of existing PV or dynamic provisioner. Ref: [http://kubernetes.io/docs/user-guide/persistent-volumes/](http://kubernetes.io/docs/user-guide/persistent-volumes/ "http://kubernetes.io/docs/user-guide/persistent-volumes/")
accessModes:
- ReadWriteOnce
# -- Persistent volume annotations
annotations: {}
# -- Storage class name. Will be empty if not setted
storageClass: "vm-disks"  # 为 vm-storage 指定 storageclass
# -- Existing Claim name. Requires vmstorage.persistentVolume.enabled: true. If defined, PVC must be created manually before volume will be bound
existingClaim: ""
# -- Data root path. Vmstorage data Persistent Volume mount root path
mountPath: /storage
# -- Size of the volume. Better to set the same as resource limit memory property
size: 8Gi
# -- Mount subpath
subPath: ""
# -- Pod's annotations
podAnnotations: {}
annotations: {}
extraLabels: {}
# -- Count of vmstorage pods
replicaCount: 3
# -- Deploy order policy for StatefulSet pods
podManagementPolicy: OrderedReady
# -- Resource object. Ref: [http://kubernetes.io/docs/user-guide/compute-resources/](http://kubernetes.io/docs/user-guide/compute-resources/ "http://kubernetes.io/docs/user-guide/compute-resources/")
resources: {}
# limits:
# cpu: 500m
# memory: 512Mi
# requests:
# cpu: 500m
# memory: 512Mi
# -- Pod's security context. Ref: [https://kubernetes.io/docs/tasks/configure-pod-container/security-context/](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ "https://kubernetes.io/docs/tasks/configure-pod-container/security-context/")
securityContext: {}
podSecurityContext: {}
service:
# -- Service annotations
annotations: {}
# -- Service labels
labels: {}
# -- Service port
servicePort: 8482
# -- Port for accepting connections from vminsert
vminsertPort: 8400
# -- Port for accepting connections from vmselect
vmselectPort: 8401
# -- Pod's termination grace period in seconds
terminationGracePeriodSeconds: 60
probe:
readiness:
initialDelaySeconds: 5
periodSeconds: 15
timeoutSeconds: 5
failureThreshold: 3
liveness:
initialDelaySeconds: 5
periodSeconds: 15
timeoutSeconds: 5
failureThreshold: 3
serviceMonitor:
# -- Enable deployment of Service Monitor for vmstorage component. This is Prometheus operator object
enabled: false
# -- Target namespace of ServiceMonitor manifest
namespace: ""
# -- Service Monitor labels
extraLabels: {}
# -- Service Monitor annotations
annotations: {}
# Commented. Prometheus scrape interval for vmstorage component
# interval: 15s
# Commented. Prometheus pre-scrape timeout for vmstorage component
# scrapeTimeout: 5s
部署
$ kubectl create ns vm
$ helm install vm -n vm ./
# 如果需要输出渲染好的yaml文件可以添加参数--debug --dry-run
查看创建资源
[root@cilium-1 ~]# kubectl get po -n vm
NAME READY STATUS RESTARTS AGE
vm-victoria-metrics-cluster-vminsert-559db87988-cnb7g 1/1 Running 0 5m
vm-victoria-metrics-cluster-vminsert-559db87988-jm4cj 1/1 Running 0 5m
vm-victoria-metrics-cluster-vmselect-b77474bcf-6rrcz 1/1 Running 0 5m
vm-victoria-metrics-cluster-vmselect-b77474bcf-dsl4j 1/1 Running 0 5m
vm-victoria-metrics-cluster-vmstorage-0 1/1 Running 0 5m
vm-victoria-metrics-cluster-vmstorage-1 1/1 Running 0 5m
vm-victoria-metrics-cluster-vmstorage-2 1/1 Running 0 5m
部署 kube-state-metrics
kube-state-metrics[1] 是一个简单的服务,它监听 Kubernetes API 服务器并生成关于对象状态的指标。
根据集群的 k8s 版本选择合适的 kube-state-metrics 版本
kube-state-metrics | Kubernetes 1.16 | Kubernetes 1.17 | Kubernetes 1.18 | Kubernetes 1.19 | Kubernetes 1.20 |
---|---|---|---|---|---|
v1.8.0 | - | - | - | - | - |
v1.9.8 | ✓ | - | - | - | - |
v2.0.0 | - | -/✓ | -/✓ | ✓ | ✓ |
master | - | -/✓ | -/✓ | ✓ | ✓ |
✓ — Fully supported version range.
- — The Kubernetes cluster has features the client-go library can't use (additional API objects, deprecated APIs, etc).
本文选用 v2.0.0
$ git clone https://github.com/kubernetes/kube-state-metrics.git -b release-2.0
$ cd kube-state-metrics/examples/autosharding
# 主要文件
$ ls
cluster-role-binding.yaml cluster-role.yaml role-binding.yaml role.yaml service-account.yaml service.yaml statefulset.yaml
$ kubectl apply -f ./
由于网络问题可能会遇到 kube-state-metrics 镜像无法拉取的情况,可以修改 statefulset.yaml 使用 bitnami/kube-state-metrics:2.0.0
部署 node_exporter
node-exporter 用于采集服务器层面的运行指标,包括机器的 loadavg、filesystem、meminfo 等基础监控,类似于传统主机监控维度的 zabbix-agent。
采用 daemonset 的方式部署在 K8S 集群,并通过 scrape 注释。让 vmagent 自动添加 targets。
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: node-exporter
labels:
k8s-app: node-exporter
spec:
selector:
matchLabels:
k8s-app: node-exporter
template:
metadata:
labels:
k8s-app: node-exporter
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
prometheus.io/path: "/metrics"
spec:
containers:
- name: node-exporter
image: quay.io/prometheus/node-exporter:v1.1.2
ports:
- name: metrics
containerPort: 9100
args:
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--path.rootfs=/host"
volumeMounts:
- name: dev
mountPath: /host/dev
- name: proc
mountPath: /host/proc
- name: sys
mountPath: /host/sys
- name: rootfs
mountPath: /host
volumes:
- name: dev
hostPath:
path: /dev
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys
- name: rootfs
hostPath:
path: /
hostPID: true
hostNetwork: true
tolerations:
- operator: "Exists"
部署 vmagent
由于连接 etcd 需要配置证书,由于我们的集群是使用 kubeadm 部署的,所以先在主节点创建 etcd secret。
$ kubectl -n vm create secret generic etcd-certs --from-file=/etc/kubernetes/pki/etcd/healthcheck-client.crt --from-file=/etc/kubernetes/pki/etcd/healthcheck-client.key --from-file=/etc/kubernetes/pki/etcd/ca.crt
如果独立部署的 etcd 集群,同样将证书保存到集群中的一个 secret 对象中去即可。
若要监控 kube-controller-manager 和 kube-scheduler,需要修改 /etc/kubernetes/manifests 下的 kube-controller-manager.yaml 和 kube-scheduler.yaml,将 --bind-address=127.0.0.1
改为 --bind-address=0.0.0.0
---
apiVersion: v1
kind: Service
metadata:
labels:
app: vmagent-k8s
name: vmagent-k8s
namespace: vm
spec:
ports:
- port: 8429
protocol: TCP
targetPort: http
name: http
selector:
app: vmagent-k8s
type: NodePort
---
# vmagent scrapes the targets defined in the prometheus.yaml ConfigMap and
# remote-writes the samples into the vminsert service.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: vmagent-k8s
  namespace: vm
  labels:
    app: vmagent-k8s
spec:
  serviceName: "vmagent-k8s"
  replicas: 1
  selector:
    matchLabels:
      app: vmagent-k8s
  template:
    metadata:
      labels:
        app: vmagent-k8s
    spec:
      serviceAccountName: vmagent-k8s
      containers:
        - name: vmagent-k8s
          image: victoriametrics/vmagent:v1.59.0
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          args:
            - -promscrape.config=/etc/prometheus/prometheus.yaml
            - -remoteWrite.tmpDataPath=/vmtmp
            # service name follows the helm release name: with release "vm" the
            # vminsert service is vm-victoria-metrics-cluster-vminsert
            # (see the pod list above and the promxy vmselect address below)
            - -remoteWrite.url=http://vm-victoria-metrics-cluster-vminsert.vm.svc.cluster.local:8480/insert/0/prometheus/
          ports:
            - name: http
              containerPort: 8429
          volumeMounts:
            - name: time
              mountPath: /etc/localtime
              readOnly: true
            - name: config
              mountPath: /etc/prometheus/
            # 挂载 etcd secret,用于连接 etcd 接口
            - mountPath: "/etc/kubernetes/pki/etcd/"
              name: etcd-certs
            - mountPath: "/vmtmp"
              name: tmp
      volumes:
        - name: "tmp"
          emptyDir: {}
        - name: time
          hostPath:
            path: /etc/localtime
        - name: config
          configMap:
            name: vmagent-k8s
        - name: etcd-certs
          secret:
            secretName: etcd-certs
  updateStrategy:
    type: RollingUpdate
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: vmagent-k8s
namespace: vm
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: vmagent-k8s
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: vmagent-k8s
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: vmagent-k8s
subjects:
- kind: ServiceAccount
name: vmagent-k8s
namespace: vm
---
apiVersion: v1
kind: ConfigMap
metadata:
name: vmagent-k8s
namespace: vm
data:
prometheus.yaml: |-
global:
scrape_interval: 60s
scrape_timeout: 60s
external_labels:
cluster: test # 根据需求添加自定义标签
datacenter: test
scrape_configs:
- job_name: etcd
scheme: https
tls_config:
insecure_skip_verify: true
ca_file: /etc/kubernetes/pki/etcd/ca.crt
cert_file: /etc/kubernetes/pki/etcd/healthcheck-client.crt
key_file: /etc/kubernetes/pki/etcd/healthcheck-client.key
static_configs:
- targets:
- 192.168.0.1:2379
# 10259 (kube-scheduler) and 10257 (kube-controller-manager) are the secure,
# HTTPS-only serving ports — plain-HTTP scrapes against them fail, hence
# scheme: https plus the serviceaccount token/CA below.
- job_name: kube-scheduler
scheme: https
static_configs:
- targets:
- 192.168.0.1:10259
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kube-controller-manager
scheme: https
static_configs:
- targets:
- 192.168.0.1:10257
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kube-apiserver
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: default;kubernetes;https
source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-nodes
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-cadvisor
scrape_interval: 15s
scrape_timeout: 15s
kubernetes_sd_configs:
- role: node
relabel_configs:
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-service-endpoints
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: kubernetes_name
- job_name: kubernetes-pods
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_pod_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: kubernetes_pod_name
$ kubectl apply -f vmagent.yaml
可以通过浏览器访问 agent nodeport /targets 接口查看监控对象
部署 promxy
由于 VictoriaMetrics 没有查询 UI,并且不提供 Remote_read 的功能。所以可以借助第三方工具 Promxy 实现类似于 Prometheus 界面填充 promql 查询数据的功能。
promxy 是一个聚合 proxy 可以用来实现 prometheus 的 ha 详细的相关介绍可以参考 github[2] 文档,是一个值得尝试的工具,同时 VictoriaMetrics 对于自己的一些短板 也推荐了此工具。
[root@cilium-1 promxy]# cat promxy.yaml
apiVersion: v1
data:
config.yaml: |
### Promxy configuration 仅需要配置victoriametrics select组件地址及接口
promxy:
server_groups:
- static_configs:
- targets:
- vm-victoria-metrics-cluster-vmselect.vm.svc.cluster.local:8481
path_prefix: /select/0/prometheus
kind: ConfigMap
metadata:
name: promxy-config
namespace: vm
---
apiVersion: v1
kind: Service
metadata:
labels:
app: promxy
name: promxy
namespace: vm
spec:
ports:
- name: promxy
port: 8082
protocol: TCP
targetPort: 8082
type: NodePort
selector:
app: promxy
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: promxy
name: promxy
namespace: vm
spec:
replicas: 1
selector:
matchLabels:
app: promxy
template:
metadata:
labels:
app: promxy
spec:
containers:
- args:
- "--config=/etc/promxy/config.yaml"
- "--web.enable-lifecycle"
command:
- "/bin/promxy"
image: quay.io/jacksontj/promxy:latest
imagePullPolicy: Always
livenessProbe:
httpGet:
path: "/-/healthy"
port: 8082
initialDelaySeconds: 3
name: promxy
ports:
- containerPort: 8082
readinessProbe:
httpGet:
path: "/-/ready"
port: 8082
initialDelaySeconds: 3
volumeMounts:
- mountPath: "/etc/promxy/"
name: promxy-config
readOnly: true
- args:
- "--volume-dir=/etc/promxy"
- "--webhook-url=http://localhost:8082/-/reload"
image: jimmidyson/configmap-reload:v0.1
name: promxy-server-configmap-reload
volumeMounts:
- mountPath: "/etc/promxy/"
name: promxy-config
readOnly: true
volumes:
- configMap:
name: promxy-config
name: promxy-config
相关文章