第十周作业

prometheus 部署

包安装

# 查询支持的版本
apt-cache madison prometheus 
prometheus | 2.31.2+ds1-1ubuntu1.22.04.2 | http://mirrors.aliyun.com/ubuntu jammy-security/universe amd64 Packages
prometheus | 2.31.2+ds1-1ubuntu1.22.04.2 | http://mirrors.aliyun.com/ubuntu jammy-updates/universe amd64 Packages
prometheus | 2.31.2+ds1-1ubuntu1.22.04.2 | http://cn.archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages
prometheus | 2.31.2+ds1-1ubuntu1.22.04.2 | http://cn.archive.ubuntu.com/ubuntu jammy-security/universe amd64 Packages
prometheus | 2.31.2+ds1-1ubuntu1 | http://mirrors.aliyun.com/ubuntu jammy/universe amd64 Packages
prometheus | 2.31.2+ds1-1ubuntu1 | http://cn.archive.ubuntu.com/ubuntu jammy/universe amd64 Packages

# 安装指定版本
apt install prometheus=2.31.2+ds1-1ubuntu1.22.04.2

# 启动服务
systemctl start prometheus.service

# 检查服务
ss -tulnp | grep -i 9090
tcp   LISTEN 0      4096               *:9090            *:*    users:(("prometheus",pid=2619,fd=4))   

默认service文件

[Unit] 
Description=Monitoring system and time series database
Documentation=https://prometheus.io/docs/introduction/overview/ man:prometheus(1)
After=time-sync.target

[Service]
Restart=on-failure
User=prometheus
EnvironmentFile=/etc/default/prometheus
ExecStart=/usr/bin/prometheus $ARGS
ExecReload=/bin/kill -HUP $MAINPID
TimeoutStopSec=20s
SendSIGKILL=no

# systemd hardening-options
AmbientCapabilities=
CapabilityBoundingSet=
DeviceAllow=/dev/null rw
DevicePolicy=strict
LimitMEMLOCK=0
LimitNOFILE=8192
LockPersonality=true
MemoryDenyWriteExecute=true
NoNewPrivileges=true
PrivateDevices=true
PrivateTmp=true
PrivateUsers=true
ProtectControlGroups=true
ProtectHome=true
ProtectKernelModules=true
ProtectKernelTunables=true
ProtectSystem=full
RemoveIPC=true
RestrictNamespaces=true
RestrictRealtime=true
SystemCallArchitectures=native
[Install]
WantedBy=multi-user.target

默认配置文件

global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  external_labels:
      monitor: 'example'
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['localhost:9093']
rule_files:
scrape_configs:
  - job_name: 'prometheus'
    scrape_interval: 5s
    scrape_timeout: 5s
    static_configs:
      - targets: ['localhost:9090']
  - job_name: node
    static_configs:
      - targets: ['localhost:9100']

源代码

有更自由的版本安装选择,如最新的版本:
github release
prometheus download page

# 下载软件包
v=2.53.1
cd /usr/local
wget https://github.com/prometheus/prometheus/releases/download/v$v/prometheus-$v.linux-amd64.tar.gz

# 解压包
tar -xvf prometheus-$v.linux-amd64.tar.gz
ln -sv "prometheus-$v.linux-amd64/" prometheus

# 追加路径
# Add prometheus bin path
# 注意: 使用带引号的 'EOF', 避免 $PATH 在写入文件时被当前 shell 提前展开
cat > /etc/profile.d/prometheus.sh <<'EOF'
PATH=/usr/local/prometheus:$PATH
EOF

source /etc/profile.d/prometheus.sh

# 用户创建及权限
useradd -M -r prometheus

# 创建service文件
vim /etc/systemd/system/prometheus.service
mkdir -p /usr/local/prometheus/data # 存储数据
chown -R prometheus:prometheus prometheus
chown -R prometheus:prometheus prometheus/

# 启动服务
systemctl daemon-reload
systemctl start prometheus

默认service文件

# /etc/systemd/system/prometheus.service

[Unit]
Description=Monitoring system and time series database
Documentation=https://prometheus.io/docs/introduction/overview/
After=network-online.target

[Service]
Restart=always
User=prometheus
Group=prometheus
# 注意: EnvironmentFile 只能指向 KEY=VALUE 格式的环境变量文件, 不能指向 prometheus.yml; 配置文件已通过下方的 --config.file 指定
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --storage.tsdb.path=/usr/local/prometheus/data \
  --storage.tsdb.retention.time=60d \
  --web.enable-lifecycle \
  --web.listen-address=:9090
ExecReload=/bin/kill -HUP $MAINPID
TimeoutStopSec=20s
SendSIGKILL=no
LimitNOFILE=8192

[Install]
WantedBy=multi-user.target

# 说明
# --config.file=/usr/local/prometheus/prometheus.yml                      #主配置文件
# --storage.tsdb.path=/usr/local/prometheus/data                          #数据库存储目录
# --web.console.libraries=/usr/local/prometheus/console_libraries         #指定控制台库目录路径(可选)
# --web.console.templates=/usr/local/prometheus/consoles                  #指定控制台模版目录路径(可选)
# --storage.tsdb.retention.time=60d                                       #指明数据保留天数,默认15天
# --web.enable-lifecycle                                                  #热加载(启用 /-/reload 和 /-/quit 接口)
# --web.listen-address=:9090                                              #默认监听端口

默认配置文件

global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
rule_files:
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

容器部署

# 准备好prometheus.yml文件
# eg:
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
rule_files:
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

# 创建容器
v=latest
mkdir -p /data/prometheus
chown -R nobody:nobody /data/prometheus/

docker run --name prometheus -d -p 9090:9090 \
-v /path/to/prometheus.yml:/etc/prometheus/prometheus.yml \
-v /data/prometheus:/prometheus \
-v /etc/localtime:/etc/localtime:ro \
prom/prometheus:$v

# 查看容器运行
docker ps
CONTAINER ID   IMAGE                    COMMAND                  CREATED          STATUS          PORTS                                       NAMES
9a652b44f878   prom/prometheus:latest   "/bin/prometheus --c…"   11 minutes ago   Up 11 minutes   0.0.0.0:9090->9090/tcp, :::9090->9090/tcp   prometheus

故障排查

当 /data/prometheus 目录的属主设置为 prometheus 时,容器运行报错: "/prometheus/queries.active: permission denied"

解决方式

  1. 进入容器使用 whoami 查询,运行用户为 nobody
  2. 修改 /data/prometheus 目录权限
    sudo mkdir -p /data/prometheus
    sudo chown -R 65534:65534 /data/prometheus
    OR
    sudo chown -R nobody:nobody /data/prometheus

K8S部署

# 创建ns
kubectl create namespace prometheus

# 创建配置文件 prometheus-config.yaml
kubectl apply -f prometheus-config.yaml -n prometheus

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-server-conf
  namespace: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    scrape_configs:
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

# 创建对应deployment prometheus-deployment.yaml
kubectl apply -f prometheus-deployment.yaml -n prometheus

apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-server
  namespace: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus-server
  template:
    metadata:
      labels:
        app: prometheus-server
    spec:
      containers:
        - name: prometheus
          image: prom/prometheus
          ports:
            - containerPort: 9090
          volumeMounts:
            - name: config-volume
              mountPath: /etc/prometheus
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-server-conf
            defaultMode: 420

# 暴露服务 prometheus-service.yaml
kubectl apply -f prometheus-service.yaml -n prometheus

apiVersion: v1
kind: Service
metadata:
  name: prometheus-service
  namespace: prometheus
spec:
  selector:
    app: prometheus-server
  ports:
    - protocol: TCP
      port: 9090
      targetPort: 9090
  type: LoadBalancer

# 检查应用
kubectl get service prometheus-service -n prometheus
NAME                 TYPE           CLUSTER-IP      EXTERNAL-IP      PORT(S)          AGE
prometheus-service   LoadBalancer   10.99.133.211   192.168.100.51   9090:32033/TCP   12m

监控主机

二进制

# 源下载安装, 需要自行配置service文件
https://prometheus.io/download/#node_exporter

v=1.8.2
wget https://github.com/prometheus/node_exporter/releases/download/v$v/node_exporter-$v.linux-amd64.tar.gz

#解压软件包
tar -xvf "node_exporter-$v.linux-amd64.tar.gz" -C /opt/

#做软连接
cd /opt
mv node_exporter-$v.linux-amd64 node_exporter-$v
ln -s node_exporter-$v node_exporter

# 修改权限
chown prometheus:prometheus /opt/node_exporter/node_exporter

#做systemd启动文件
cat > /usr/lib/systemd/system/node_exporter.service <<EOF
[Unit]
Description=Prometheus node_exporter

[Service]
User=prometheus
Group=prometheus
ExecStart=/opt/node_exporter/node_exporter --log.level=error
ExecStop=/usr/bin/killall node_exporter

[Install]
WantedBy=default.target
EOF

# 加载service文件并启动服务
systemctl daemon-reload
systemctl enable --now node_exporter.service

包安装

# 查询所有的 exporter
apt-cache search 'prometheus-.*-exporter'

# 过滤 node 关键字
apt-cache search 'prometheus-.*-exporter' | grep node

# 查询版本
apt-cache madison prometheus-node-exporter

# 安装
apt install prometheus-node-exporter=1.3.1-1ubuntu0.22.04.2

# 修改权限
chown prometheus:prometheus /usr/bin/prometheus-node-exporter
chown prometheus:prometheus /etc/default/prometheus-node-exporter

# 启动服务
systemctl start  prometheus-node-exporter.service

测试访问

# 检查端口状态
ss -tulnp | grep 9100
tcp   LISTEN 0      4096               *:9100            *:*    users:(("prometheus-node",pid=62699,fd=3))

# 访问测试
curl -sq 127.0.0.1:9100/metrics | grep node_cpu_seconds_total
# HELP node_cpu_seconds_total Seconds the CPUs spent in each mode.
# TYPE node_cpu_seconds_total counter
node_cpu_seconds_total{cpu="0",mode="idle"} 47755.93
node_cpu_seconds_total{cpu="0",mode="iowait"} 6.04
node_cpu_seconds_total{cpu="0",mode="irq"} 0
node_cpu_seconds_total{cpu="0",mode="nice"} 0.12
...

监控配置

scrape_configs:
  - job_name: "localhost-node"
    static_configs:
      - targets: ["localhost:9100"]


容器监控

cAdvisor

cAdvisor是Google开源的一款用于展示和分析容器运行状态的可视化工具。通过在主机上运行cAdvisor,用户可以轻松地获取到当前主机上容器的运行统计信息,并以图表的形式向用户展示。

!!! warning 注意事项

cAdvisor默认只保存2分钟的监控数据。

用户可通过访问 http://localhost:8080/metrics ,获取其暴露的 metrics
metric explain
!!!

  1. 创建cadvisor容器

    # 启动 cadvisor
    docker run \
    --volume=/:/rootfs:ro \
    --volume=/var/run:/var/run:ro \
    --volume=/sys:/sys:ro \
    --volume=/var/lib/docker/:/var/lib/docker:ro \
    --volume=/dev/disk/:/dev/disk:ro \
    --publish=8080:8080 \
    --detach=true \
    --name=cadvisor \
    --privileged \
    --device=/dev/kmsg \
    gcr.io/cadvisor/cadvisor:latest  
  2. 配置prometheus.yml中的采集任务

    - job_name: cadvisor
      static_configs:
      - targets:
        - localhost:8080
  3. 查询cAdvisor数据

参考