Ansible 实操指南

本文介绍 Ansible 实操的相关知识。

Ansible 实操指南

学习目标

掌握 Ansible 核心模块、Playbook 编写、Inventory 管理、Roles 组织和故障排查,能从零编写生产级自动化脚本。


1. Ansible 核心理念

Ansible 三大特点:
1. 无 Agent — SSH 连接即用,无需在被控端装客户端
2. 幂等性 — 多次执行结果相同,不会重复操作
3. 声明式 — 描述期望状态,Ansible 负责达到

与 Terraform 的分工

| 维度 | Terraform | Ansible |
| --- | --- | --- |
| 用途 | 基础设施创建(VM、网络、LB) | 配置管理(装软件、改配置) |
| 状态 | 有 State,追踪资源生命周期 | 无状态,执行即忘 |
| 执行方式 | 声明式(desired state) | 声明式+命令式混合 |
| 协作模式 | Terraform provision VM → | Ansible configure |

2. 快速上手

2.1 Inventory(主机清单)

# inventory/hosts.ini
# Static INI inventory: two web hosts and one database host, all under "prod".
[webservers]
web-01 ansible_host=10.0.0.1 ansible_user=ubuntu
web-02 ansible_host=10.0.0.2 ansible_user=ubuntu

# Database hosts.
[dbservers]
db-01 ansible_host=10.0.0.10 ansible_user=ubuntu

# "prod" is a parent group; its members are the groups listed below.
[prod:children]
webservers
dbservers

# Variables applied to every host in "prod".
[prod:vars]
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_private_key_file=~/.ssh/prod_key
# Connectivity test against every host in the inventory.
ansible all -i inventory/hosts.ini -m ping

# Ad-hoc commands.
# The command module is enough here: uptime needs no shell features
# (pipes, redirects, variable expansion).
ansible webservers -i inventory/hosts.ini -m command -a "uptime"
# nginx belongs on the web tier, matching playbooks/webserver.yml.
ansible webservers -i inventory/hosts.ini -m apt -a "name=nginx state=present" --become

2.2 项目结构

ansible-project/
├── ansible.cfg              # 全局配置
├── inventory/
│   ├── hosts.ini            # 主机清单
│   └── group_vars/
│       ├── all.yml          # 全局变量
│       ├── webservers.yml   # 分组变量
│       └── prod/
│           └── vault.yml    # 加密变量
├── playbooks/
│   ├── site.yml             # 主入口 Playbook
│   ├── webserver.yml
│   └── database.yml
└── roles/
    ├── common/              # 公共角色
    ├── nginx/
    └── postgresql/

3. Playbook 核心写法

3.1 基础 Playbook

# playbooks/webserver.yml
# Installs nginx on the webservers group, renders its main config from a
# template, and makes sure the service is started and enabled.
- name: Configure web servers
  hosts: webservers
  become: true                   # run tasks with privilege escalation (sudo)

  vars:
    nginx_port: 80
    app_version: "1.2.3"

  tasks:
    - name: Update apt cache
      apt:
        update_cache: true
        cache_valid_time: 3600   # skip the refresh if the cache is under 1 hour old

    - name: Install nginx
      apt:
        name: nginx
        state: present

    - name: Copy nginx config
      template:
        src: templates/nginx.conf.j2
        dest: /etc/nginx/nginx.conf
        mode: "0644"
        # Test the rendered file before it replaces the live config, so a
        # template mistake fails this task instead of breaking the reload.
        validate: nginx -t -c %s
      notify: reload nginx       # handler runs only when the file changed

    - name: Start nginx
      service:
        name: nginx
        state: started
        enabled: true

  handlers:
    - name: reload nginx
      service:
        name: nginx
        state: reloaded

3.2 条件与循环

# conditions.yml
# Task snippets showing `when` conditions and `loop` iteration.

# Runs only on hosts that are members of the "prod" inventory group.
- name: Install monitoring agent (prod only)
  apt:
    name: datadog-agent
    state: present
  when: inventory_hostname in groups['prod']

# One user per loop item; items without a "groups" key fall back to ''.
- name: Create multiple users
  user:
    name: "{{ item.name }}"
    groups: "{{ item.groups | default('') }}"
    state: present
  loop:
    - name: "deploy"
      groups: "docker"
    - name: "monitor"
      groups: "docker,sudo"
    - name: "backup"

# `loop` is templated before `when` is evaluated for each item, so guarding
# with `when: firewall_ports is defined` would still fail on an undefined
# variable. Defaulting to an empty list turns the task into a no-op instead.
- name: Open firewall ports
  ufw:
    rule: allow
    port: "{{ item }}"
    proto: tcp
  loop: "{{ firewall_ports | default([]) }}"

3.3 批量服务器运维实战

# playbooks/patch-all.yml — rolling security update for every host
- name: Security patch all servers
  hosts: all
  serial: "30%"             # roll out in batches of 30% of the hosts
  become: true

  tasks:
    # Read-only probe: prints the used percentage of / as a bare number.
    # The task fails (stopping the host before patching) above 90% usage.
    - name: Check disk space before patching
      shell: df -h / | awk 'NR==2 {print $5}' | tr -d '%'
      register: disk_usage
      changed_when: false   # nothing is modified, so never report "changed"
      failed_when: disk_usage.stdout | int > 90

    - name: Update packages
      apt:
        upgrade: safe
        update_cache: true
      register: patch_result

    # The reboot task below runs only when this file exists.
    - name: Check if reboot required
      stat:
        path: /var/run/reboot-required
      register: reboot_required

    - name: Reboot if needed
      reboot:
        reboot_timeout: 300
      when: reboot_required.stat.exists

4. Jinja2 模板引擎

{# templates/nginx.conf.j2 #}
{# nginx main configuration: one upstream built from the webservers group and one proxying server block. #}
user www-data;
{# Worker count comes from the ansible_processor_vcpus fact. #}
worker_processes {{ ansible_processor_vcpus }};
worker_rlimit_nofile {{ nginx_worker_rlimit_nofile | default(65535) }};

events {
    worker_connections {{ nginx_worker_connections | default(4096) }};
}

http {
    upstream backend {
        {# One "server" line per host in the webservers inventory group; port defaults to 8080. #}
        {% for host in groups['webservers'] %}
        server {{ hostvars[host].ansible_host }}:{{ app_port | default(8080) }};
        {% endfor %}
    }

    server {
        listen {{ nginx_port }};
        server_name {{ server_name }};
        
        location / {
            proxy_pass http://backend;
            proxy_set_header Host $host;
        }
        
        {# TLS listener and certificate paths are emitted only when enable_ssl is truthy (default: false). #}
        {% if enable_ssl | default(false) %}
        listen 443 ssl;
        ssl_certificate /etc/ssl/{{ server_name }}.crt;
        ssl_certificate_key /etc/ssl/{{ server_name }}.key;
        {% endif %}
    }
}

Jinja2 常用技巧

# Default value: use 'fallback' when some_var is undefined
{{ some_var | default('fallback') }}

# Inline conditional: 'prod' when env equals 'production', otherwise 'dev'
{{ 'prod' if env == 'production' else 'dev' }}

# List filtering: drop entries of groups['all'] that match 'backup'
{{ groups['all'] | reject('search', 'backup') | list }}

# Address and netmask of the default IPv4 interface, joined as address/netmask
{{ ansible_default_ipv4.address }}/{{ ansible_default_ipv4.netmask }}

5. Ansible Vault — 密钥管理

# Create a new encrypted file for sensitive values
ansible-vault create inventory/group_vars/prod/vault.yml
# After entering the password, edit the file to contain, for example:
# db_password: "SuperSecret123"
# api_key: "sk-abc123"

# Encrypt an existing file
ansible-vault encrypt inventory/group_vars/prod/secrets.yml

# Decrypt at run time: prompt for the password, or read it from a file
ansible-playbook site.yml --ask-vault-pass
ansible-playbook site.yml --vault-password-file ~/.vault_pass

# Edit an encrypted file
ansible-vault edit inventory/group_vars/prod/vault.yml
# Using vault-encrypted variables in a playbook
- name: Deploy app with secrets
  hosts: app
  vars_files:
    - inventory/group_vars/prod/vault.yml

  tasks:
    # The secret is handed to the script through the task environment rather
    # than templated into the shell text, so quotes, `$` or backticks in the
    # password cannot break or alter the command line.
    - name: Set database password
      shell: ./deploy.sh
      environment:
        DB_PASSWORD: "{{ db_password }}"
      no_log: true  # keep the password out of task output and logs

6. 常用运维场景速查

6.1 批量创建用户 + SSH Key

# Creates one sudo-capable account per operator and authorizes their SSH key.
- name: Add operators
  hosts: all
  become: true
  vars:
    # One entry per operator: login name plus the public key to authorize.
    users:
      - name: "zhangsan"
        key: "ssh-rsa AAAAB3... zhangsan@work"
      - name: "lisi"
        key: "ssh-rsa AAAAB4... lisi@work"

  tasks:
    - name: Create user
      user:
        name: "{{ item.name }}"
        groups: sudo
        # Without append, `groups` replaces the account's supplementary
        # groups, which would drop an existing user from every other group.
        append: true
        shell: /bin/bash
        state: present
      loop: "{{ users }}"

    - name: Add SSH authorized key
      authorized_key:
        user: "{{ item.name }}"
        key: "{{ item.key }}"
        state: present
      loop: "{{ users }}"

6.2 批量安装 Docker

# Installs Docker CE from Docker's apt repository and adds the connection
# user to the docker group.
- name: Install Docker on all servers
  hosts: all
  become: true

  vars:
    # Map the kernel architecture fact to the Debian architecture name used
    # in the repository line; anything unmapped falls back to amd64.
    docker_apt_arch: "{{ {'x86_64': 'amd64', 'aarch64': 'arm64'}.get(ansible_architecture, 'amd64') }}"
    docker_apt_keyring: /etc/apt/keyrings/docker.asc

  tasks:
    - name: Install prerequisites
      apt:
        name:
          - ca-certificates
          - curl
          - gnupg
        state: present
        update_cache: true       # freshly provisioned hosts may have no package lists yet

    - name: Create apt keyring directory
      file:
        path: /etc/apt/keyrings
        state: directory
        mode: "0755"

    # apt_key is deprecated; keep the key in its own keyring file and
    # reference it with signed-by so it is trusted for this repository only.
    - name: Add Docker GPG key
      get_url:
        url: https://download.docker.com/linux/ubuntu/gpg
        dest: "{{ docker_apt_keyring }}"
        mode: "0644"

    # Hosts configured by the earlier form of this play still carry a repo
    # line without signed-by; apt rejects two entries for the same repository
    # with different Signed-By values, so remove the old one first.
    - name: Remove legacy Docker repository entry (no signed-by)
      apt_repository:
        repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
        state: absent
        update_cache: false

    - name: Add Docker repository
      apt_repository:
        repo: >-
          deb [arch={{ docker_apt_arch }} signed-by={{ docker_apt_keyring }}]
          https://download.docker.com/linux/ubuntu
          {{ ansible_distribution_release }} stable
        state: present

    - name: Install Docker
      apt:
        name:
          - docker-ce
          - docker-compose-plugin
        state: present

    - name: Add users to docker group
      user:
        name: "{{ ansible_user }}"
        groups: docker
        append: true             # keep the user's existing groups

6.3 Nginx 零停机部署

# Rolling deploy across the web tier: each host is drained from the load
# balancer, updated, health-checked, and only then put back into rotation.
- name: Zero-downtime deploy
  become: true
  hosts: webservers
  serial: 1               # one host per batch

  pre_tasks:
    - name: Drain from load balancer
      uri:
        method: POST
        url: "http://lb.internal:9090/drain/{{ inventory_hostname }}"

  tasks:
    - name: Deploy new version
      copy:
        dest: /opt/app/app.jar
        src: "files/app-{{ app_version }}.jar"

    - name: Restart app
      systemd:
        state: restarted
        name: app

    # Poll the local health endpoint: up to 30 retries with a delay of 2.
    - name: Wait for health check
      uri:
        status_code: 200
        url: "http://localhost:8080/health"
      register: health
      retries: 30
      delay: 2
      until: health.status == 200

  post_tasks:
    - name: Re-add to load balancer
      uri:
        method: POST
        url: "http://lb.internal:9090/undrain/{{ inventory_hostname }}"

7. 故障排查速查

| 症状 | 排查命令 | 说明 |
| --- | --- | --- |
| 连接超时 | `ansible all -m ping -vvvv` | -vvvv 显示完整 SSH 连接过程 |
| Playbook 语法错误 | `ansible-playbook site.yml --syntax-check` | 不执行,只检查 |
| 不知道会改什么 | `ansible-playbook site.yml --check --diff` | dry-run + diff |
| 某台机器跳过执行 | `--limit` 指定主机 | `ansible-playbook site.yml --limit web-01` |
| 某 task 总是 changed | 检查幂等性 | 为 command/shell 任务设置 changed_when,并查看 handler 是否被正确 notify |
| Jinja2 变量未定义 | `{{ var \| default('') }}` | 用 default 过滤器为变量提供默认值 |
| sudo 权限不够 | `--become` + 检查 sudoers | 确认用户有 NOPASSWD |

8. 生产实践 checklist

# ansible.cfg — suggested production settings
# Comments sit on their own lines: ansible.cfg treats "#" as a comment
# marker only at the start of a line, so a trailing "# ..." after a value
# would be read as part of that value.
[defaults]
# Do not prompt for host-key confirmation on first connect.
# NOTE(review): this turns off SSH host key checking; confirm that is
# acceptable for production, or pre-populate known_hosts instead.
host_key_checking = False
# Do not write .retry files.
retry_files_enabled = False
# Cache facts to speed up runs.
gathering = smart
# Number of parallel forks.
forks = 20
timeout = 30
# More readable output format.
stdout_callback = yaml

[ssh_connection]
# Fewer SSH operations per task (requires sudo's requiretty to be disabled).
pipelining = True
control_path = /tmp/ansible-%%h-%%p-%%r