Ansible 实操指南
学习 Ansible 实操指南 相关知识
Ansible 实操指南
学习目标
掌握 Ansible 核心模块、Playbook 编写、Inventory 管理、Roles 组织和故障排查,能从零编写生产级自动化脚本。
1. Ansible 核心理念
Ansible 三大特点:
1. 无 Agent — SSH 连接即用,无需在被控端装客户端
2. 幂等性 — 多次执行结果相同,不会重复操作
3. 声明式 — 描述期望状态,Ansible 负责达到
与 Terraform 的分工
| 维度 | Terraform | Ansible |
|---|---|---|
| 用途 | 基础设施创建(VM、网络、LB) | 配置管理(装软件、改配置) |
| 状态 | 有 State,追踪资源生命周期 | 无状态,执行即忘 |
| 执行方向 | 声明式(desired state) | 声明式+命令式混合 |
| 协作模式 | 先由 Terraform provision VM | 再由 Ansible configure |
2. 快速上手
2.1 Inventory(主机清单)
# inventory/hosts.ini
# Static inventory: two leaf groups plus a parent group that carries shared
# connection settings.
# Hosts in the webservers group (alias, address to connect to, SSH login user)
[webservers]
web-01 ansible_host=10.0.0.1 ansible_user=ubuntu
web-02 ansible_host=10.0.0.2 ansible_user=ubuntu
# Hosts in the dbservers group
[dbservers]
db-01 ansible_host=10.0.0.10 ansible_user=ubuntu
# prod is a parent group whose members are the two groups listed below
[prod:children]
webservers
dbservers
# Variables applied to every host that belongs to prod
[prod:vars]
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_private_key_file=~/.ssh/prod_key
# Connectivity test against every host in the inventory
ansible all --inventory inventory/hosts.ini --module-name ping
# Ad-hoc (one-off) commands
ansible webservers --inventory inventory/hosts.ini --module-name shell --args "uptime"
ansible dbservers --inventory inventory/hosts.ini --module-name apt --args "name=nginx state=present" --become
2.2 项目结构
ansible-project/
├── ansible.cfg                 # 全局配置
├── inventory/
│   ├── hosts.ini               # 主机清单
│   └── group_vars/
│       ├── all.yml             # 全局变量
│       ├── webservers.yml      # 分组变量
│       └── prod/
│           └── vault.yml       # 加密变量
├── playbooks/
│   ├── site.yml                # 主入口 Playbook
│   ├── webserver.yml
│   └── database.yml
└── roles/
    ├── common/                 # 公共角色
    ├── nginx/
    └── postgresql/
3. Playbook 核心写法
3.1 基础 Playbook
# playbooks/webserver.yml
# Installs nginx on the webservers group, renders its main config from a
# template, and makes sure the service is running and enabled at boot.
---
- name: Configure web servers
  hosts: webservers
  become: true  # run tasks via sudo
  vars:
    nginx_port: 80
    app_version: "1.2.3"

  tasks:
    - name: Update apt cache
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 3600  # skip the refresh if the cache is under 1 hour old

    - name: Install nginx
      ansible.builtin.apt:
        name: nginx
        state: present

    - name: Copy nginx config
      ansible.builtin.template:
        src: templates/nginx.conf.j2
        dest: /etc/nginx/nginx.conf
        owner: root
        group: root
        mode: "0644"
        # Test the rendered file before it replaces the live config, so a
        # template mistake cannot break the running server on reload.
        validate: nginx -t -c %s
      notify: reload nginx  # triggers the handler below when the file changes

    - name: Start nginx
      ansible.builtin.service:
        name: nginx
        state: started
        enabled: true

  handlers:
    - name: reload nginx
      ansible.builtin.service:
        name: nginx
        state: reloaded
3.2 条件与循环
# conditions.yml
# Task-list examples of `when` conditions and `loop` iteration.
---
- name: Install monitoring agent (prod only)
  ansible.builtin.apt:
    name: datadog-agent
    state: present
  # group_names lists the groups the current host belongs to; unlike
  # groups['prod'], it does not raise an error when no 'prod' group exists.
  when: "'prod' in group_names"

- name: Create multiple users
  ansible.builtin.user:
    name: "{{ item.name }}"
    # omit drops the parameter for entries that define no groups, instead of
    # passing '' (which removes the user from all supplementary groups).
    groups: "{{ item.groups | default(omit) }}"
    state: present
  loop:
    - name: deploy
      groups: docker
    - name: monitor
      groups: docker,sudo
    - name: backup

- name: Open firewall ports
  community.general.ufw:
    rule: allow
    port: "{{ item }}"
    proto: tcp
  # The loop expression is templated before any `when` is evaluated, so an
  # undefined firewall_ports has to be defaulted here for the task to skip.
  loop: "{{ firewall_ports | default([]) }}"
3.3 批量服务器运维实战
# playbooks/patch-all.yml — rolling security updates for every host
---
- name: Security patch all servers
  hosts: all
  serial: "30%"  # process the inventory in batches of 30% of the hosts
  become: true

  tasks:
    - name: Check disk space before patching
      # -P forces POSIX single-line output, so the usage column is always
      # field 5 of line 2 even when the device name is long.
      ansible.builtin.shell: df -P / | awk 'NR==2 {print $5}' | tr -d '%'
      register: disk_usage
      changed_when: false  # read-only probe; never report "changed"
      # failed_when replaces the default return-code check, so also fail when
      # the pipeline printed nothing instead of treating that as 0% used.
      failed_when: >-
        (disk_usage.stdout | trim | length) == 0
        or (disk_usage.stdout | int) > 90

    - name: Update packages
      ansible.builtin.apt:
        upgrade: safe
        update_cache: true
      register: patch_result

    - name: Check if reboot required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: reboot_required

    - name: Reboot if needed
      ansible.builtin.reboot:
        reboot_timeout: 300  # seconds to wait for the host to come back
      when: reboot_required.stat.exists
4. Jinja2 模板引擎
{# templates/nginx.conf.j2 #}
{# Main nginx config: one server block that proxies all traffic to the "backend" upstream. #}
user www-data;
worker_processes {{ ansible_processor_vcpus }};
worker_rlimit_nofile {{ nginx_worker_rlimit_nofile | default(65535) }};

events {
    worker_connections {{ nginx_worker_connections | default(4096) }};
}

http {
    upstream backend {
        {# One upstream entry per host in the webservers inventory group #}
        {% for host in groups['webservers'] %}
        server {{ hostvars[host].ansible_host }}:{{ app_port | default(8080) }};
        {% endfor %}
    }

    server {
        listen {{ nginx_port }};
        server_name {{ server_name }};

        location / {
            proxy_pass http://backend;
            proxy_set_header Host $host;
        }

        {# TLS listener and certificate paths, emitted only when enable_ssl is truthy #}
        {% if enable_ssl | default(false) %}
        listen 443 ssl;
        ssl_certificate /etc/ssl/{{ server_name }}.crt;
        ssl_certificate_key /etc/ssl/{{ server_name }}.key;
        {% endif %}
    }
}
Jinja2 常用技巧
# Fall back to a default value when the variable is undefined
{{ some_var | default('fallback') }}
# Inline conditional (ternary) expression
{{ 'prod' if env == 'production' else 'dev' }}
# Filter a list: drop every entry that matches the pattern 'backup'
{{ groups['all'] | reject('search', 'backup') | list }}
# Build "address/netmask" from the default IPv4 facts (no filter involved)
{{ ansible_default_ipv4.address }}/{{ ansible_default_ipv4.netmask }}
5. Ansible Vault — 密钥管理
# Create a new encrypted file for sensitive variables
ansible-vault create inventory/group_vars/prod/vault.yml
# After entering the vault password, add content such as:
# db_password: "SuperSecret123"
# api_key: "sk-abc123"
# Encrypt a file that already exists
ansible-vault encrypt inventory/group_vars/prod/secrets.yml
# Decrypt at run time: prompt for the password, or read it from a file
ansible-playbook site.yml --ask-vault-pass
ansible-playbook site.yml --vault-password-file ~/.vault_pass
# Edit an encrypted file
ansible-vault edit inventory/group_vars/prod/vault.yml
# Using vault-encrypted variables in a playbook
---
- name: Deploy app with secrets
  hosts: app
  vars_files:
    - inventory/group_vars/prod/vault.yml

  tasks:
    - name: Set database password
      ansible.builtin.shell: ./deploy.sh
      # Hand the secret over as an environment variable instead of templating
      # it into the script text, where quotes, backticks or `$` inside the
      # password would be interpreted by the shell.
      environment:
        DB_PASSWORD: "{{ db_password }}"
      no_log: true  # keep the password out of task output and logs
6. 常用运维场景速查
6.1 批量创建用户 + SSH Key
# Create operator accounts on every host and install their SSH public keys.
---
- name: Add operators
  hosts: all
  become: true
  vars:
    users:
      - name: zhangsan
        key: "ssh-rsa AAAAB3... zhangsan@work"
      - name: lisi
        key: "ssh-rsa AAAAB4... lisi@work"

  tasks:
    - name: Create user
      ansible.builtin.user:
        name: "{{ item.name }}"
        groups: sudo
        # Without append, an account that already exists would be removed
        # from every supplementary group other than sudo.
        append: true
        shell: /bin/bash
        state: present
      loop: "{{ users }}"

    - name: Add SSH authorized key
      ansible.posix.authorized_key:
        user: "{{ item.name }}"
        key: "{{ item.key }}"
        state: present
      loop: "{{ users }}"
6.2 批量安装 Docker
# Install Docker Engine from Docker's official apt repository.
---
- name: Install Docker on all servers
  hosts: all
  become: true
  vars:
    docker_keyring: /etc/apt/keyrings/docker.asc
    docker_repo_url: https://download.docker.com/linux/ubuntu

  tasks:
    - name: Install prerequisites
      ansible.builtin.apt:
        name:
          - ca-certificates
          - curl
          - gnupg
        state: present

    - name: Ensure apt keyring directory exists
      ansible.builtin.file:
        path: /etc/apt/keyrings
        state: directory
        mode: "0755"

    # apt_key is deprecated: it wraps apt-key and trusts the key for every
    # repository. Store the key in its own file and scope it with signed-by.
    - name: Add Docker GPG key
      ansible.builtin.get_url:
        url: "{{ docker_repo_url }}/gpg"
        dest: "{{ docker_keyring }}"
        mode: "0644"

    # A leftover entry for the same repository without signed-by would make
    # apt fail with a conflicting Signed-By error, so drop it first.
    - name: Remove legacy Docker repository entry without signed-by
      ansible.builtin.apt_repository:
        repo: "deb [arch=amd64] {{ docker_repo_url }} {{ ansible_distribution_release }} stable"
        state: absent

    - name: Add Docker repository
      ansible.builtin.apt_repository:
        repo: >-
          deb [arch=amd64 signed-by={{ docker_keyring }}]
          {{ docker_repo_url }} {{ ansible_distribution_release }} stable
        state: present

    - name: Install Docker
      ansible.builtin.apt:
        name:
          - docker-ce
          - docker-compose-plugin
        state: present

    - name: Add users to docker group
      ansible.builtin.user:
        name: "{{ ansible_user }}"
        groups: docker
        append: true  # keep the user's existing supplementary groups
6.3 Nginx 零停机部署
# Rolling deploy: take one web server out of the load balancer, update it,
# wait until its health endpoint answers 200, then put it back.
---
- name: Zero-downtime deploy
  hosts: webservers
  serial: 1  # one host per batch
  become: true

  pre_tasks:
    - name: Drain from load balancer
      ansible.builtin.uri:
        method: POST
        url: "http://lb.internal:9090/drain/{{ inventory_hostname }}"

  tasks:
    - name: Deploy new version
      ansible.builtin.copy:
        dest: /opt/app/app.jar
        src: "files/app-{{ app_version }}.jar"

    - name: Restart app
      ansible.builtin.systemd:
        state: restarted
        name: app

    - name: Wait for health check
      ansible.builtin.uri:
        status_code: 200
        url: "http://localhost:8080/health"
      register: health_probe
      # Poll up to 30 times, 2 seconds apart, until the endpoint returns 200
      retries: 30
      delay: 2
      until: health_probe.status == 200

  post_tasks:
    - name: Re-add to load balancer
      ansible.builtin.uri:
        method: POST
        url: "http://lb.internal:9090/undrain/{{ inventory_hostname }}"
7. 故障排查速查
| 症状 | 排查命令 | 说明 |
|---|---|---|
| 连接超时 | ansible all -m ping -vvvv | -vvvv 显示完整 SSH 连接过程 |
| Playbook 语法错误 | ansible-playbook site.yml --syntax-check | 不执行,只检查 |
| 不知道会改什么 | ansible-playbook site.yml --check --diff | dry-run + diff |
| 某台机器跳过执行 | ansible-playbook site.yml --limit web-01 | 用 --limit 指定主机单独执行 |
| 某 task 总是 changed | 检查幂等性 | command/shell 任务需设置 changed_when 或改用幂等模块;并确认 handler 没有因此被反复 notify |
| Jinja2 变量未定义 | `{{ var \| default('') }}` | 用 default 过滤器为变量提供兜底值 |
| sudo 权限不够 | --become + 检查 sudoers | 确认用户有 NOPASSWD |
8. 生产实践 checklist
# ansible.cfg — recommended production settings
# NOTE: ansible.cfg accepts '#' only for whole-line comments; inline comments
# must use ';'. Text after an inline '#' becomes part of the value, so every
# comment below sits on its own line.
[defaults]
# Do not prompt to confirm unknown SSH host keys on first connection.
# NOTE(review): this also disables protection against SSH man-in-the-middle;
# consider pre-populating known_hosts instead — confirm for your environment.
host_key_checking = False
# Do not create .retry files
retry_files_enabled = False
# Gather facts only when they are not already cached, to speed up runs
gathering = smart
# Number of parallel forks
forks = 20
timeout = 30
# More readable output format
stdout_callback = yaml

[ssh_connection]
# Fewer SSH operations per task (requires sudo's requiretty to be disabled)
pipelining = True
control_path = /tmp/ansible-%%h-%%p-%%r