system-config/playbooks/zuul_reboot.yaml
Clark Boylan 03987a8009 Run zuul cluster reboots and updates automatically
This adds a weekly cronjob that will reboot and update our entire zuul
cluster gracefully. The time frame chosen for this should be after North
America begins its weekend and before Europe starts their week. The idea
is that we're doing this during the quiet time of our week.

Change-Id: Ib9a54f273e11744fa1ddbf367c291289f86bddb7
2022-06-21 16:00:42 -07:00

111 lines
3.6 KiB
YAML

# Concurrency guard: this playbook relies on being run under
# flock -n /var/run/ansible/zuul_reboot.lock so that multiple copies of it
# never run at the same time.
# TODO: stop pulling in the hourly job if we do this
- name: 'Ensure we are going to restart/reboot on the same image'
  import_playbook: zuul_pull.yaml
# TODO Do we want to force disabled servers to be rebooted too?
# Executors: the host pattern excludes anything in the "disabled" group, and
# serial: 1 runs the full task list on one host before moving to the next.
- name: "Reboot zuul-executors gracefully one at a time"
  hosts: "zuul-executor:!disabled"
  serial: 1
  tasks:
    # Runs the "graceful" task file from the zuul-executor role.
    - name: Gracefully stop the executor
      include_role:
        name: zuul-executor
        tasks_from: graceful

    - name: Upgrade executor server packages
      apt:
        update_cache: true
        # "upgrade" is a string choice of the apt module, not a boolean;
        # quoting keeps YAML from converting it to True.
        upgrade: "yes"

    # No arguments are passed, so the reboot module defaults apply.
    - name: Reboot the executor server
      reboot:

    # Runs the "start" task file from the zuul-executor role.
    - name: Start the executor
      include_role:
        name: zuul-executor
        tasks_from: start
# Mergers: the host pattern excludes anything in the "disabled" group, and
# serial: 1 runs the full task list on one host before moving to the next.
- name: "Reboot zuul-mergers gracefully one at a time"
  hosts: "zuul-merger:!disabled"
  serial: 1
  tasks:
    # Runs the "graceful" task file from the zuul-merger role.
    - name: Gracefully stop the merger
      include_role:
        name: zuul-merger
        tasks_from: graceful

    - name: Upgrade merger server packages
      apt:
        update_cache: true
        # "upgrade" is a string choice of the apt module, not a boolean;
        # quoting keeps YAML from converting it to True.
        upgrade: "yes"

    # No arguments are passed, so the reboot module defaults apply.
    - name: Reboot the merger server
      reboot:

    # Runs the "start" task file from the zuul-merger role.
    - name: Start the merger
      include_role:
        name: zuul-merger
        tasks_from: start
# TODO should we do both schedulers with reboots then do the webs without
# reboots?
# Schedulers: the host pattern excludes anything in the "disabled" group, and
# serial: 1 runs the full task list on one host before moving to the next.
# Each host has its scheduler and web processes stopped, is upgraded and
# rebooted, has both processes started again, and is then polled through the
# Zuul components API until its scheduler, web and fingergw entries report
# "running".
- name: "Reboot zuul-schedulers gracefully one at a time"
  hosts: "zuul-scheduler:!disabled"
  serial: 1
  tasks:
    - name: Stop the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: stop

    - name: Stop the web processes
      include_role:
        name: zuul-web
        tasks_from: stop

    - name: Upgrade scheduler server packages
      apt:
        update_cache: true
        # "upgrade" is a string choice of the apt module, not a boolean;
        # quoting keeps YAML from converting it to True.
        upgrade: "yes"

    # No arguments are passed, so the reboot module defaults apply.
    - name: Reboot the scheduler server
      reboot:

    - name: Start the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: start

    - name: Start the web processes
      include_role:
        name: zuul-web
        tasks_from: start

    # The three waits below share one shape: fetch the components listing,
    # select the "state" of the entries whose hostname equals this host, and
    # retry until exactly one entry matches and its state is "running".
    # The until expressions are written without {{ }} because Ansible
    # already evaluates conditionals as Jinja2 expressions.
    - name: Wait for scheduler to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 second delay)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(scheduler_query) | length == 1
        and components.content | from_json | json_query(scheduler_query) | first == 'running'
      vars:
        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for web to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 second delay)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(web_query) | length == 1
        and components.content | from_json | json_query(web_query) | first == 'running'
      vars:
        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for fingergw to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 45 minutes (180 retries x 15 second delay)
      retries: 180
      delay: 15
      until: >-
        components.status == 200
        and components.content | from_json | json_query(finger_query) | length == 1
        and components.content | from_json | json_query(finger_query) | first == 'running'
      vars:
        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"