system-config/playbooks/zuul_reboot.yaml
Clark Boylan 4cbc5ee254 Perform package upgrades prior to zuul cluster node reboots
This serves two purposes. The first is that not all packages are updated
by unattended-upgrades because it may not be safe to upgrade packages
while services are running. We should be safe in this situation because
we've gracefully stopped services and can proceed with package updates.
The other is that unattended-upgrades runs daily, which means we could end up
almost 24 hours out of date prior to rebooting. This ensures we have the
latest and greatest packages installed just prior to rebooting.

Change-Id: Id351b5478e925ed1b4fbb6b3e27f2c0b6af8b897
2022-05-26 14:04:24 -07:00

110 lines
3.5 KiB
YAML

# TODO: We need to add a locking/failsafe check mechanism.
# TODO: Stop pulling in the hourly job if we do this.
# Run zuul_pull.yaml before any of the reboot plays so that, as the play name
# puts it, every restart/reboot happens on the same image.
- name: 'Ensure we are going to restart/reboot on the same image'
  import_playbook: zuul_pull.yaml
# TODO Do we want to force disabled servers to be rebooted too?
# Rolling reboot of the executors: every host matched by the zuul-executor
# pattern (excluding hosts in the "disabled" group) is gracefully stopped,
# upgraded, rebooted and started again before the next host is processed.
- name: "Reboot zuul-executors gracefully one at a time"
  hosts: "zuul-executor:!disabled"
  # One host per batch.
  serial: 1
  tasks:
    - name: Gracefully stop the executor
      include_role:
        name: zuul-executor
        tasks_from: graceful

    # Packages are upgraded after the graceful stop and before the reboot.
    - name: Upgrade executor server packages
      apt:
        update_cache: true
        # "yes" is one of the apt module's string choices for `upgrade`;
        # quoted so YAML does not turn it into a boolean.
        upgrade: "yes"

    - name: Reboot the executor server
      reboot: {}

    - name: Start the executor
      include_role:
        name: zuul-executor
        tasks_from: start
# Rolling reboot of the mergers: every host matched by the zuul-merger
# pattern (excluding hosts in the "disabled" group) is gracefully stopped,
# upgraded, rebooted and started again before the next host is processed.
- name: "Reboot zuul-mergers gracefully one at a time"
  hosts: "zuul-merger:!disabled"
  # One host per batch.
  serial: 1
  tasks:
    - name: Gracefully stop the merger
      include_role:
        name: zuul-merger
        tasks_from: graceful

    # Packages are upgraded after the graceful stop and before the reboot.
    - name: Upgrade merger server packages
      apt:
        update_cache: true
        # "yes" is one of the apt module's string choices for `upgrade`;
        # quoted so YAML does not turn it into a boolean.
        upgrade: "yes"

    - name: Reboot the merger server
      reboot: {}

    - name: Start the merger
      include_role:
        name: zuul-merger
        tasks_from: start
# TODO should we do both schedulers with reboots then do the webs without
# reboots?
# Rolling reboot of the schedulers: on each host matched by the
# zuul-scheduler pattern (excluding hosts in the "disabled" group) the
# scheduler and web processes are stopped, packages are upgraded, the server
# is rebooted, the processes are started again, and the play then waits for
# the components API to report this host's scheduler, web and fingergw
# entries as running before moving on to the next host.
- name: "Reboot zuul-schedulers gracefully one at a time"
  hosts: "zuul-scheduler:!disabled"
  # One host per batch.
  serial: 1
  tasks:
    - name: Stop the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: stop

    - name: Stop the web processes
      include_role:
        name: zuul-web
        tasks_from: stop

    # Packages are upgraded after the stops and before the reboot.
    - name: Upgrade scheduler server packages
      apt:
        update_cache: true
        # "yes" is one of the apt module's string choices for `upgrade`;
        # quoted so YAML does not turn it into a boolean.
        upgrade: "yes"

    - name: Reboot the scheduler server
      reboot: {}

    - name: Start the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: start

    - name: Start the web processes
      include_role:
        name: zuul-web
        tasks_from: start

    # Each wait below polls the components endpoint and succeeds once the
    # request returns 200, exactly one entry of the given component type
    # matches this host's inventory_hostname, and that entry's state is
    # 'running'. The `until` values are bare expressions: Ansible already
    # evaluates them as Jinja conditionals, so no "{{ }}" delimiters are used.
    - name: Wait for scheduler to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 seconds)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(scheduler_query) | length == 1
        and components.content | from_json | json_query(scheduler_query) | first == 'running'
      vars:
        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for web to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 seconds)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(web_query) | length == 1
        and components.content | from_json | json_query(web_query) | first == 'running'
      vars:
        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for fingergw to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 45 minutes (180 retries x 15 seconds)
      retries: 180
      delay: 15
      until: >-
        components.status == 200
        and components.content | from_json | json_query(finger_query) | length == 1
        and components.content | from_json | json_query(finger_query) | first == 'running'
      vars:
        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"