From 88425158a122b4185f3837ade9583b8d6a8210d4 Mon Sep 17 00:00:00 2001
From: Clark Boylan
Date: Wed, 25 May 2022 09:28:26 -0700
Subject: [PATCH] Add playbook to gracefully stop and reboot the zuul cluster

This should simplify the process of applying patches to the cluster.

Change-Id: I28756e32c2f42186e11d78e4ca461e808026f632
---
 playbooks/zuul_reboot.yaml | 112 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 playbooks/zuul_reboot.yaml

diff --git a/playbooks/zuul_reboot.yaml b/playbooks/zuul_reboot.yaml
new file mode 100644
index 0000000000..1fd763c9a2
--- /dev/null
+++ b/playbooks/zuul_reboot.yaml
@@ -0,0 +1,112 @@
+# Gracefully stop and reboot the enabled servers of the Zuul cluster:
+# executors first, then mergers, then schedulers, one host at a time.
+#
+# TODO We need to add a locking/failsafe check mechanism
+
+# TODO: stop pulling in the hourly job if we do this
+- name: "Ensure we are going to restart/reboot on the same image"
+  import_playbook: zuul_pull.yaml
+
+# TODO Do we want to force disabled servers to be rebooted too?
+- hosts: "zuul-executor:!disabled"
+  name: "Reboot zuul-executors gracefully one at a time"
+  serial: 1
+  tasks:
+    - name: Gracefully stop the executor
+      include_role:
+        name: zuul-executor
+        tasks_from: graceful
+    - name: Reboot the executor server
+      reboot:
+    - name: Start the executor
+      include_role:
+        name: zuul-executor
+        tasks_from: start
+
+- hosts: "zuul-merger:!disabled"
+  name: "Reboot zuul-mergers gracefully one at a time"
+  serial: 1
+  tasks:
+    - name: Gracefully stop the merger
+      include_role:
+        name: zuul-merger
+        tasks_from: graceful
+    - name: Reboot the merger server
+      reboot:
+    - name: Start the merger
+      include_role:
+        name: zuul-merger
+        tasks_from: start
+
+# TODO should we do both schedulers with reboots then do the webs without
+# reboots?
+- hosts: "zuul-scheduler:!disabled"
+  name: "Reboot zuul-schedulers gracefully one at a time"
+  serial: 1
+  tasks:
+    - name: Stop the scheduler process
+      include_role:
+        name: zuul-scheduler
+        tasks_from: stop
+    - name: Stop the web processes
+      include_role:
+        name: zuul-web
+        tasks_from: stop
+    - name: Reboot the scheduler server
+      reboot:
+    - name: Start the scheduler process
+      include_role:
+        name: zuul-scheduler
+        tasks_from: start
+    - name: Start the web processes
+      include_role:
+        name: zuul-web
+        tasks_from: start
+    # Each wait below polls the components API until exactly one entry for
+    # this host is listed and its state is 'running'. The conditions are bare
+    # expressions (no {{ }}), as Ansible recommends for conditionals.
+    - name: Wait for scheduler to be running
+      uri:
+        url: https://zuul.opendev.org/api/components
+        method: GET
+        return_content: true
+      register: components
+      # Roughly 3 hours: 360 retries x 30 seconds
+      retries: 360
+      delay: 30
+      until: >-
+        components.status == 200
+        and components.content | from_json | json_query(scheduler_query) | length == 1
+        and components.content | from_json | json_query(scheduler_query) | first == 'running'
+      vars:
+        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"
+    - name: Wait for web to be running
+      uri:
+        url: https://zuul.opendev.org/api/components
+        method: GET
+        return_content: true
+      register: components
+      # Roughly 3 hours: 360 retries x 30 seconds
+      retries: 360
+      delay: 30
+      until: >-
+        components.status == 200
+        and components.content | from_json | json_query(web_query) | length == 1
+        and components.content | from_json | json_query(web_query) | first == 'running'
+      vars:
+        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"
+    - name: Wait for fingergw to be running
+      uri:
+        url: https://zuul.opendev.org/api/components
+        method: GET
+        return_content: true
+      register: components
+      # Roughly 45 minutes: 180 retries x 15 seconds
+      retries: 180
+      delay: 15
+      until: >-
+        components.status == 200
+        and components.content | from_json | json_query(finger_query) | length == 1
+        and components.content | from_json | json_query(finger_query) | first == 'running'
+      vars:
+        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"