---
# playbooks/zuul_reboot.yaml
#
# Rolling restart of Zuul hosts. Every play below uses `serial: 1`, so the
# full task list completes on one host before the next host is started.

# TODO: We need to add a locking/failsafe check mechanism.

# TODO: Stop pulling in the hourly job if we do this.
- name: 'Ensure we are going to restart/reboot on the same image'
  import_playbook: zuul_pull.yaml

# TODO: Do we want to force disabled servers to be rebooted too?
- name: 'Reboot zuul-executors gracefully one at a time'
  # Every host in the zuul-executor group except members of `disabled`.
  hosts: 'zuul-executor:!disabled'
  serial: 1
  tasks:
    # Runs the `graceful` task file of the zuul-executor role.
    - name: Gracefully stop the executor
      include_role:
        name: zuul-executor
        tasks_from: graceful

    # The reboot module is called with its default arguments.
    - name: Reboot the executor server
      reboot:

    # Runs the `start` task file of the zuul-executor role.
    - name: Start the executor
      include_role:
        name: zuul-executor
        tasks_from: start

- name: 'Reboot zuul-mergers gracefully one at a time'
  # Every host in the zuul-merger group except members of `disabled`.
  hosts: 'zuul-merger:!disabled'
  serial: 1
  tasks:
    # Same stop / reboot / start sequence as the executors, using the
    # zuul-merger role's task files.
    - name: Gracefully stop the merger
      include_role:
        name: zuul-merger
        tasks_from: graceful

    - name: Reboot the merger server
      reboot:

    - name: Start the merger
      include_role:
        name: zuul-merger
        tasks_from: start

# TODO: Should we do both schedulers with reboots, then do the webs without
# reboots?
- name: 'Reboot zuul-schedulers gracefully one at a time'
  # Every host in the zuul-scheduler group except members of `disabled`.
  hosts: 'zuul-scheduler:!disabled'
  # One host per batch: the whole task list, including the wait tasks at the
  # end, runs on a host before the next one is started.
  serial: 1
  tasks:
    # The scheduler and web roles are both stopped on this host before the
    # reboot and both started again after it.
    - name: Stop the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: stop

    - name: Stop the web processes
      include_role:
        name: zuul-web
        tasks_from: stop

    # The reboot module is called with its default arguments.
    - name: Reboot the scheduler server
      reboot:

    - name: Start the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: start

    - name: Start the web processes
      include_role:
        name: zuul-web
        tasks_from: start

    # The three wait tasks below poll the same components endpoint. Each one
    # selects the entries of one component type whose hostname equals
    # inventory_hostname, and succeeds once the response is HTTP 200, exactly
    # one entry matches, and that entry's state is 'running'.
    #
    # `until` is already evaluated as a Jinja expression, so the conditions
    # are written bare; wrapping them in template delimiters makes Ansible
    # warn and template the expression twice.
    - name: Wait for scheduler to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 360 retries x 30 s delay = 3 hours
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(scheduler_query) | length == 1
        and components.content | from_json | json_query(scheduler_query) | first == 'running'
      vars:
        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for web to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 360 retries x 30 s delay = 3 hours
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(web_query) | length == 1
        and components.content | from_json | json_query(web_query) | first == 'running'
      vars:
        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for fingergw to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 180 retries x 15 s delay = 45 minutes
      retries: 180
      delay: 15
      until: >-
        components.status == 200
        and components.content | from_json | json_query(finger_query) | length == 1
        and components.content | from_json | json_query(finger_query) | first == 'running'
      vars:
        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"