tripleo-ansible/playbooks/update_cloud.yml
Julia Kreger 68fcbca4c4 ANSUPDATE-126 Backup/Restore /tftpboot
Backup and restore tftpboot as ironic does not recreate files
necessary for overcloud nodes to boot.

Change-Id: Ibdc8b41be480f9344e0ba014bb0017591c603257
2014-10-29 09:11:59 -04:00

473 lines
20 KiB
YAML

# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Pull in step_ping.yml before any of the plays below run.
- include: step_ping.yml

# First shutdown stage: stop and disable the undercloud's services.
# Every task is limited to hosts whose instance_status is "ACTIVE".
- name: Disable Undercloud
  hosts: undercloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): service_facts is presumably what provides
    # existing_services, which the loops below test against - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Move the mysql job file aside under a new name; removes= skips the
    # task once /etc/init/mysql.conf no longer exists.
    - command: mv -f /etc/init/mysql.conf /etc/init/mysql-boot-control.conf removes=/etc/init/mysql.conf
      when: instance_status == "ACTIVE"
    # Service list used when "helion" is defined.
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_undercloud_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    # Service list used when "helion" is not defined.
    - service: name={{ item }} enabled=no state=stopped
      with_items: undercloud_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
# Stop and disable services on the compute nodes, after the tasks from
# stop_vms.yml have run. Only hosts reporting "ACTIVE" are touched.
- name: Disable Overcloud Compute
  hosts: nova-compute
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: stop_vms.yml
      when: instance_status == "ACTIVE"
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops applies, depending on whether
    # "helion" is defined.
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloud_compute_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service: name={{ item }} enabled=no state=stopped
      with_items: overcloud_compute_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
# Stop and disable services on the swift-storage hosts that report
# "ACTIVE".
- name: swift-storage
  hosts: swift-storage
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops applies, depending on whether
    # "helion" is defined.
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloud_swift_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service: name={{ item }} enabled=no state=stopped
      with_items: overcloud_swift_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
# Stop and disable services on the vsa hosts. There is only a helion
# service list for this group, so nothing is stopped when "helion" is
# not defined.
- name: "Stop services on VSA"
  hosts: vsa
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloud_vsa_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
# Stop and disable services on the controller hosts that report
# "ACTIVE", then run the tasks from stop_tgt.yml. MySQL and RabbitMQ are
# handled by a separate, later play.
- name: Disable Overcloud Controller
  hosts: controller
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops applies, depending on whether
    # "helion" is defined.
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloud_controller_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service: name={{ item }} enabled=no state=stopped
      with_items: overcloud_controller_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
    - include: stop_tgt.yml
      when: instance_status == "ACTIVE"
# Stop and disable services on the controllerMgmt host, then run the
# tasks from stop_tgt.yml. Only a helion service list exists for this
# group.
- name: Disable Overcloud Controller Mgmt node
  hosts: controllerMgmt
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloudmgmt_controller_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - include: stop_tgt.yml
      when: instance_status == "ACTIVE"
# Critically, we need to select a single node of the galera cluster to
# be the 'last'. controllerMgmt fits that bill for now. Eventually we
# will have to select one to be the "special" node; we can do that with
# host facts and conditionals. The last node to go down must have the
# Galera bootstrap run on it, or none of them will come up.
# Shut down MySQL and RabbitMQ on the controller hosts, one host at a
# time (serial: 1). All tasks are limited to hosts reporting "ACTIVE".
- name: Stop MySQL/RabbitMQ on controller nodes
  hosts: controller
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  serial: 1
  tasks:
    # NOTE(review): galera_status.yml presumably provides galera_status
    # and wsrep_cluster_size, which the conditions below read - confirm.
    - include: galera_status.yml
      when: instance_status == "ACTIVE"
    # Synced node that is not the only cluster member: safe to stop.
    - name: Stop MySQL under normal circumstances
      service: name=mysql enabled=no state=stopped
      when: instance_status == "ACTIVE" and galera_status == "Synced" and wsrep_cluster_size.stdout != "1"
    # Sole cluster member: only stopped when single_controller is set.
    - name: Stop MySQL if last node in cluster and single_controller flag has been set.
      service: name=mysql enabled=no state=stopped
      when: instance_status == "ACTIVE" and single_controller is defined and galera_status == "Synced" and wsrep_cluster_size.stdout == "1"
    # Abort conditions; both are bypassed when single_controller is set.
    - fail: msg="Galera Replication is out of sync - cannot safely proceed"
      when: single_controller is not defined and instance_status == "ACTIVE" and galera_status == "Out of Sync"
    - fail: msg="Galera Replication - Node appears to be the last node in a cluster - cannot safely proceed unless overridden via single_controller setting - See README.rst"
      when: instance_status == "ACTIVE" and single_controller is not defined and wsrep_cluster_size.stdout == "1"
    # Best-effort RabbitMQ shutdown: errors from either attempt are
    # ignored and the wait_for on port 5672 below is the actual check.
    - service: name=rabbitmq-server state=stopped
      when: instance_status == "ACTIVE"
      ignore_errors: true
    # Uses shell rather than command: "$(hostname)" is a shell command
    # substitution, and the command module does not run its arguments
    # through a shell, so it would reach rabbitmqctl unexpanded.
    - shell: rabbitmqctl -n "rabbit@$(hostname)" stop
      when: instance_status == "ACTIVE"
      ignore_errors: true
    - name: "Waiting for MySQL to stop"
      wait_for: port=3307 state=stopped timeout=60 delay=10
      when: instance_status == "ACTIVE" and helion is defined and single_controller is not defined and galera_status == 'Synced'
    - name: "Waiting for rabbitmq-server to stop"
      wait_for: port=5672 state=stopped timeout=60 delay=10
      when: instance_status == "ACTIVE"
# Shut down MySQL and RabbitMQ on the controllerMgmt host. All tasks are
# limited to hosts reporting "ACTIVE".
- name: Stop MySQL/RabbitMQ on Overcloud Controller Mgmt node
  hosts: controllerMgmt
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): galera_status.yml presumably provides galera_status
    # and wsrep_cluster_size, which the conditions below read - confirm.
    - include: galera_status.yml
      when: instance_status == "ACTIVE"
    # Abort unless this node is Synced and is the only remaining cluster
    # member; both checks are bypassed when single_controller is set.
    - fail: msg="Galera Replication on controller Management is out of sync - cannot safely proceed"
      when: instance_status == "ACTIVE" and single_controller is not defined and galera_status != "Synced"
    - fail: msg="Galera Replication on controller Management - cannot safely proceed as another MySQL cluster node is active."
      when: instance_status == "ACTIVE" and single_controller is not defined and wsrep_cluster_size.stdout != "1"
    - service: name=mysql enabled=no state=stopped
      when: instance_status == "ACTIVE"
    # Best-effort RabbitMQ shutdown: errors from either attempt are
    # ignored and the wait_for on port 5672 below is the actual check.
    - service: name=rabbitmq-server enabled=no state=stopped
      when: instance_status == "ACTIVE"
      ignore_errors: true
    # Uses shell rather than command: "$(hostname)" is a shell command
    # substitution, and the command module does not run its arguments
    # through a shell, so it would reach rabbitmqctl unexpanded.
    - shell: rabbitmqctl -n "rabbit@$(hostname)" stop
      when: instance_status == "ACTIVE"
      ignore_errors: true
    - name: "Waiting for rabbitmq-server to stop"
      wait_for: port=5672 state=stopped timeout=60 delay=10
      when: instance_status == "ACTIVE"
# Run the tasks from disable_os_collect_config.yml on every host that
# reports "ACTIVE". This play has no name and does not set sudo.
- hosts: all
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
      when: instance_status == "ACTIVE"
# Preserve state on the undercloud, rebuild it from
# undercloud_rebuild_image_id, wait for SSH, then refresh its config.
- name: Rebuild and Refresh Undercloud
  hosts: undercloud
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; each runs only while the instance is "ACTIVE".
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: step_undercloud_backup_tftpboot.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance already reports "REBUILD".
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ undercloud_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait on port 22 as a local action. Which pattern is searched for
    # depends on whether wait_for_hostkey is defined; a failed wait is
    # ignored either way.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - include: refresh_config.yml
# Bring the undercloud back up: write configuration, restore tftpboot,
# start MySQL and RabbitMQ, then re-enable its services.
- name: Enable Undercloud
  hosts: undercloud
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
    # NOTE(review): presumably the source of existing_services used by
    # the loops at the end of this play - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    - include: stop_mysql.yml
    - include: step_reset_mnt_state_permissions.yml
    # Directly call os-apply-config to write out configuration files.
    - include: step_os-apply-config.yml
    - include: step_undercloud_restore_tftpboot.yml
    - include: start_mysql.yml
    - include: start_rabbitmq.yml
    # Fix Ironic Reservations due to bug:
    # https://bugs.launchpad.net/ironic/+bug/1382698
    - include: step_undercloud_ironic_release_reservations.yml
    - name: "Run os-collect-config"
      command: os-collect-config --force --one
    - service: name=os-collect-config state=started
    # Exactly one of the two loops applies, depending on whether
    # "helion" is defined.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_undercloud_services
      when: helion is defined and item in existing_services
    - service: name={{ item }} enabled=yes state=started
      with_items: undercloud_services
      when: helion is not defined and item in existing_services
# Preserve state on controllerMgmt, rebuild it from
# controllermgmt_rebuild_image_id, wait for SSH, then pause 30 seconds.
- name: Rebuild and Refresh ControllerMgmt
  hosts: controllerMgmt
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; each runs only while the instance is "ACTIVE".
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: cleanup_cinder_volumes.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance already reports "REBUILD".
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ controllermgmt_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait on port 22 as a local action. Which pattern is searched for
    # depends on whether wait_for_hostkey is defined; a failed wait is
    # ignored either way.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - pause: seconds=30 prompt="Allowing controllerMgmt node to settle"
# Bring up controllerMgmt as the first cluster node: refresh its
# configuration, bootstrap MySQL, create databases and start RabbitMQ.
- name: Start initial cluster node
  hosts: controllerMgmt
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: mysql_init_fix.yml
    - include: stop_mysql.yml
    - include: rabbitmq_occ_disable.yml
    - include: refresh_config.yml
    - name: Stop os-collect-config to avoid collission
      service: name=os-collect-config state=stopped
    - name: "Work around apache2 starting up at boot w/o config..."
      service: name=apache2 enabled=no state=stopped
    - name: "Remove os-collect-config disable sentinel file"
      file: path=/mnt/state/disable-os-collect-config state=absent
    - name: "Run os-collect-config"
      command: os-collect-config --force --one
    - include: step_reset_mnt_state_permissions.yml
    # Directly call os-apply-config to write out configuration files.
    - include: step_os-apply-config.yml
    # The bootstrap is skipped when single_controller is defined.
    - name: Bootstrap the MySQL cluster
      command: /etc/init.d/mysql bootstrap-pxc
      when: single_controller is not defined
    - include: start_mysql.yml
    - include: step_create_databases.yml
    - include: start_rabbitmq.yml
    - name: "Run os-collect-config"
      command: os-collect-config --force --one
    - name: Wait for Rabbit to listen on its usual port
      wait_for: port=5672 state=started timeout=90 delay=10
    - name: Restart os-collect-config
      service: name=os-collect-config state=started
# Preserve state on the controller hosts, rebuild them from
# controller_rebuild_image_id, wait for SSH, then pause 30 seconds.
- name: Rebuild and Refresh Controller
  hosts: controller
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; each runs only while the instance is "ACTIVE".
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: cleanup_cinder_volumes.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance already reports "REBUILD".
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ controller_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait on port 22 as a local action. Which pattern is searched for
    # depends on whether wait_for_hostkey is defined; a failed wait is
    # ignored either way.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - pause: seconds=30 prompt="Allowing controller node to settle."
# Prepare the rebuilt controller hosts: refresh configuration, run
# os-collect-config once, write config files and start MySQL.
- name: Stop and setup for controller refresh
  hosts: controller
  sudo: true
  max_fail_percentage: 0
  tasks:
    # MySQL is stopped before it is started again so that, if it came up
    # at boot, it will hopefully pick up the new configuration that
    # os-collect-config and os-apply-config put in place.
    - include: mysql_init_fix.yml
    - include: stop_mysql.yml
    - include: rabbitmq_occ_disable.yml
    - include: step_reset_mnt_state_permissions.yml
    - include: refresh_config.yml
    - name: Stop os-collect-config to avoid collission
      service: name=os-collect-config state=stopped
    - name: "Work around apache2 starting up at boot w/o config..."
      service: name=apache2 enabled=no state=stopped
    - name: "Remove os-collect-config disable sentinel file"
      file: path=/mnt/state/disable-os-collect-config state=absent
    - name: "Run os-collect-config"
      command: os-collect-config --force --one
    # Directly call os-apply-config to write out configuration files in
    # case os-collect-config has failed to reach that step.
    - include: step_os-apply-config.yml
    - include: start_mysql.yml
# Run the tasks from step_create_databases.yml on the controller hosts,
# one host at a time (serial: 1).
- name: Initiate Database Creation
  hosts: controller
  sudo: true
  serial: 1
  max_fail_percentage: 0
  tasks:
    - include: step_create_databases.yml
# Finish the controller refresh: start RabbitMQ, run os-collect-config
# once more, start its service, and wait for port 5672 to open.
- name: Complete Controller Refresh
  hosts: controller
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: start_rabbitmq.yml
    - name: Re-run os-collect-config in case first one failed due to a race condition
      command: os-collect-config --noforce --one
    - name: "Restart os-collect-config"
      service: name=os-collect-config state=started
    - name: Wait for Rabbit to listen on its usual port
      wait_for: port=5672 state=started timeout=120 delay=10
# After a 30 second pause, query the RabbitMQ cluster status on the
# controller and controllerMgmt hosts, then run the tasks from
# cleanup_rabbitmq_start.yml.
- name: Check RabbitMQ
  hosts: controller:controllerMgmt
  max_fail_percentage: 0
  tasks:
    - pause: seconds=30 prompt="Giving RabbitMQ time to start-up."
    # sudo is set on this task only, not on the play. The check is
    # skipped when single_controller is defined.
    - name: Checking rabbitmq cluster status
      sudo: true
      command: rabbitmqctl cluster_status
      when: single_controller is not defined
    - include: cleanup_rabbitmq_start.yml
# Start os-collect-config on controllerMgmt and re-enable its services.
# Only a helion service list exists for this group.
- name: Enable Overcloud ControllerMgmt
  hosts: controllerMgmt
  sudo: true
  max_fail_percentage: 0
  tasks:
    - service: name=os-collect-config state=started
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Uses the same list name as the "Disable Overcloud Controller Mgmt
    # node" play (helion_overcloudmgmt_controller_services, plural), so
    # the services that play stopped are the ones started again here.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloudmgmt_controller_services
      when: helion is defined and item in existing_services
# Start os-collect-config on the controller hosts and re-enable their
# services.
- name: Enable Overcloud Controller
  hosts: controller
  sudo: true
  max_fail_percentage: 0
  tasks:
    - service: name=os-collect-config state=started
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops applies, depending on whether
    # "helion" is defined.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloud_controller_services
      when: helion is defined and item in existing_services
    - service: name={{ item }} enabled=yes state=started
      with_items: overcloud_controller_services
      when: helion is not defined and item in existing_services
# Preserve state on the swift-storage hosts, rebuild them from
# swift_storage_rebuild_image_id, wait for SSH, then refresh config.
- name: Rebuild and Refresh swift-storage
  hosts: swift-storage
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; each runs only while the instance is "ACTIVE".
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance already reports "REBUILD".
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ swift_storage_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait on port 22 as a local action. Which pattern is searched for
    # depends on whether wait_for_hostkey is defined; a failed wait is
    # ignored either way.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - include: refresh_config.yml
# Run os-collect-config once on the swift-storage hosts, start its
# service, and re-enable the swift services.
- name: Enable Swift Storage
  hosts: swift-storage
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
    # sudo is repeated on this task although the play already sets it.
    - name: "Run os-collect-config"
      sudo: true
      command: os-collect-config --force --one
    - service: name=os-collect-config state=started
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops applies, depending on whether
    # "helion" is defined.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloud_swift_services
      when: helion is defined and item in existing_services
    - service: name={{ item }} enabled=yes state=started
      with_items: overcloud_swift_services
      when: helion is not defined and item in existing_services
# Preserve state on the vsa hosts, rebuild them from
# vsa_rebuild_image_id, wait for SSH, then refresh config.
- name: Rebuild and Refresh vsa
  hosts: vsa
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; each runs only while the instance is "ACTIVE".
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance already reports "REBUILD".
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ vsa_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait on port 22 as a local action. Which pattern is searched for
    # depends on whether wait_for_hostkey is defined; a failed wait is
    # ignored either way.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - include: refresh_config.yml
# Run os-collect-config once on the vsa hosts, start its service, and
# re-enable the vsa services. Only a helion service list exists for
# this group.
- name: Enable VSA
  hosts: vsa
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
    # sudo is repeated on this task although the play already sets it.
    - name: "Run os-collect-config"
      sudo: true
      command: os-collect-config --force --one
    - service: name=os-collect-config state=started
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloud_vsa_services
      when: helion is defined and item in existing_services
# Preserve state on the compute hosts, rebuild them from
# nova_compute_rebuild_image_id, wait for SSH, then refresh config.
- name: Rebuild and Refresh Nova Compute
  hosts: nova-compute
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; each runs only while the instance is "ACTIVE".
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance already reports "REBUILD".
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait on port 22 as a local action. Which pattern is searched for
    # depends on whether wait_for_hostkey is defined; a failed wait is
    # ignored either way.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - include: refresh_config.yml
# Run os-collect-config once on the compute hosts, start its service,
# and re-enable the compute services.
- name: Enable Overcloud Compute
  hosts: nova-compute
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
    # sudo is repeated on this task although the play already sets it.
    - name: "Run os-collect-config"
      sudo: true
      command: os-collect-config --force --one
    - service: name=os-collect-config state=started
    # NOTE(review): presumably the source of existing_services - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops applies, depending on whether
    # "helion" is defined.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloud_compute_services
      when: helion is defined and item in existing_services
    - service: name={{ item }} enabled=yes state=started
      with_items: overcloud_compute_services
      when: helion is not defined and item in existing_services