tripleo-ansible/playbooks/update_cloud.yml
Julia Kreger e3f150bf54 ANSUPDATE-129 Patch SSH config for timeouts
Adding execution of a script to add configuration to the local system
SSH client configuration file that engages a heartbeat mechanism that
allows SSH to know that the server is still alive instead of relying
on inactivity to declare the SSH connection as failed.  These SSH
timeout issues present themselves as Broken Pipe errors.

Long running scripts, such as the database creation/upgrade scripts
can cause the SSH connection to go idle for a period of up to several
minutes with no output.

Change-Id: I1dc1b4f51cf20c1b450fb5879c0a46774384ac92
2014-11-18 08:45:49 -05:00

479 lines
20 KiB
YAML

# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Entry point of the update: pull in step_ping.yml first, then prepare the
# machine that runs Ansible itself.
# NOTE(review): step_ping.yml is presumably a reachability check - confirm.
- include: step_ping.yml

# Runs only against localhost, without fact gathering; any failure aborts
# the run (max_fail_percentage: 0).
- hosts: localhost
  name: "Setup local environment for upgrade processes to run"
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably patches the local SSH client configuration
    # (keep-alive settings) - confirm in update_local_ssh_config.yml.
    - include: update_local_ssh_config.yml
# Quiesce the undercloud node. Every task is restricted to instances whose
# inventory status is ACTIVE.
- hosts: undercloud
  name: Disable Undercloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably populates `existing_services`, which the
    # service loops below test against - confirm in the service_facts module.
    - service_facts:
      when: instance_status == "ACTIVE"

    # Rename the MySQL init job file. `removes=` makes the command a no-op
    # once /etc/init/mysql.conf no longer exists, so re-runs are safe.
    - command: mv -f /etc/init/mysql.conf /etc/init/mysql-boot-control.conf removes=/etc/init/mysql.conf
      when: instance_status == "ACTIVE"

    # Helion deployments: stop and disable each listed service that is
    # actually present on the node.
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_undercloud_services
      when: >-
        helion is defined and instance_status == "ACTIVE"
        and item in existing_services

    # Non-Helion deployments: same operation with the default service list.
    - service: name={{ item }} enabled=no state=stopped
      with_items: undercloud_services
      when: >-
        helion is not defined and instance_status == "ACTIVE"
        and item in existing_services
# Quiesce the overcloud compute nodes. All tasks apply only to ACTIVE
# instances.
- hosts: nova-compute
  name: Disable Overcloud Compute
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): stop_vms.yml presumably shuts down guest VMs before the
    # compute services go away - confirm in that file.
    - include: stop_vms.yml
      when: instance_status == "ACTIVE"

    # NOTE(review): presumably populates `existing_services` - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    # Helion deployments: stop and disable the compute services that exist.
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloud_compute_services
      when: >-
        helion is defined and instance_status == "ACTIVE"
        and item in existing_services

    # Non-Helion deployments: same operation with the default service list.
    - service: name={{ item }} enabled=no state=stopped
      with_items: overcloud_compute_services
      when: >-
        helion is not defined and instance_status == "ACTIVE"
        and item in existing_services
# Stop and disable services on the swift-storage nodes (ACTIVE instances
# only).
- hosts: swift-storage
  name: swift-storage
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably populates `existing_services` - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    # Helion deployments.
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloud_swift_services
      when: >-
        helion is defined and instance_status == "ACTIVE"
        and item in existing_services

    # Non-Helion deployments.
    - service: name={{ item }} enabled=no state=stopped
      with_items: overcloud_swift_services
      when: >-
        helion is not defined and instance_status == "ACTIVE"
        and item in existing_services
# Stop and disable services on the VSA nodes. Only a Helion service list is
# handled here; nothing runs when `helion` is undefined.
- hosts: vsa
  name: "Stop services on VSA"
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably populates `existing_services` - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloud_vsa_services
      when: >-
        helion is defined and instance_status == "ACTIVE"
        and item in existing_services
# Stop and disable the controller services (ACTIVE instances only). MySQL
# and RabbitMQ are handled by a separate play.
- hosts: controller
  name: Disable Overcloud Controller
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably populates `existing_services` - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    # Helion deployments.
    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloud_controller_services
      when: >-
        helion is defined and instance_status == "ACTIVE"
        and item in existing_services

    # Non-Helion deployments.
    - service: name={{ item }} enabled=no state=stopped
      with_items: overcloud_controller_services
      when: >-
        helion is not defined and instance_status == "ACTIVE"
        and item in existing_services

    # NOTE(review): stop_tgt.yml presumably stops the iSCSI target daemon -
    # confirm in that file.
    - include: stop_tgt.yml
      when: instance_status == "ACTIVE"
# Stop and disable services on the controller management node (ACTIVE
# instances only). Only a Helion service list is handled here.
- hosts: controllerMgmt
  name: Disable Overcloud Controller Mgmt node
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): presumably populates `existing_services` - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    - service: name={{ item }} enabled=no state=stopped
      with_items: helion_overcloudmgmt_controller_services
      when: >-
        helion is defined and instance_status == "ACTIVE"
        and item in existing_services

    # NOTE(review): stop_tgt.yml presumably stops the iSCSI target daemon -
    # confirm in that file.
    - include: stop_tgt.yml
      when: instance_status == "ACTIVE"
# Critically, we need to select a single node of the Galera cluster to
# be the 'last' one shut down. controllerMgmt fits that bill for now.
# Eventually we will have to select one node to be the "special" node;
# we can do that with host facts and conditionals. The last node to go
# down must have the Galera bootstrap run on it, or none of them will
# come up.
# Take MySQL (Galera) and RabbitMQ down on the regular controller nodes.
# `serial: 1` processes one controller at a time, so each node sees the
# cluster size left behind by the previous one.
- hosts: controller
  name: Stop MySQL/RabbitMQ on controller nodes
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  serial: 1
  tasks:
    # NOTE(review): galera_status.yml presumably sets `galera_status` and
    # registers `wsrep_cluster_size`, both used below - confirm in that file.
    - include: galera_status.yml
      when: instance_status == "ACTIVE"

    # Normal case: node is synced and is not the last cluster member.
    - name: Stop MySQL under normal circumstances
      service: name=mysql enabled=no state=stopped
      when: instance_status == "ACTIVE" and galera_status == "Synced" and wsrep_cluster_size.stdout != "1"

    # Last member of the cluster: only stopped when the operator has opted
    # in by defining `single_controller`.
    - name: Stop MySQL if last node in cluster and single_controller flag has been set.
      service: name=mysql enabled=no state=stopped
      when: instance_status == "ACTIVE" and single_controller is defined and galera_status == "Synced" and wsrep_cluster_size.stdout == "1"

    # Safety stops: abort rather than continue with an unsafe cluster state.
    - fail: msg="Galera Replication is out of sync - cannot safely proceed"
      when: single_controller is not defined and instance_status == "ACTIVE" and galera_status == "Out of Sync"
    - fail: msg="Galera Replication - Node appears to be the last node in a cluster - cannot safely proceed unless overriden via single_controller setting - See README.rst"
      when: instance_status == "ACTIVE" and single_controller is not defined and wsrep_cluster_size.stdout == "1"

    # Best-effort RabbitMQ shutdown: first through the service manager, then
    # directly through rabbitmqctl. Errors from either attempt are ignored;
    # the wait_for on port 5672 below is the actual verification.
    - service: name=rabbitmq-server state=stopped
      when: instance_status == "ACTIVE"
      ignore_errors: true

    # `shell` (not `command`) is required here: the command module does not
    # pass its arguments through a shell, so "$(hostname)" would reach
    # rabbitmqctl as a literal string instead of the node's hostname.
    - shell: rabbitmqctl -n "rabbit@$(hostname)" stop
      when: instance_status == "ACTIVE"
      ignore_errors: true

    - name: "Waiting for MySQL to stop"
      wait_for: port=3307 state=stopped timeout=60 delay=10
      when: instance_status == "ACTIVE" and helion is defined and single_controller is not defined and galera_status == 'Synced'

    - name: "Waiting for rabbitmq-server to stop"
      wait_for: port=5672 state=stopped timeout=60 delay=10
      when: instance_status == "ACTIVE"
# Take MySQL (Galera) and RabbitMQ down on the controller management node.
# This play runs after the regular controllers have been stopped, so this
# node is expected to be the only remaining cluster member.
- hosts: controllerMgmt
  name: Stop MySQL/RabbitMQ on Overcloud Controller Mgmt node
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # NOTE(review): galera_status.yml presumably sets `galera_status` and
    # registers `wsrep_cluster_size`, both used below - confirm in that file.
    - include: galera_status.yml
      when: instance_status == "ACTIVE"

    # Safety stops (skipped when `single_controller` is defined): the node
    # must be synced and must be the only member left in the cluster.
    - fail: msg="Galera Replication on controller Management is out of sync - cannot safely proceed"
      when: instance_status == "ACTIVE" and single_controller is not defined and galera_status != "Synced"
    - fail: msg="Galera Replication on controller Management - cannot safely proceed as another MySQL cluster node is active."
      when: instance_status == "ACTIVE" and single_controller is not defined and wsrep_cluster_size.stdout != "1"

    - service: name=mysql enabled=no state=stopped
      when: instance_status == "ACTIVE"

    # Best-effort RabbitMQ shutdown; errors are ignored and the wait_for on
    # port 5672 below is the actual verification.
    - service: name=rabbitmq-server enabled=no state=stopped
      when: instance_status == "ACTIVE"
      ignore_errors: true

    # `shell` (not `command`) is required here: the command module does not
    # pass its arguments through a shell, so "$(hostname)" would reach
    # rabbitmqctl as a literal string instead of the node's hostname.
    - shell: rabbitmqctl -n "rabbit@$(hostname)" stop
      when: instance_status == "ACTIVE"
      ignore_errors: true

    - name: "Waiting for rabbitmq-server to stop"
      wait_for: port=5672 state=stopped timeout=60 delay=10
      when: instance_status == "ACTIVE"
# Apply disable_os_collect_config.yml to every ACTIVE host in the inventory
# before any node is rebuilt.
- hosts: all
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
      when: instance_status == "ACTIVE"
# Preserve state on the undercloud node, rebuild it from the new image, and
# wait for SSH to come back before refreshing its configuration.
- hosts: undercloud
  name: Rebuild and Refresh Undercloud
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild preservation steps; only meaningful while the instance is
    # still ACTIVE.
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: step_undercloud_backup_tftpboot.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"

    # Trigger the rebuild unless the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ undercloud_rebuild_image_id }}"
      when: instance_status != "REBUILD"

    # Wait, from the control host, for the node's SSH port. Which banner text
    # is matched depends on `wait_for_hostkey`; a timeout is not fatal.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true

    - include: refresh_config.yml
# Bring the rebuilt undercloud back into service: restore configuration and
# data, start MySQL and RabbitMQ, then re-enable the remaining services.
- hosts: undercloud
  name: Enable Undercloud
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml

    # NOTE(review): presumably populates `existing_services`. The service
    # loops at the end of this play are not gated on instance_status, so
    # they depend on this task having run - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    - include: stop_mysql.yml
    - include: step_reset_mnt_state_permissions.yml
    # Directly call os-apply-config to write out configuration files.
    - include: step_os-apply-config.yml
    - include: step_undercloud_restore_tftpboot.yml
    - include: start_mysql.yml
    - include: start_rabbitmq.yml
    # Fix Ironic Reservations due to bug:
    # https://bugs.launchpad.net/ironic/+bug/1382698
    - include: step_undercloud_ironic_release_reservations.yml

    - name: "Run os-collect-config"
      command: os-collect-config --force --one
    - service: name=os-collect-config state=started

    # Helion deployments: enable and start each listed service that exists.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_undercloud_services
      when: helion is defined and item in existing_services

    # Non-Helion deployments: same operation with the default service list.
    - service: name={{ item }} enabled=yes state=started
      with_items: undercloud_services
      when: helion is not defined and item in existing_services
# Preserve state on the controller management node, rebuild it from the new
# image, and wait for it to come back.
- hosts: controllerMgmt
  name: Rebuild and Refresh ControllerMgmt
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; only run while the instance is still ACTIVE.
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: cleanup_cinder_volumes.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"

    # Trigger the rebuild unless the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ controllermgmt_rebuild_image_id }}"
      when: instance_status != "REBUILD"

    # Wait, from the control host, for the node's SSH port. Which banner text
    # is matched depends on `wait_for_hostkey`; a timeout is not fatal.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true

    # Fixed 30 second pause before the next play touches the node.
    - pause: seconds=30 prompt="Allowing controllerMgmt node to settle"
# Bring up the first database/messaging node on controllerMgmt: refresh
# configuration, bootstrap the MySQL cluster, create databases, and start
# RabbitMQ. Task order is significant and is kept exactly as written.
- hosts: controllerMgmt
  name: Start initial cluster node
  max_fail_percentage: 0
  sudo: true
  tasks:
    - include: mysql_init_fix.yml
    - include: stop_mysql.yml
    - include: rabbitmq_occ_disable.yml
    - include: refresh_config.yml

    - name: Stop os-collect-config to avoid collission
      service: name=os-collect-config state=stopped

    - name: "Work around apache2 starting up at boot w/o config..."
      service: name=apache2 enabled=no state=stopped

    - name: "Remove os-collect-config disable sentinel file"
      file: path=/mnt/state/disable-os-collect-config state=absent

    - name: "Run os-collect-config"
      command: os-collect-config --force --one

    - include: step_reset_mnt_state_permissions.yml
    # Directly call os-apply-config to write out configuration files.
    - include: step_os-apply-config.yml

    # The bootstrap is skipped when `single_controller` is defined.
    - name: Bootstrap the MySQL cluster
      command: /etc/init.d/mysql bootstrap-pxc
      when: single_controller is not defined

    - include: start_mysql.yml
    - include: step_create_databases.yml
    - include: start_rabbitmq.yml

    - name: "Run os-collect-config"
      command: os-collect-config --force --one

    - name: Wait for Rabbit to listen on its usual port
      wait_for: port=5672 state=started timeout=90 delay=10

    - name: Restart os-collect-config
      service: name=os-collect-config state=started
# Preserve state on the regular controller nodes, rebuild them from the new
# image, and wait for them to come back.
- hosts: controller
  name: Rebuild and Refresh Controller
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; only run while the instance is still ACTIVE.
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: cleanup_cinder_volumes.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"

    # Trigger the rebuild unless the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ controller_rebuild_image_id }}"
      when: instance_status != "REBUILD"

    # Wait, from the control host, for the node's SSH port. Which banner text
    # is matched depends on `wait_for_hostkey`; a timeout is not fatal.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true

    # Fixed 30 second pause before the next play touches the nodes.
    - pause: seconds=30 prompt="Allowing controller node to settle."
# Prepare the rebuilt controller nodes: refresh configuration and start
# MySQL. Task order is significant and is kept exactly as written.
- hosts: controller
  name: Stop and setup for controller refresh
  max_fail_percentage: 0
  sudo: true
  tasks:
    # MySQL is stopped before being started again so that, if it was
    # started at boot, it picks up the new configuration that
    # os-collect-config and os-apply-config put in place.
    - include: mysql_init_fix.yml
    - include: stop_mysql.yml
    - include: rabbitmq_occ_disable.yml
    - include: step_reset_mnt_state_permissions.yml
    - include: refresh_config.yml

    - name: Stop os-collect-config to avoid collission
      service: name=os-collect-config state=stopped

    - name: "Work around apache2 starting up at boot w/o config..."
      service: name=apache2 enabled=no state=stopped

    - name: "Remove os-collect-config disable sentinel file"
      file: path=/mnt/state/disable-os-collect-config state=absent

    - name: "Run os-collect-config"
      command: os-collect-config --force --one

    # Directly call os-apply-config to write out configuration files in case
    # os-collect-config has failed to reach that step.
    - include: step_os-apply-config.yml
    - include: start_mysql.yml
# Run step_create_databases.yml on the controllers, one host at a time
# (serial: 1).
- hosts: controller
  name: Initiate Database Creation
  max_fail_percentage: 0
  serial: 1
  sudo: true
  tasks:
    - include: step_create_databases.yml
# Finish bringing the controllers back: start RabbitMQ, give
# os-collect-config a second run, and confirm the AMQP port is listening.
- hosts: controller
  name: Complete Controller Refresh
  max_fail_percentage: 0
  sudo: true
  tasks:
    - include: start_rabbitmq.yml

    - name: Re-run os-collect-config in case first one failed due to a race condition
      command: os-collect-config --noforce --one

    - name: "Restart os-collect-config"
      service: name=os-collect-config state=started

    - name: Wait for Rabbit to listen on its usual port
      wait_for: port=5672 state=started timeout=120 delay=10
# Check RabbitMQ across both controller groups after a fixed grace period.
- hosts: controller:controllerMgmt
  name: Check RabbitMQ
  max_fail_percentage: 0
  tasks:
    - pause: seconds=30 prompt="Giving RabbitMQ time to start-up."

    # The play itself does not escalate privileges, so sudo is set on this
    # task. The check is skipped when `single_controller` is defined.
    - name: Checking rabbitmq cluster status
      sudo: true
      command: rabbitmqctl cluster_status
      when: single_controller is not defined

    - include: cleanup_rabbitmq_start.yml
# Re-enable and start the services on the controller management node that
# the "Disable Overcloud Controller Mgmt node" play stopped.
- hosts: controllerMgmt
  name: Enable Overcloud ControllerMgmt
  sudo: true
  max_fail_percentage: 0
  tasks:
    - service: name=os-collect-config state=started

    # NOTE(review): presumably populates `existing_services`, which the loop
    # below tests against - confirm in the service_facts module.
    - service_facts:
      when: instance_status == "ACTIVE"

    # The list variable must be the same one the disable play iterates over
    # (helion_overcloudmgmt_controller_services, plural). The previous
    # singular spelling referenced a different, non-matching name, so the
    # stopped services were never re-enabled.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloudmgmt_controller_services
      when: helion is defined and item in existing_services
# Re-enable and start the services on the regular controller nodes.
- hosts: controller
  name: Enable Overcloud Controller
  sudo: true
  max_fail_percentage: 0
  tasks:
    - service: name=os-collect-config state=started

    # NOTE(review): presumably populates `existing_services` - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    # Helion deployments.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloud_controller_services
      when: helion is defined and item in existing_services

    # Non-Helion deployments.
    - service: name={{ item }} enabled=yes state=started
      with_items: overcloud_controller_services
      when: helion is not defined and item in existing_services
# Preserve SSH host keys on the swift-storage nodes, rebuild them from the
# new image, wait for SSH, and refresh their configuration.
- hosts: swift-storage
  name: Rebuild and Refresh swift-storage
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; only run while the instance is still ACTIVE.
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"

    # Trigger the rebuild unless the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ swift_storage_rebuild_image_id }}"
      when: instance_status != "REBUILD"

    # Wait, from the control host, for the node's SSH port. Which banner text
    # is matched depends on `wait_for_hostkey`; a timeout is not fatal.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true

    - include: refresh_config.yml
# Bring the rebuilt swift-storage nodes back into service.
- hosts: swift-storage
  name: Enable Swift Storage
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml

    # One forced, single-shot os-collect-config run before the daemon is
    # started.
    - name: "Run os-collect-config"
      sudo: true
      command: os-collect-config --force --one
    - service: name=os-collect-config state=started

    # NOTE(review): presumably populates `existing_services` - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    # Helion deployments.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloud_swift_services
      when: helion is defined and item in existing_services

    # Non-Helion deployments.
    - service: name={{ item }} enabled=yes state=started
      with_items: overcloud_swift_services
      when: helion is not defined and item in existing_services
# Preserve SSH host keys on the VSA nodes, rebuild them from the new image,
# wait for SSH, and refresh their configuration.
- hosts: vsa
  name: Rebuild and Refresh vsa
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; only run while the instance is still ACTIVE.
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"

    # Trigger the rebuild unless the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ vsa_rebuild_image_id }}"
      when: instance_status != "REBUILD"

    # Wait, from the control host, for the node's SSH port. Which banner text
    # is matched depends on `wait_for_hostkey`; a timeout is not fatal.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true

    - include: refresh_config.yml
# Bring the rebuilt VSA nodes back into service. Only a Helion service list
# is handled here; no services are started when `helion` is undefined.
- hosts: vsa
  name: Enable VSA
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml

    # One forced, single-shot os-collect-config run before the daemon is
    # started.
    - name: "Run os-collect-config"
      sudo: true
      command: os-collect-config --force --one
    - service: name=os-collect-config state=started

    # NOTE(review): presumably populates `existing_services` - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloud_vsa_services
      when: helion is defined and item in existing_services
# Preserve state on the compute nodes, rebuild them from the new image,
# wait for SSH, and refresh their configuration.
- hosts: nova-compute
  name: Rebuild and Refresh Nova Compute
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # Pre-rebuild steps; only run while the instance is still ACTIVE.
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"

    # Trigger the rebuild unless the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
      when: instance_status != "REBUILD"

    # Wait, from the control host, for the node's SSH port. Which banner text
    # is matched depends on `wait_for_hostkey`; a timeout is not fatal.
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true

    - include: refresh_config.yml
# Bring the rebuilt compute nodes back into service.
- hosts: nova-compute
  name: Enable Overcloud Compute
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml

    # One forced, single-shot os-collect-config run before the daemon is
    # started.
    - name: "Run os-collect-config"
      sudo: true
      command: os-collect-config --force --one
    - service: name=os-collect-config state=started

    # NOTE(review): presumably populates `existing_services` - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"

    # Helion deployments.
    - service: name={{ item }} enabled=yes state=started
      with_items: helion_overcloud_compute_services
      when: helion is defined and item in existing_services

    # Non-Helion deployments.
    - service: name={{ item }} enabled=yes state=started
      with_items: overcloud_compute_services
      when: helion is not defined and item in existing_services