Improve multi-node AIO robustness

To improve the readability and robustness of the mnaio feature, I have
replaced the tasks that shell out to virsh with the virt modules where
available.  I have also created a vm-status play that will hopefully
help resolve SSH failures into the VMs.  This play uses the
block/rescue/handler pattern to attempt to restart a VM once if it
fails the initial SSH check.  Hopefully this will reduce the SSH
failures caused by a stuck VM.  This adds a new variable called
vm_ssh_timeout, which gives the deployer an easy place to override the
default timeout.  The python-lxml package is needed for the virt module.

Change-Id: I027556b71a8c26d08a56b4ffa56b2eeaf1cbabe9
This commit is contained in:
Dave Wilde 2018-06-15 10:35:14 -05:00
parent d0b0668657
commit 482e845d92
7 changed files with 165 additions and 65 deletions

View File

@ -1,5 +1,5 @@
---
# Copyright 2017, Rackspace US, Inc.
# Copyright 2018, Rackspace US, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -32,7 +32,9 @@
- always
- name: Stop running VMs
command: "virsh destroy {{ hostvars[item]['server_hostname'] }}"
virt:
name: "{{ hostvars[item]['server_hostname'] }}"
command: destroy
failed_when: false
when:
- hostvars[item]['server_vm'] | default(false) | bool
@ -96,17 +98,6 @@
- hostvars[item]['server_vm'] | default(false) | bool
with_items: "{{ groups['pxe_servers'] }}"
- name: Create the VM template
template:
src: kvm/kvm-vm.xml
dest: "/etc/libvirt/qemu/{{ hostvars[item]['server_hostname'] }}.xml"
mode: 0644
owner: root
group: root
when:
- hostvars[item]['server_vm'] | default(false) | bool
with_items: "{{ groups['pxe_servers'] }}"
- name: Wait for guest capabilities to appear
command: "virsh capabilities"
register: virsh_caps
@ -115,21 +106,35 @@
delay: 10
- name: Define the VM
command: "virsh define /etc/libvirt/qemu/{{ hostvars[item]['server_hostname'] }}.xml"
virt:
name: "{{ hostvars[item]['server_hostname'] }}"
command: define
xml: "{{ lookup('template', 'kvm/kvm-vm.xml.j2') }}"
failed_when: false
when:
- hostvars[item]['server_vm'] | default(false) | bool
with_items: "{{ groups['pxe_servers'] }}"
- name: Create the VM
command: "virsh create /etc/libvirt/qemu/{{ hostvars[item]['server_hostname'] }}.xml"
failed_when: false
- name: Get the VM xml
virt:
command: get_xml
name: "{{ hostvars[item]['server_hostname'] }}"
register: vm_xml
when:
- hostvars[item]['server_vm'] | default(false) | bool
with_items: "{{ groups['pxe_servers'] }}"
# Persist the libvirt XML of every VM that was defined above so that a copy
# of the domain definition lives under /etc/libvirt/qemu.
- name: Write the VM xml
  copy:
    content: "{{ item.get_xml }}"
    # Name the file after the libvirt domain (server_hostname), matching the
    # name used by the define/get_xml/start tasks, not the inventory alias.
    dest: "/etc/libvirt/qemu/{{ hostvars[item.item]['server_hostname'] }}.xml"
  # Hosts filtered out by the server_vm condition in "Get the VM xml" still
  # appear in vm_xml.results as skipped entries without a get_xml key; skip
  # them here too instead of failing on an undefined variable.
  when:
    - item.get_xml is defined
  with_items: "{{ vm_xml.results }}"
- name: Start the VM
command: "virsh start {{ hostvars[item]['server_hostname'] }}"
virt:
name: "{{ hostvars[item]['server_hostname'] }}"
command: start
state: running
failed_when: false
when:
- hostvars[item]['server_vm'] | default(false) | bool
@ -144,6 +149,10 @@
with_items: "{{ groups['pxe_servers'] }}"
- name: Check VM Connectivity
import_playbook: vm-status.yml
- name: Create vm_servers group
hosts: localhost
gather_facts: false
@ -160,23 +169,12 @@
with_items: "{{ groups['pxe_servers'] }}"
- name: Wait for deploy host
- name: VM Host Setup
hosts: vm_servers
gather_facts: false
any_errors_fatal: true
tasks:
- name: Wait for connectivity 1
local_action:
module: wait_for
host: "{{ ansible_host }}"
connect_timeout: 10
port: 22
sleep: 20
timeout: 1500
state: started
search_regex: OpenSSH
- name: copy host keys
- name: Copy Host Keys
copy:
src: "{{ item.src }}"
dest: "{{ item.dest }}"

View File

@ -30,7 +30,10 @@ default_container_tech: "{{ container_tech | default('lxc') }}"
ipxe_kernel_url: "http://boot.ipxe.org/ipxe.lkrn"
# IP address, or domain name of the TFTP server
# The timeout for the SSH check to the vm_servers
vm_ssh_timeout: 1500
# IP address, or domain name of the TFTP server
tftp_server: "{{ hostvars[groups['pxe_hosts'][0]]['ansible_host'] | default(ansible_host) }}"
# tftp_ssh_key: '' # user defined ssh key, used to access the host
tftp_port: 69

View File

@ -1,5 +1,5 @@
---
# Copyright 2017, Rackspace US, Inc.
# Copyright 2018, Rackspace US, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -215,30 +215,32 @@
with_dict: "{{ mnaio_host_networks }}"
when: mnaio_bridges is changed
- name: Disable virsh default network
shell: |
if virsh net-list | grep -qw "default"; then
virsh net-autostart default --disable
virsh net-destroy default
fi
- name: Disable default virt network
virt_net:
name: "default"
state: inactive
autostart: no
- name: Drop virsh network configs
template:
src: "kvm/libvirt-network-template.xml"
dest: "/etc/libvirt/qemu/networks/{{ item.value.iface }}.xml"
mode: "0644"
owner: root
group: root
with_dict: "{{ mnaio_host_networks }}"
- name: List virt network(s)
virt_net:
command: list_nets
register: vm_networks
- name: Enable new virsh network(s)
shell: |
if ! virsh net-list | grep -qw "{{ item.value.iface }}"; then
virsh net-define --file /etc/libvirt/qemu/networks/{{ item.value.iface }}.xml
virsh net-create --file /etc/libvirt/qemu/networks/{{ item.value.iface }}.xml
virsh net-autostart {{ item.value.iface }} || ture
fi
- name: Define virt network(s)
virt_net:
command: define
name: "{{ item.value.iface }}"
xml: "{{ lookup('template', 'kvm/libvirt-network-template.xml.j2') }}"
with_dict: "{{ mnaio_host_networks }}"
when: "item.value.iface not in vm_networks.list_nets"
- name: Create virt network(s)
virt_net:
command: create
name: "{{ item.value.iface }}"
autostart: true
with_dict: "{{ mnaio_host_networks }}"
when: "item.value.iface not in vm_networks.list_nets"
- name: Locate data volume
command: "vgdisplay vg01"
@ -276,27 +278,47 @@
- default_vm_disk_mode | default('lvm') == "lvm"
- data_volume.rc != 0
- name: Locate virsh data volume
command: "virsh pool-info vg01"
- name: Locate virt data volume
virt_pool:
name: "vg01"
command: info
failed_when: false
when:
- default_vm_disk_mode | default('lvm') == "lvm"
register: virsh_data_volume
register: virt_data_volume
- name: Create /etc/libvirt/storage directory
file:
path: "/etc/libvirt/storage/"
state: "directory"
- name: Create virsh data volume
shell: |
virsh pool-create-as vg01 logical
virsh pool-dumpxml vg01 > /etc/libvirt/storage/vg01.xml
virsh pool-define /etc/libvirt/storage/vg01.xml
virsh pool-autostart vg01 || true
- name: Create virt data volume
block:
- name: Create virt pool
virt_pool:
command: create
name: vg01
- name: Get virt pool xml
virt_pool:
command: get_xml
name: vg01
register: virt_pool_xml
- name: Write data volume xml
copy:
content: "{{ virt_pool_xml.get_xml }}"
dest: "/etc/libvirt/storage/vg01.xml"
# Make the vg01 storage pool persistent by defining it from the XML that was
# read back from the running pool in "Get virt pool xml".
- name: Define virt data volume
  virt_pool:
    command: define
    name: vg01
    # virt_pool expects the XML document itself here, not a path to a file
    # on disk, so pass the registered get_xml output directly.
    xml: "{{ virt_pool_xml.get_xml }}"
    autostart: true
when:
- default_vm_disk_mode | default('lvm') == "lvm"
- virsh_data_volume.rc != 0
- virt_data_volume.pools is not defined
- name: Load virtio kernel modules
shell: |

View File

@ -31,6 +31,7 @@ mnaio_host_distro_packages:
- ntp
- openssh-server
- python2.7
- python-lxml
- python-software-properties
- qemu-kvm
- qemu-utils
@ -54,4 +55,3 @@ mnaio_pkg_cache_server_distro_packages:
mnaio_host_iptables_service: "{{ (ansible_distribution | lower + '-' + ansible_distribution_version | lower == 'ubuntu-14.04') | ternary('iptables-persistent', 'netfilter-persistent') }}"
ssh_service_name: ssh

View File

@ -0,0 +1,77 @@
---
# Copyright 2018, Rackspace US, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Build the in-memory vm_servers group that the following play targets.
- name: Create vm_servers group
  gather_facts: false
  hosts: localhost
  tasks:
    # Walk every host in pxe_servers and add only those whose server_vm
    # flag evaluates to true (a missing flag counts as false).
    - name: VM Servers group
      with_items: "{{ groups['pxe_servers'] }}"
      when:
        - hostvars[item]['server_vm'] | default(false) | bool
      add_host:
        groups: vm_servers
        name: "{{ item }}"
# Verify that every host in vm_servers accepts connections.  If the first
# check times out, the rescue section power-cycles the VM once through
# libvirt on the local machine and waits again; the host is failed only if
# it is still unreachable (or not running) after that single retry.
- name: VM Status
  hosts: vm_servers
  gather_facts: false
  tasks:
    - name: VM Connectivity Check
      block:
        # wait_for_connection uses the host's configured connection
        # settings; it has no "port" option, so none is passed.
        - name: Wait for VM
          wait_for_connection:
            connect_timeout: 10
            sleep: 20
            timeout: "{{ vm_ssh_timeout | default(1500) }}"
      rescue:
        # The libvirt domain is named after server_hostname elsewhere in
        # these playbooks; fall back to the inventory name if it is unset.
        - name: Gather VM info (rescue)
          virt:
            command: status
            name: "{{ server_hostname | default(inventory_hostname) }}"
          connection: local
          register: vm_info

        # Only a running domain can be destroyed (hard powered off).
        - name: Stop VM (rescue)
          virt:
            command: destroy
            name: "{{ server_hostname | default(inventory_hostname) }}"
          connection: local
          when: vm_info.status == 'running'

        - name: Start VM (rescue)
          virt:
            command: start
            name: "{{ server_hostname | default(inventory_hostname) }}"
          connection: local

        # Errors are ignored here so the explicit fail task below can report
        # a clear message instead of a generic timeout.
        - name: Wait for VM (rescue)
          wait_for_connection:
            connect_timeout: 10
            sleep: 20
            timeout: "{{ vm_ssh_timeout | default(1500) }}"
          register: vm_rescue
          ignore_errors: true

        - name: Gather VM info 2nd pass (rescue)
          virt:
            command: status
            name: "{{ server_hostname | default(inventory_hostname) }}"
          connection: local
          register: vm_info_2

        # Use the "failed" test rather than comparing the boolean result to
        # the string 'true' (which never matches), and fail when EITHER the
        # domain is not running OR the second connection attempt failed.
        - name: Fail if VM still offline (rescue)
          fail:
            msg: "{{ inventory_hostname }} is not responding and cannot be rescued"
          when: vm_info_2.status != 'running' or vm_rescue is failed