From 482e845d92230c0d69a20fb4a151180a717005ad Mon Sep 17 00:00:00 2001
From: Dave Wilde <david.wilde@rackspace.com>
Date: Fri, 15 Jun 2018 10:35:14 -0500
Subject: [PATCH] Improve multi-node AIO robustness

In order to improve the readability and robustness of the mnaio feature
I have replaced the shell out to virsh tasks to use the virt module
where available.  I have also created a vm-status play that will
hopefully help resolve SSH failures into the VMs.  This play utilizes
the block/rescue/handler pattern to attempt to restart the VM once if
it fails the initial SSH check.  Hopefully this will reduce the SSH
failures due to a stuck VM.  This adds a new variable called
vm_ssh_timeout which allows the deployer an easy place to override the
default timeout.  The python-lxml package is needed for the virt module.

Change-Id: I027556b71a8c26d08a56b4ffa56b2eeaf1cbabe9
---
 multi-node-aio/playbooks/deploy-vms.yml       | 60 +++++++------
 multi-node-aio/playbooks/group_vars/all.yml   |  5 +-
 .../kvm/{kvm-vm.xml => kvm-vm.xml.j2}         |  0
 ...te.xml => libvirt-network-template.xml.j2} |  0
 multi-node-aio/playbooks/setup-host.yml       | 86 ++++++++++++-------
 multi-node-aio/playbooks/vars/ubuntu.yml      |  2 +-
 multi-node-aio/playbooks/vm-status.yml        | 77 +++++++++++++++++
 7 files changed, 165 insertions(+), 65 deletions(-)
 rename multi-node-aio/playbooks/kvm/{kvm-vm.xml => kvm-vm.xml.j2} (100%)
 rename multi-node-aio/playbooks/kvm/{libvirt-network-template.xml => libvirt-network-template.xml.j2} (100%)
 create mode 100644 multi-node-aio/playbooks/vm-status.yml

diff --git a/multi-node-aio/playbooks/deploy-vms.yml b/multi-node-aio/playbooks/deploy-vms.yml
index 3b9fa291..c40c63a7 100644
--- a/multi-node-aio/playbooks/deploy-vms.yml
+++ b/multi-node-aio/playbooks/deploy-vms.yml
@@ -1,5 +1,5 @@
 ---
-# Copyright 2017, Rackspace US, Inc.
+# Copyright 2018, Rackspace US, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,7 +32,9 @@
         - always
 
     - name: Stop running VMs
-      command: "virsh destroy {{ hostvars[item]['server_hostname'] }}"
+      virt:
+        name: "{{ hostvars[item]['server_hostname'] }}"
+        command: destroy
       failed_when: false
       when:
         - hostvars[item]['server_vm'] | default(false) | bool
@@ -96,17 +98,6 @@
             - hostvars[item]['server_vm'] | default(false) | bool
           with_items: "{{ groups['pxe_servers'] }}"
 
-    - name: Create the VM template
-      template:
-        src: kvm/kvm-vm.xml
-        dest: "/etc/libvirt/qemu/{{ hostvars[item]['server_hostname'] }}.xml"
-        mode: 0644
-        owner: root
-        group: root
-      when:
-        - hostvars[item]['server_vm'] | default(false) | bool
-      with_items: "{{ groups['pxe_servers'] }}"
-
     - name: Wait for guest capabilities to appear
       command: "virsh capabilities"
       register: virsh_caps
@@ -115,21 +106,35 @@
       delay: 10
 
     - name: Define the VM
-      command: "virsh define /etc/libvirt/qemu/{{ hostvars[item]['server_hostname'] }}.xml"
+      virt:
+        name: "{{ hostvars[item]['server_hostname'] }}"
+        command: define
+        xml: "{{ lookup('template', 'kvm/kvm-vm.xml.j2') }}"
       failed_when: false
       when:
         - hostvars[item]['server_vm'] | default(false) | bool
       with_items: "{{ groups['pxe_servers'] }}"
 
-    - name: Create the VM
-      command: "virsh create /etc/libvirt/qemu/{{ hostvars[item]['server_hostname'] }}.xml"
-      failed_when: false
+    - name: Get the VM xml
+      virt:
+        command: get_xml
+        name: "{{ hostvars[item]['server_hostname'] }}"
+      register: vm_xml
       when:
         - hostvars[item]['server_vm'] | default(false) | bool
       with_items: "{{ groups['pxe_servers'] }}"
 
+    - name: Write the VM xml  # persist each fetched domain XML under /etc/libvirt/qemu
+      copy:
+        content: "{{ item.get_xml }}"
+        dest: "/etc/libvirt/qemu/{{ hostvars[item.item]['server_hostname'] }}.xml"  # same name the domain was defined with
+      with_items: "{{ vm_xml.results | selectattr('get_xml', 'defined') | list }}"  # results skipped by the server_vm check carry no get_xml
+
     - name: Start the VM
-      command: "virsh start {{ hostvars[item]['server_hostname'] }}"
+      virt:
+        name: "{{ hostvars[item]['server_hostname'] }}"
+        command: start
+        state: running
       failed_when: false
       when:
         - hostvars[item]['server_vm'] | default(false) | bool
@@ -144,6 +149,10 @@
       with_items: "{{ groups['pxe_servers'] }}"
 
 
+- name: Check VM Connectivity
+  import_playbook: vm-status.yml
+
+
 - name: Create vm_servers group
   hosts: localhost
   gather_facts: false
@@ -160,23 +169,12 @@
       with_items: "{{ groups['pxe_servers'] }}"
 
 
-- name: Wait for deploy host
+- name: VM Host Setup
   hosts: vm_servers
   gather_facts: false
   any_errors_fatal: true
   tasks:
-    - name: Wait for connectivity 1
-      local_action:
-        module: wait_for
-        host: "{{ ansible_host }}"
-        connect_timeout: 10
-        port: 22
-        sleep: 20
-        timeout: 1500
-        state: started
-        search_regex: OpenSSH
-
-    - name: copy host keys
+    - name: Copy Host Keys
       copy:
         src: "{{ item.src }}"
         dest: "{{ item.dest }}"
diff --git a/multi-node-aio/playbooks/group_vars/all.yml b/multi-node-aio/playbooks/group_vars/all.yml
index a207c475..c2a348b4 100644
--- a/multi-node-aio/playbooks/group_vars/all.yml
+++ b/multi-node-aio/playbooks/group_vars/all.yml
@@ -30,7 +30,10 @@ default_container_tech: "{{ container_tech | default('lxc') }}"
 
 ipxe_kernel_url: "http://boot.ipxe.org/ipxe.lkrn"
 
- # IP address, or domain name of the TFTP server
+# The timeout for the SSH check to the vm_servers
+vm_ssh_timeout: 1500
+
+# IP address, or domain name of the TFTP server
 tftp_server: "{{ hostvars[groups['pxe_hosts'][0]]['ansible_host'] | default(ansible_host) }}"
 # tftp_ssh_key: ''  # user defined ssh key, used to access the host
 tftp_port: 69
diff --git a/multi-node-aio/playbooks/kvm/kvm-vm.xml b/multi-node-aio/playbooks/kvm/kvm-vm.xml.j2
similarity index 100%
rename from multi-node-aio/playbooks/kvm/kvm-vm.xml
rename to multi-node-aio/playbooks/kvm/kvm-vm.xml.j2
diff --git a/multi-node-aio/playbooks/kvm/libvirt-network-template.xml b/multi-node-aio/playbooks/kvm/libvirt-network-template.xml.j2
similarity index 100%
rename from multi-node-aio/playbooks/kvm/libvirt-network-template.xml
rename to multi-node-aio/playbooks/kvm/libvirt-network-template.xml.j2
diff --git a/multi-node-aio/playbooks/setup-host.yml b/multi-node-aio/playbooks/setup-host.yml
index e79be44c..d81b4e4b 100644
--- a/multi-node-aio/playbooks/setup-host.yml
+++ b/multi-node-aio/playbooks/setup-host.yml
@@ -1,5 +1,5 @@
 ---
-# Copyright 2017, Rackspace US, Inc.
+# Copyright 2018, Rackspace US, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -215,30 +215,32 @@
       with_dict: "{{ mnaio_host_networks }}"
       when: mnaio_bridges is changed
 
-    - name: Disable virsh default network
-      shell: |
-        if virsh net-list |  grep -qw "default"; then
-          virsh net-autostart default --disable
-          virsh net-destroy default
-        fi
+    - name: Disable default virt network  # replaces the old virsh net-autostart/net-destroy shell snippet
+      virt_net:
+        name: "default"
+        state: inactive
+        autostart: false
 
-    - name: Drop virsh network configs
-      template:
-        src: "kvm/libvirt-network-template.xml"
-        dest: "/etc/libvirt/qemu/networks/{{ item.value.iface }}.xml"
-        mode: "0644"
-        owner: root
-        group: root
-      with_dict: "{{ mnaio_host_networks }}"
+    - name: List virt network(s)
+      virt_net:
+        command: list_nets
+      register: vm_networks
 
-    - name: Enable new virsh network(s)
-      shell: |
-        if ! virsh net-list |  grep -qw "{{ item.value.iface }}"; then
-          virsh net-define --file /etc/libvirt/qemu/networks/{{ item.value.iface }}.xml
-          virsh net-create --file /etc/libvirt/qemu/networks/{{ item.value.iface }}.xml
-          virsh net-autostart {{ item.value.iface }} || ture
-        fi
+    - name: Define virt network(s)
+      virt_net:
+        command: define
+        name: "{{ item.value.iface }}"
+        xml: "{{ lookup('template', 'kvm/libvirt-network-template.xml.j2') }}"
       with_dict: "{{ mnaio_host_networks }}"
+      when: "item.value.iface not in vm_networks.list_nets"
+
+    - name: Create virt network(s)
+      virt_net:
+        command: create
+        name: "{{ item.value.iface }}"
+        autostart: true
+      with_dict: "{{ mnaio_host_networks }}"
+      when: "item.value.iface not in vm_networks.list_nets"
 
     - name: Locate data volume
       command: "vgdisplay vg01"
@@ -276,27 +278,47 @@
         - default_vm_disk_mode | default('lvm') == "lvm"
         - data_volume.rc != 0
 
-    - name: Locate virsh data volume
-      command: "virsh pool-info vg01"
+    - name: Locate virt data volume
+      virt_pool:
+        name: "vg01"
+        command: info
       failed_when: false
       when:
         - default_vm_disk_mode | default('lvm') == "lvm"
-      register: virsh_data_volume
+      register: virt_data_volume
 
     - name: Create /etc/libvirt/storage directory
       file:
         path: "/etc/libvirt/storage/"
         state: "directory"
 
-    - name: Create virsh data volume
-      shell: |
-        virsh pool-create-as vg01 logical
-        virsh pool-dumpxml vg01 > /etc/libvirt/storage/vg01.xml
-        virsh pool-define /etc/libvirt/storage/vg01.xml
-        virsh pool-autostart vg01 || true
+    - name: Create virt data volume  # create vg01, dump its XML to disk, then define it from that XML
+      block:
+        - name: Create virt pool
+          virt_pool:
+            command: create
+            name: vg01
+
+        - name: Get virt pool xml
+          virt_pool:
+            command: get_xml
+            name: vg01
+          register: virt_pool_xml
+
+        - name: Write data volume xml
+          copy:
+            content: "{{ virt_pool_xml.get_xml }}"
+            dest: "/etc/libvirt/storage/vg01.xml"
+
+        - name: Define virt data volume
+          virt_pool:
+            command: define
+            name: vg01
+            xml: "{{ virt_pool_xml.get_xml }}"  # define takes the XML document itself, not a path to it
+            autostart: true
       when:
         - default_vm_disk_mode | default('lvm') == "lvm"
-        - virsh_data_volume.rc != 0
+        - virt_data_volume.pools is not defined  # NOTE(review): 'info' may always return 'pools'; confirm this block can run when vg01 is missing
 
     - name: Load virtio kernel modules
       shell: |
diff --git a/multi-node-aio/playbooks/vars/ubuntu.yml b/multi-node-aio/playbooks/vars/ubuntu.yml
index adaeea65..5d2d04e3 100644
--- a/multi-node-aio/playbooks/vars/ubuntu.yml
+++ b/multi-node-aio/playbooks/vars/ubuntu.yml
@@ -31,6 +31,7 @@ mnaio_host_distro_packages:
   - ntp
   - openssh-server
   - python2.7
+  - python-lxml
   - python-software-properties
   - qemu-kvm
   - qemu-utils
@@ -54,4 +55,3 @@ mnaio_pkg_cache_server_distro_packages:
 mnaio_host_iptables_service: "{{ (ansible_distribution | lower + '-' + ansible_distribution_version | lower == 'ubuntu-14.04') | ternary('iptables-persistent', 'netfilter-persistent') }}"
 
 ssh_service_name: ssh
-
diff --git a/multi-node-aio/playbooks/vm-status.yml b/multi-node-aio/playbooks/vm-status.yml
new file mode 100644
index 00000000..7154c4ef
--- /dev/null
+++ b/multi-node-aio/playbooks/vm-status.yml
@@ -0,0 +1,77 @@
+---
+# Copyright 2018, Rackspace US, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+- name: Create vm_servers group
+  hosts: localhost
+  gather_facts: false
+  tasks:
+    - name: VM Servers group
+      add_host:
+        name: "{{ item }}"
+        groups: vm_servers
+      when:
+        - hostvars[item]['server_vm'] | default(false) | bool
+      with_items: "{{ groups['pxe_servers'] }}"
+
+- name: VM Status
+  hosts: vm_servers
+  gather_facts: false
+  tasks:
+    - name: VM Connectivity Check
+      block:
+        - name: Wait for VM  # first connectivity attempt; a failure here triggers the rescue section
+          wait_for_connection:
+            connect_timeout: 10
+            port: 22  # NOTE(review): 'port' does not appear among wait_for_connection's documented options - confirm it is accepted
+            sleep: 20
+            timeout: "{{ vm_ssh_timeout }}"  # deployer-overridable, set in group_vars/all.yml
+      rescue:
+        - name: Gather VM info (rescue)
+          virt:
+            command: status
+            name: "{{ inventory_hostname }}"
+          connection: local
+          register: vm_info
+        - name: Stop VM (rescue)
+          virt:
+            command: destroy
+            name: "{{ inventory_hostname }}"
+          connection: local
+          when: vm_info.status == 'running'
+        - name: Start VM (rescue)
+          virt:
+            command: start
+            name: "{{ inventory_hostname }}"
+          connection: local
+        - name: Wait for VM (rescue)
+          wait_for_connection:
+            connect_timeout: 10
+            port: 22
+            sleep: 20
+            timeout: "{{ vm_ssh_timeout }}"
+          register: vm_rescue
+          ignore_errors: true
+        - name: Gather VM info 2nd pass (rescue)
+          virt:
+            command: status
+            name: "{{ inventory_hostname }}"
+          connection: local
+          register: vm_info_2
+        - name: Fail if VM still offline (rescue)  # abort once the single restart attempt has not helped
+          fail:
+            msg: "{{ inventory_hostname }} is not responding and cannot be rescued"
+          when:
+            - vm_info_2.status != 'running'
+            - vm_rescue is failed  # 'failed' is a boolean; comparing it with the string 'true' never matched