From ce680bcfe2208f19a23630ee3feff4b9bd40c595 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Piliszek?= <radoslaw.piliszek@gmail.com>
Date: Sun, 16 Jun 2019 20:37:35 +0200
Subject: [PATCH] Avoid parallel discover_hosts (nova-related race condition)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In rare cases, both kolla-ansible and nova-scheduler try to perform
the host mapping at the same time, and one of them fails.
Since kolla-ansible runs host discovery on each deployment,
there is no need to override the default, which disables periodic host discovery.

I added some notes for the future. They are not critical.
I made the decision explicit in the comments.
I changed the task name to satisfy recommendations.
I removed the variable because it is not used (to avoid future doubts).

Closes-Bug: #1832987

Change-Id: I3128472f028a2dbd7ace02abc179a9629ad74ceb
Signed-off-by: Radosław Piliszek <radoslaw.piliszek@gmail.com>
---
 ansible/roles/nova/tasks/discover_computes.yml | 5 +++--
 ansible/roles/nova/templates/nova.conf.j2      | 5 ++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/ansible/roles/nova/tasks/discover_computes.yml b/ansible/roles/nova/tasks/discover_computes.yml
index 1a0c904f26..647bd5d59c 100644
--- a/ansible/roles/nova/tasks/discover_computes.yml
+++ b/ansible/roles/nova/tasks/discover_computes.yml
@@ -23,15 +23,16 @@
     - nova_compute_services is success
     - nova_compute_services.stdout | from_json | length != 0
 
-- name: Discovering nova hosts
+# TODO(yoctozepto): no need to do --by-service if ironic not used
+- name: Discover nova hosts
   become: true
   command: >
     docker exec nova_api nova-manage cell_v2 discover_hosts --by-service
-  register: discover_hosts
   changed_when: False
   run_once: True
   delegate_to: "{{ groups['nova-api'][0] }}"
 
+# NOTE(yoctozepto): SIGHUP is probably unnecessary
 - name: Refresh cell cache in nova scheduler
   become: true
   command: docker kill --signal HUP nova_scheduler
diff --git a/ansible/roles/nova/templates/nova.conf.j2 b/ansible/roles/nova/templates/nova.conf.j2
index 04b196b916..39bb43d9f9 100644
--- a/ansible/roles/nova/templates/nova.conf.j2
+++ b/ansible/roles/nova/templates/nova.conf.j2
@@ -254,7 +254,10 @@ secure_proxy_ssl_header = HTTP_X_FORWARDED_PROTO
 
 [scheduler]
 max_attempts = 10
-discover_hosts_in_cells_interval = 60
+# NOTE(yoctozepto): kolla-ansible handles cell mapping by itself on each deploy,
+# so the periodic run must be disabled to avoid random failures (where both try to map).
+# -1 is the default and means periodic discovery is disabled.
+discover_hosts_in_cells_interval = -1
 
 {% if enable_nova_fake | bool %}
 default_filters = RetryFilter,AvailabilityZoneFilter,ComputeFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter