From b25c0ee4772fdf04fe6f263a8e81571204f706ff Mon Sep 17 00:00:00 2001
From: Mark Goddard <mark@stackhpc.com>
Date: Sun, 17 Mar 2019 17:48:54 +0000
Subject: [PATCH] Fix MariaDB 10.3 upgrade

Upgrading MariaDB from Rocky to Stein currently fails, with the new
container left continually restarting. The problem is that the Rocky
container does not shut down cleanly, leaving behind state that the new
container cannot recover from. The container does not shut down cleanly
because we run dumb-init with a --single-child argument, causing it to
forward signals to only the process executed by dumb-init. In our case
this is mysqld_safe, which ignores various signals, including SIGTERM.
After a (default 10 second) timeout, Docker then kills the container.

A Kolla change [1] removes the --single-child argument from dumb-init
for the MariaDB container, however we still need to support upgrading
from Rocky images that don't have this change. To do that, we add new
handlers to execute 'mysqladmin shutdown' to cleanly shut down the
service.

A second issue with the current upgrade approach is that we don't
execute mysql_upgrade after starting the new service. This can leave the
database state using the format of the previous release. This patch also
adds handlers to execute mysql_upgrade.

[1] https://review.openstack.org/644244

Depends-On: https://review.openstack.org/644244
Depends-On: https://review.openstack.org/645990
Change-Id: I08a655a359ff9cfa79043f2166dca59199c7d67f
Closes-Bug: #1820325
---
 ansible/roles/mariadb/handlers/main.yml       | 152 +++++++++++++++---
 .../roles/mariadb/tasks/bootstrap_cluster.yml |   2 +-
 ansible/roles/mariadb/tasks/config.yml        |  16 +-
 ansible/roles/mariadb/templates/galera.cnf.j2 |   5 +
 4 files changed, 138 insertions(+), 37 deletions(-)

diff --git a/ansible/roles/mariadb/handlers/main.yml b/ansible/roles/mariadb/handlers/main.yml
index d2b16b5aec..d378839824 100644
--- a/ansible/roles/mariadb/handlers/main.yml
+++ b/ansible/roles/mariadb/handlers/main.yml
@@ -20,11 +20,9 @@
   when:
     - bootstrap_host is defined
     - bootstrap_host == inventory_hostname
+  listen: Bootstrap MariaDB cluster
   notify:
-    - wait first mariadb container
-    - restart slave mariadb
-    - restart master mariadb
-
+    - restart mariadb
 
 # TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
 # https://github.com/ansible/ansible-modules-core/issues/2788
@@ -42,12 +40,45 @@
   when:
     - bootstrap_host is defined
     - bootstrap_host == inventory_hostname
+  listen: Bootstrap MariaDB cluster
+
+# NOTE(mgoddard): In Rocky the MariaDB image had an issue where it would not
+# stop on demand, and would result in Docker forcibly killing the container.
+# This could lead to a failed upgrade if the new image is unable to recover
+# from the crash. See https://bugs.launchpad.net/kolla-ansible/+bug/1820325.
+# TODO(mgoddard): Remove this task in Train.
+- name: shutdown slave mariadb
+  vars:
+    service_name: "mariadb"
+    service: "{{ mariadb_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "start_container"
+    command: >-
+      bash -c '
+      sudo -E kolla_set_configs &&
+      mysqladmin shutdown --host={{ api_interface_address }} --user=root --password={{ database_password }}
+      '
+    common_options: "{{ docker_common_options }}"
+    detach: False
+    name: "mariadb_shutdown"
+    image: "{{ service.image }}"
+    volumes: "{{ service.volumes }}"
+    dimensions: "{{ service.dimensions }}"
+    labels:
+      UPGRADE:
+    restart_policy: "never"
+  no_log: true
+  when:
+    - kolla_action != "config"
+    - has_cluster | bool
+    - inventory_hostname != master_host
+  listen: restart mariadb
 
 - name: restart slave mariadb
   vars:
     service_name: "mariadb"
     service: "{{ mariadb_services[service_name] }}"
-    mariadb_container: "{{ check_mariadb_containers.results|selectattr('item.key', 'equalto', service_name)|first }}"
   become: true
   kolla_docker:
     action: "recreate_or_restart_container"
@@ -59,15 +90,7 @@
   when:
     - kolla_action != "config"
     - inventory_hostname != master_host
-    - inventory_hostname in groups[service.group]
-    - service.enabled | bool
-    - mariadb_config_json.changed | bool
-      or mariadb_galera_conf.changed | bool
-      or mariadb_wsrep_notify.changed | bool
-      or mariadb_container.changed | bool
-      or bootstrap_host is defined
-  notify:
-    - wait for slave mariadb
+  listen: restart mariadb
 
 # TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
 # https://github.com/ansible/ansible-modules-core/issues/2788
@@ -85,12 +108,72 @@
   when:
     - kolla_action != "config"
     - inventory_hostname != master_host
+  listen: restart mariadb
+
+- name: run upgrade on slave
+  vars:
+    service_name: "mariadb"
+    service: "{{ mariadb_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "start_container"
+    common_options: "{{ docker_common_options }}"
+    detach: False
+    dimensions: "{{ service.dimensions }}"
+    environment:
+      KOLLA_UPGRADE:
+      KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
+      DB_HOST: "{{ api_interface_address }}"
+      DB_PORT: "{{ mariadb_port }}"
+      DB_ROOT_PASSWORD: "{{ database_password }}"
+    image: "{{ service.image }}"
+    labels:
+      UPGRADE:
+    name: "upgrade_mariadb"
+    restart_policy: "never"
+    volumes: "{{ service.volumes }}"
+  no_log: true
+  when:
+    - kolla_action == "upgrade"
+    - inventory_hostname != master_host
+  listen: restart mariadb
+
+# NOTE(mgoddard): In Rocky the MariaDB image had an issue where it would not
+# stop on demand, and would result in Docker forcibly killing the container.
+# This could lead to a failed upgrade if the new image is unable to recover
+# from the crash. See https://bugs.launchpad.net/kolla-ansible/+bug/1820325.
+# TODO(mgoddard): Remove this task in Train.
+- name: shutdown master mariadb
+  vars:
+    service_name: "mariadb"
+    service: "{{ mariadb_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "start_container"
+    command: >-
+      bash -c '
+      sudo -E kolla_set_configs &&
+      mysqladmin shutdown --host={{ api_interface_address }} --user=root --password={{ database_password }}
+      '
+    common_options: "{{ docker_common_options }}"
+    detach: False
+    name: "mariadb_shutdown"
+    image: "{{ service.image }}"
+    volumes: "{{ service.volumes }}"
+    dimensions: "{{ service.dimensions }}"
+    labels:
+      UPGRADE:
+    restart_policy: "never"
+  no_log: true
+  when:
+    - kolla_action != "config"
+    - inventory_hostname == master_host
+  listen: restart mariadb
 
 - name: restart master mariadb
   vars:
     service_name: "mariadb"
     service: "{{ mariadb_services[service_name] }}"
-    mariadb_container: "{{ check_mariadb_containers.results|selectattr('item.key', 'equalto', service_name)|first }}"
   become: true
   kolla_docker:
     action: "recreate_or_restart_container"
@@ -102,15 +185,7 @@
   when:
     - kolla_action != "config"
     - inventory_hostname == master_host
-    - inventory_hostname in groups[service.group]
-    - service.enabled | bool
-    - mariadb_config_json.changed | bool
-      or mariadb_galera_conf.changed | bool
-      or mariadb_wsrep_notify.changed | bool
-      or mariadb_container.changed | bool
-      or bootstrap_host is defined
-  notify:
-    - Waiting for master mariadb
+  listen: restart mariadb
 
 # TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
 # https://github.com/ansible/ansible-modules-core/issues/2788
@@ -128,3 +203,32 @@
   when:
     - kolla_action != "config"
     - inventory_hostname == master_host
+  listen: restart mariadb
+
+- name: run upgrade on master
+  vars:
+    service_name: "mariadb"
+    service: "{{ mariadb_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "start_container"
+    common_options: "{{ docker_common_options }}"
+    detach: False
+    dimensions: "{{ service.dimensions }}"
+    environment:
+      KOLLA_UPGRADE:
+      KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
+      DB_HOST: "{{ api_interface_address }}"
+      DB_PORT: "{{ mariadb_port }}"
+      DB_ROOT_PASSWORD: "{{ database_password }}"
+    image: "{{ service.image }}"
+    labels:
+      UPGRADE:
+    name: "upgrade_mariadb"
+    restart_policy: "never"
+    volumes: "{{ service.volumes }}"
+  no_log: true
+  when:
+    - kolla_action == "upgrade"
+    - inventory_hostname == master_host
+  listen: restart mariadb
diff --git a/ansible/roles/mariadb/tasks/bootstrap_cluster.yml b/ansible/roles/mariadb/tasks/bootstrap_cluster.yml
index a5425e5e1f..aeb8bb7fa8 100644
--- a/ansible/roles/mariadb/tasks/bootstrap_cluster.yml
+++ b/ansible/roles/mariadb/tasks/bootstrap_cluster.yml
@@ -20,7 +20,7 @@
     restart_policy: "never"
     volumes: "{{ service.volumes }}"
   notify:
-    - Starting first MariaDB container
+    - Bootstrap MariaDB cluster
 
 - set_fact:
     bootstrap_host: "{{ inventory_hostname }}"
diff --git a/ansible/roles/mariadb/tasks/config.yml b/ansible/roles/mariadb/tasks/config.yml
index 83c1a273f9..cb05451974 100644
--- a/ansible/roles/mariadb/tasks/config.yml
+++ b/ansible/roles/mariadb/tasks/config.yml
@@ -48,13 +48,11 @@
     dest: "{{ node_config_directory }}/{{ service_name }}/config.json"
     mode: "0660"
   become: true
-  register: mariadb_config_json
   when:
     - inventory_hostname in groups[service.group]
     - service.enabled | bool
   notify:
-    - restart slave mariadb
-    - restart master mariadb
+    - restart mariadb
 
 - name: Copying over galera.cnf
   vars:
@@ -68,13 +66,11 @@
     dest: "{{ node_config_directory }}/{{ service_name }}/galera.cnf"
     mode: "0660"
   become: true
-  register: mariadb_galera_conf
   when:
     - inventory_hostname in groups[service.group]
     - service.enabled | bool
   notify:
-    - restart slave mariadb
-    - restart master mariadb
+    - restart mariadb
 
 - name: Copying over wsrep-notify.sh
   template:
@@ -82,14 +78,12 @@
     dest: "{{ node_config_directory }}/{{ item.key }}/wsrep-notify.sh"
     mode: "0770"
   become: true
-  register: mariadb_wsrep_notify
   when:
     - inventory_hostname in groups[item.value.group]
     - item.value.enabled | bool
   with_dict: "{{ mariadb_services }}"
   notify:
-    - restart slave mariadb
-    - restart master mariadb
+    - restart mariadb
 
 - name: Check mariadb containers
   become: true
@@ -100,12 +94,10 @@
     image: "{{ item.value.image }}"
     volumes: "{{ item.value.volumes }}"
     dimensions: "{{ item.value.dimensions }}"
-  register: check_mariadb_containers
   when:
     - kolla_action != "config"
     - inventory_hostname in groups[item.value.group]
     - item.value.enabled | bool
   with_dict: "{{ mariadb_services }}"
   notify:
-    - restart slave mariadb
-    - restart master mariadb
+    - restart mariadb
diff --git a/ansible/roles/mariadb/templates/galera.cnf.j2 b/ansible/roles/mariadb/templates/galera.cnf.j2
index 7e9611faa3..5ee209cd37 100644
--- a/ansible/roles/mariadb/templates/galera.cnf.j2
+++ b/ansible/roles/mariadb/templates/galera.cnf.j2
@@ -54,5 +54,10 @@ innodb_buffer_pool_size = '{{ dynamic_pool_size_mb }}M'
 innodb_buffer_pool_size = '8192M'
 {% endif %}
 
+# The default value for innodb_lock_schedule_algorithm is VATS, but this does
+# not work with galera. Set FCFS explicitly to avoid a warning.
+# https://mariadb.com/kb/en/library/innodb-system-variables/#innodb_lock_schedule_algorithm.
+innodb_lock_schedule_algorithm = FCFS
+
 [server]
 pid-file=/var/lib/mysql/mariadb.pid