From 86f373a198620a7082db8e243644fd8d53802c73 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 27 Jun 2019 12:17:17 +0100 Subject: [PATCH] Fixes for MariaDB bootstrap and recovery * Fix wsrep sequence number detection. Log message format is 'WSREP: Recovered position: <UUID>:<seqno>' but we were picking out the UUID rather than the sequence number. This is as good as random. * Add become: true to log file reading and removal since I4a5ebcedaccb9261dbc958ec67e8077d7980e496 added become: true to the 'docker cp' command which creates it. * Don't run handlers during recovery. If the config files change we would end up restarting the cluster twice. * Wait for wsrep recovery container completion (don't detach). This avoids a potential race between wsrep recovery and the subsequent 'stop_container'. * Finally, we now wait for the bootstrap host to report that it is in an OPERATIONAL state. Without this we can see errors where the MariaDB cluster is not ready when used by other services. Change-Id: Iaf7862be1affab390f811fc485fd0eb6879fd583 Closes-Bug: #1834467 --- ansible/roles/mariadb/handlers/main.yml | 24 +++++++++++++ .../roles/mariadb/tasks/recover_cluster.yml | 35 ++++++++++++++----- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/ansible/roles/mariadb/handlers/main.yml b/ansible/roles/mariadb/handlers/main.yml index d033561815..76ff2757cf 100644 --- a/ansible/roles/mariadb/handlers/main.yml +++ b/ansible/roles/mariadb/handlers/main.yml @@ -42,6 +42,24 @@ - bootstrap_host == inventory_hostname listen: Bootstrap MariaDB cluster +- name: Wait for MariaDB to become operational + become: true + command: >- + docker exec {{ mariadb_service.container_name }} + mysql -uroot -p{{ database_password }} + --silent --skip-column-names + -e 'SHOW STATUS LIKE "wsrep_evs_state"' + changed_when: false + register: result + until: '"OPERATIONAL" in result.stdout' + retries: 10 + delay: 6 + no_log: true + when: + - bootstrap_host is defined + - bootstrap_host == 
inventory_hostname + listen: Bootstrap MariaDB cluster + - name: restart slave mariadb vars: service_name: "mariadb" @@ -57,6 +75,7 @@ when: - kolla_action != "config" - inventory_hostname != master_host + - not mariadb_recover | default(false) listen: restart mariadb # TODO(jeffrey4l), remove the task check when the wait_for bug is fixed @@ -75,6 +94,7 @@ when: - kolla_action != "config" - inventory_hostname != master_host + - not mariadb_recover | default(false) listen: restart mariadb - name: run upgrade on slave @@ -103,6 +123,7 @@ when: - kolla_action == "upgrade" - inventory_hostname != master_host + - not mariadb_recover | default(false) listen: restart mariadb - name: restart master mariadb @@ -120,6 +141,7 @@ when: - kolla_action != "config" - inventory_hostname == master_host + - not mariadb_recover | default(false) listen: restart mariadb # TODO(jeffrey4l), remove the task check when the wait_for bug is fixed @@ -138,6 +160,7 @@ when: - kolla_action != "config" - inventory_hostname == master_host + - not mariadb_recover | default(false) listen: restart mariadb - name: run upgrade on master @@ -166,4 +189,5 @@ when: - kolla_action == "upgrade" - inventory_hostname == master_host + - not mariadb_recover | default(false) listen: restart mariadb diff --git a/ansible/roles/mariadb/tasks/recover_cluster.yml b/ansible/roles/mariadb/tasks/recover_cluster.yml index 1181b1a7d8..1dbc3a5f7c 100644 --- a/ansible/roles/mariadb/tasks/recover_cluster.yml +++ b/ansible/roles/mariadb/tasks/recover_cluster.yml @@ -26,36 +26,37 @@ name: "{{ mariadb_service.container_name }}" action: "stop_container" + # Run wsrep recovery with detach=false to block until completion. Use a + # different container name to avoid the mariadb container being removed. 
- name: Run MariaDB wsrep recovery become: true kolla_docker: action: "start_container" common_options: "{{ docker_common_options }}" + detach: false environment: KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}" BOOTSTRAP_ARGS: "--wsrep-recover" image: "{{ mariadb_service.image }}" labels: BOOTSTRAP: - name: "{{ mariadb_service.container_name }}" + name: mariadb_wsrep_recovery restart_policy: "never" volumes: "{{ mariadb_service.volumes }}" - - name: Stop MariaDB containers - become: true - kolla_docker: - name: "{{ mariadb_service.container_name }}" - action: "stop_container" - - name: Copying MariaDB log file to /tmp become: true shell: "docker cp {{ mariadb_service.container_name }}:/var/log/kolla/mariadb/mariadb.log /tmp/mariadb_tmp.log" + # Look for sequence number in logs. Format is: + # WSREP: Recovered position: <UUID>:<seqno>. - name: Get MariaDB wsrep recovery seqno - shell: tail -n 200 /tmp/mariadb_tmp.log | awk -F" " '$0~/Recovered position/{print $NF;exit;}' | awk -F":" '{print $1}' + become: true + shell: tail -n 200 /tmp/mariadb_tmp.log | awk -F" " '$0~/Recovered position/{print $NF;exit;}' | awk -F":" '{print $2}' register: wsrep_recovery_seqno - name: Removing MariaDB log file from /tmp + become: true file: path: /tmp/mariadb_tmp.log state: absent @@ -104,6 +105,7 @@ - bootstrap_host == inventory_hostname - name: Set grastate.dat file from MariaDB container in bootstrap host + become: true lineinfile: dest: /tmp/kolla_mariadb_grastate.dat regexp: 'safe_to_bootstrap:(.*)$' @@ -162,6 +164,23 @@ - bootstrap_host is defined - bootstrap_host == inventory_hostname +- name: Wait for MariaDB to become operational + become: true + command: >- + docker exec {{ mariadb_service.container_name }} + mysql -uroot -p{{ database_password }} + --silent --skip-column-names + -e 'SHOW STATUS LIKE "wsrep_evs_state"' + changed_when: false + register: result + until: '"OPERATIONAL" in result.stdout' + retries: 10 + delay: 6 + no_log: true + when: + - bootstrap_host is defined + - 
bootstrap_host == inventory_hostname + - name: Restart slave MariaDB container become: true kolla_docker: