From 309278389e6362202f97b24ac67d9732fd79a522 Mon Sep 17 00:00:00 2001
From: Steven Fitzpatrick <steven.fitzpatrick@att.com>
Date: Thu, 4 Jun 2020 03:48:46 -0500
Subject: [PATCH] Elasticsearch: Update Rolling Restart Procedure

This change implements the recommended rolling restart procedure[0]
for elasticsearch-data pods.

[0] https://www.elastic.co/guide/en/elasticsearch/reference/7.x/restart-cluster.html#restart-cluster-rolling

Change-Id: I935b3681999e9bda616898f2b5e01f582ee54ed9
---
 .../templates/bin/_elasticsearch.sh.tpl       | 69 ++++++++++++-------
 1 file changed, 46 insertions(+), 23 deletions(-)

diff --git a/elasticsearch/templates/bin/_elasticsearch.sh.tpl b/elasticsearch/templates/bin/_elasticsearch.sh.tpl
index c00205fe07..27a0cda22d 100644
--- a/elasticsearch/templates/bin/_elasticsearch.sh.tpl
+++ b/elasticsearch/templates/bin/_elasticsearch.sh.tpl
@@ -34,19 +34,29 @@ function stop () {
   kill -TERM 1
 }
 
+function wait_to_join() {
+  # Poll _cat/nodes every 5 seconds until ${NODE_NAME} is listed as a whole word.
+  local joined=""
+  until [ -n "$joined" ]; do
+    joined=$(curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" "${ELASTICSEARCH_ENDPOINT}/_cat/nodes" | grep -wF -- "$NODE_NAME" || true )
+    [ -n "$joined" ] || sleep 5
+  done
+}
+
 function allocate_data_node () {
-  CLUSTER_SETTINGS=$(curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" \
-    "${ELASTICSEARCH_ENDPOINT}/_cluster/settings")
-  if echo "${CLUSTER_SETTINGS}" | grep -E "${NODE_NAME}"; then
-    echo "Activate node ${NODE_NAME}"
-    curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPUT -H 'Content-Type: application/json' \
+  if [ -f /data/restarting ]; then  # marker file that drain_data_node touches just before shutdown
+    rm /data/restarting  # consume the marker so this branch is skipped unless another drain recreates it
+    echo "Node ${NODE_NAME} has restarted. Waiting to rejoin the cluster."
+    wait_to_join  # returns only once this node's name is listed by _cat/nodes
+
+    echo "Re-enabling Replica Shard Allocation"
+    curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPUT -H 'Content-Type: application/json' \
      "${ELASTICSEARCH_ENDPOINT}/_cluster/settings" -d "{
-      \"transient\" :{
-          \"cluster.routing.allocation.exclude._name\" : null
+      \"persistent\": {
+        \"cluster.routing.allocation.enable\": null
       }
     }"
   fi
-  echo "Node ${NODE_NAME} is ready to be used"
 }
 
 function start_master_node () {
@@ -76,24 +86,37 @@ function start_data_node () {
   allocate_data_node &
   /usr/local/bin/docker-entrypoint.sh elasticsearch &
   function drain_data_node () {
-    echo "Prepare to migrate data off node ${NODE_NAME}"
-    echo "Move all data from node ${NODE_NAME}"
-    curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPUT -H 'Content-Type: application/json' \
+    # Prepare this data node for shutdown, then signal PID 1 to terminate.
+    # Implement the Rolling Restart Protocol Described Here:
+    # https://www.elastic.co/guide/en/elasticsearch/reference/7.x/restart-cluster.html#restart-cluster-rolling
+
+    echo "Disabling Replica Shard Allocation"
+    curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPUT -H 'Content-Type: application/json' \
      "${ELASTICSEARCH_ENDPOINT}/_cluster/settings" -d "{
-      \"transient\" :{
-          \"cluster.routing.allocation.exclude._name\" : \"${NODE_NAME}\"
+      \"persistent\": {
+        \"cluster.routing.allocation.enable\": \"primaries\"
       }
     }"
-    echo ""
-    while true ; do
-      echo -e "Wait for node ${NODE_NAME} to become empty"
-      SHARDS_ALLOCATION=$(curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" \
-        -XGET "${ELASTICSEARCH_ENDPOINT}/_cat/shards")
-      if ! echo "${SHARDS_ALLOCATION}" | grep -E "${NODE_NAME}"; then
-        break
-      fi
-      sleep 5
-    done
+
+    # Versions below 7.6 (7.0-7.5 and older majors) use _flush/synced; 7.6 and later use plain _flush.
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-synced-flush-api.html#indices-synced-flush-api
+
+    version=$(curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" "${ELASTICSEARCH_ENDPOINT}/" | jq -r .version.number)
+    # Anchor on major.minor: a bare "7.1" substring test would also match 7.10+ and miss 7.0 and 7.2-7.5.
+    if [[ $version =~ ^([0-6]|7\.[0-5])\. ]]; then
+      action="_flush/synced"
+    else
+      action="_flush"
+    fi
+
+    curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPOST "${ELASTICSEARCH_ENDPOINT}/$action"
+
+    # TODO: Check the response of synced flush operations to make sure there are no failures.
+    # Synced flush operations that fail due to pending indexing operations are listed in the response body,
+    # although the request itself still returns a 200 OK status. If there are failures, reissue the request.
+    # (The only side effect of not doing so is slower start up times. See flush documentation linked above)
+
+    touch /data/restarting  # marker that allocate_data_node checks for on the next start
     echo "Node ${NODE_NAME} is ready to shutdown"
     kill -TERM 1
   }