From 49b58b7e7d51ac9fc793512a2a26403ac6af65e3 Mon Sep 17 00:00:00 2001
From: Hemachandra Reddy <hr858f@att.com>
Date: Fri, 1 Mar 2019 16:49:26 +0000
Subject: [PATCH] Health probe for Nova components

Health probe for Nova pods is used for both liveness
and readiness probe.

nova-compute, nova-conductor, nova-consoleauth and nova-scheduler:
Checks that the RPC sockets from the nova pods to rabbitmq and
the database are in the ESTABLISHED state.
Sends an RPC call with a non-existent method to the component's queue.
The probe succeeds if the agent returns a NoSuchMethod error.
If the agent is not reachable or fails to respond in time,
the probe returns failure.

novnc/spice proxy: uses the Kubernetes TCP probe on the corresponding
ports they expose.
Added code to catch the "nova config file not present" exception.

Change-Id: Ib8e4b93486588320fd2d562c3bc90b65844e52e5
---
 nova/templates/bin/_health-probe.py.tpl    | 208 +++++++++++++++++++++
 nova/templates/configmap-bin.yaml          |   2 +
 nova/templates/daemonset-compute.yaml      |  29 +++
 nova/templates/deployment-conductor.yaml   |  29 +++
 nova/templates/deployment-consoleauth.yaml |  29 +++
 nova/templates/deployment-novncproxy.yaml  |   8 +
 nova/templates/deployment-scheduler.yaml   |  29 +++
 nova/templates/deployment-spiceproxy.yaml  |   8 +
 8 files changed, 342 insertions(+)
 create mode 100644 nova/templates/bin/_health-probe.py.tpl

diff --git a/nova/templates/bin/_health-probe.py.tpl b/nova/templates/bin/_health-probe.py.tpl
new file mode 100644
index 0000000000..683387476a
--- /dev/null
+++ b/nova/templates/bin/_health-probe.py.tpl
@@ -0,0 +1,208 @@
+#!/usr/bin/env python2
+
+# Copyright 2019 The Openstack-Helm Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Health probe script for OpenStack service that uses RPC/unix domain socket for
+communication. Checks the RPC TCP socket status on the process and sends a
+message to the service through an RPC call, expecting a reply. It is expected
+to receive a failure from the service's RPC server as the method does not exist.
+
+Script returns failure to Kubernetes only when
+  a. TCP sockets for the RPC communication are not established.
+  b. service is not reachable or
+  c. service times out sending a reply.
+
+sys.stderr.write() writes to pod's events on failures.
+
+Usage example for Nova Compute:
+# python health-probe.py --config-file /etc/nova/nova.conf \
+#  --service-queue-name compute
+
+"""
+
+import psutil
+import socket
+import sys
+
+from oslo_config import cfg
+from oslo_context import context
+from oslo_log import log
+import oslo_messaging
+
+
+tcp_established = "ESTABLISHED"
+
+
+def check_service_status(transport):
+    """Verify service status. Return success if service consumes message"""
+    try:
+        target = oslo_messaging.Target(topic=cfg.CONF.service_queue_name,
+                                       server=socket.gethostname())
+        client = oslo_messaging.RPCClient(transport, target,
+                                          timeout=60,
+                                          retry=2)
+        client.call(context.RequestContext(),
+                    'pod_health_probe_method_ignore_errors')
+    except oslo_messaging.exceptions.MessageDeliveryFailure:
+        # Log to pod events
+        sys.stderr.write("Health probe unable to reach message bus")
+        sys.exit(0)  # return success
+    except oslo_messaging.rpc.client.RemoteError as re:
+        if ("Endpoint does not support RPC method" in re.message) or \
+                ("Endpoint does not support RPC version" in re.message):
+            sys.exit(0)  # Call reached the service
+        else:
+            sys.stderr.write("Health probe unable to reach service")
+            sys.exit(1)  # return failure
+    except oslo_messaging.exceptions.MessagingTimeout:
+        sys.stderr.write("Health probe timed out. Agent is down or response "
+                         "timed out")
+        sys.exit(1)  # return failure
+    except Exception as ex:
+        sys.stderr.write("Health probe caught exception sending message to "
+                         "service: %s" % ex.message)
+        sys.exit(0)
+    except:
+        sys.stderr.write("Health probe caught exception sending message to"
+                         " service")
+        sys.exit(0)
+
+
+def tcp_socket_status(process, port):
+    """Check the tcp socket status on a process"""
+    sock_count = 0
+    parentId = 0
+    for pr in psutil.pids():
+        try:
+            p = psutil.Process(pr)
+            if p.name() == process:
+                if parentId == 0:
+                    parentId = p.pid
+                else:
+                    if p.ppid() == parentId:
+                        continue
+                pcon = p.connections()
+                for con in pcon:
+                    try:
+                        rport = con.raddr[1]
+                        status = con.status
+                    except IndexError:
+                        continue
+                    if rport == port and status == tcp_established:
+                        sock_count = sock_count + 1
+        except psutil.NoSuchProcess:
+            continue
+
+    if sock_count == 0:
+        return 0
+    else:
+        return 1
+
+
+def configured_port_in_conf():
+    """Get the rabbitmq/Database port configured in config file"""
+    rabbitmq_port = 0
+    database_port = 0
+    try:
+        with open(sys.argv[2]) as conf_file:
+            for line in conf_file:
+                if "transport_url" in line:
+                    rabbitmq_port = int(line.split(':', 3)[3].split('/')[0])
+                elif "connection =" in line:
+                    service = line.split(':', 3)[3].split('/')[1].rstrip('\n')
+                    if service == "nova":
+                        database_port = int(
+                            line.split(':', 3)[3].split('/')[0])
+            return rabbitmq_port, database_port
+    except IOError:
+        sys.stderr.write("Nova Config file not present")
+        sys.exit(1)
+
+
+def test_tcp_socket(service):
+    """Check tcp socket to rabbitmq/db is in Established state"""
+    dict_services = {
+        "compute": "nova-compute",
+        "conductor": "nova-conductor",
+        "consoleauth": "nova-consoleaut",
+        "scheduler": "nova-scheduler"
+    }
+    r_port, d_port = configured_port_in_conf()
+
+    if service in dict_services:
+        proc = dict_services[service]
+        if r_port != 0 and tcp_socket_status(proc, r_port) == 0:
+            sys.stderr.write("RabbitMQ socket not established")
+            # Do not kill the pod if RabbitMQ is not reachable/down
+            if not cfg.CONF.liveness_probe:
+                sys.exit(1)
+
+        # let's do the db check
+        if service != "compute":
+            if d_port != 0 and tcp_socket_status(proc, d_port) == 0:
+                sys.stderr.write("Database socket not established")
+                # Do not kill the pod if database is not reachable/down
+                # there could be no socket as well as typically connections
+                # get closed after an idle timeout
+                # Just log it to pod events
+                if not cfg.CONF.liveness_probe:
+                    sys.exit(1)
+
+
+def test_rpc_liveness():
+    """Test if service can consume message from queue"""
+    oslo_messaging.set_transport_defaults(control_exchange='nova')
+
+    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
+                                title='RabbitMQ options')
+    cfg.CONF.register_group(rabbit_group)
+    cfg.CONF.register_cli_opt(cfg.StrOpt('service-queue-name'))
+    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
+                                          required=False))
+
+    cfg.CONF(sys.argv[1:])
+
+    log.logging.basicConfig(level=log.ERROR)
+
+    try:
+        transport = oslo_messaging.get_transport(cfg.CONF)
+    except Exception as ex:
+        sys.stderr.write("Message bus driver load error: %s" % ex.message)
+        sys.exit(0)  # return success
+
+    if not cfg.CONF.transport_url or \
+            not cfg.CONF.service_queue_name:
+        sys.stderr.write("Both message bus URL and service's queue name are "
+                         "required for health probe to work")
+        sys.exit(0)  # return success
+
+    try:
+        cfg.CONF.set_override('rabbit_max_retries', 2,
+                              group=rabbit_group)  # 3 attempts
+    except cfg.NoSuchOptError as ex:
+        cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2),
+                              group=rabbit_group)
+
+    service = cfg.CONF.service_queue_name
+    test_tcp_socket(service)
+
+    check_service_status(transport)
+
+
+if __name__ == "__main__":
+    test_rpc_liveness()
+
+    sys.exit(0)  # return success
diff --git a/nova/templates/configmap-bin.yaml b/nova/templates/configmap-bin.yaml
index e422b62196..c58b90bd7e 100644
--- a/nova/templates/configmap-bin.yaml
+++ b/nova/templates/configmap-bin.yaml
@@ -51,6 +51,8 @@ data:
   ceph-admin-keyring.sh: |
 {{ tuple "bin/_ceph-admin-keyring.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
 {{- end }}
+  health-probe.py: |
+{{ tuple "bin/_health-probe.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   nova-api.sh: |
 {{ tuple "bin/_nova-api.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   nova-api-metadata.sh: |
diff --git a/nova/templates/daemonset-compute.yaml b/nova/templates/daemonset-compute.yaml
index 463ea72bb7..35236bdadc 100644
--- a/nova/templates/daemonset-compute.yaml
+++ b/nova/templates/daemonset-compute.yaml
@@ -180,6 +180,31 @@ spec:
             - name: LIBVIRT_CEPH_SECRET_UUID
               value: "{{ .Values.conf.ceph.secret_uuid }}"
           {{ end }}
+          readinessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - compute
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - compute
+              - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-compute.sh
           volumeMounts:
@@ -187,6 +212,10 @@ spec:
               mountPath: /tmp/nova-compute.sh
               subPath: nova-compute.sh
               readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
             - name: nova-etc
               mountPath: /etc/nova/nova.conf
               subPath: nova.conf
diff --git a/nova/templates/deployment-conductor.yaml b/nova/templates/deployment-conductor.yaml
index 33de6413c5..1e66e41932 100644
--- a/nova/templates/deployment-conductor.yaml
+++ b/nova/templates/deployment-conductor.yaml
@@ -60,6 +60,31 @@ spec:
 {{ tuple $envAll $envAll.Values.pod.resources.conductor | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
           securityContext:
             allowPrivilegeEscalation: false
+          readinessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - conductor
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - conductor
+              - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-conductor.sh
           volumeMounts:
@@ -67,6 +92,10 @@ spec:
               mountPath: /tmp/nova-conductor.sh
               subPath: nova-conductor.sh
               readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
             - name: nova-etc
               mountPath: /etc/nova/nova.conf
               subPath: nova.conf
diff --git a/nova/templates/deployment-consoleauth.yaml b/nova/templates/deployment-consoleauth.yaml
index 29832d56a6..75b66e7939 100644
--- a/nova/templates/deployment-consoleauth.yaml
+++ b/nova/templates/deployment-consoleauth.yaml
@@ -60,6 +60,31 @@ spec:
 {{ tuple $envAll $envAll.Values.pod.resources.consoleauth | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
           securityContext:
             allowPrivilegeEscalation: false
+          readinessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - consoleauth
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - consoleauth
+              - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-consoleauth.sh
           volumeMounts:
@@ -67,6 +92,10 @@ spec:
               mountPath: /tmp/nova-consoleauth.sh
               subPath: nova-consoleauth.sh
               readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
             - name: nova-etc
               mountPath: /etc/nova/nova.conf
               subPath: nova.conf
diff --git a/nova/templates/deployment-novncproxy.yaml b/nova/templates/deployment-novncproxy.yaml
index 8d187c8b88..cf9fda0243 100644
--- a/nova/templates/deployment-novncproxy.yaml
+++ b/nova/templates/deployment-novncproxy.yaml
@@ -94,6 +94,14 @@ spec:
         - name: nova-novncproxy
 {{ tuple $envAll "nova_novncproxy" | include "helm-toolkit.snippets.image" | indent 10 }}
 {{ tuple $envAll $envAll.Values.pod.resources.novncproxy | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
+          readinessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_novnc_proxy" "internal" "novnc_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
+          livenessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_novnc_proxy" "internal" "novnc_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
           command:
             - /tmp/nova-console-proxy.sh
           ports:
diff --git a/nova/templates/deployment-scheduler.yaml b/nova/templates/deployment-scheduler.yaml
index a3d46e5db0..9611d9509f 100644
--- a/nova/templates/deployment-scheduler.yaml
+++ b/nova/templates/deployment-scheduler.yaml
@@ -60,6 +60,31 @@ spec:
 {{ tuple $envAll $envAll.Values.pod.resources.scheduler | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
           securityContext:
             allowPrivilegeEscalation: false
+          readinessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - scheduler
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - scheduler
+              - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-scheduler.sh
           volumeMounts:
@@ -67,6 +92,10 @@ spec:
               mountPath: /tmp/nova-scheduler.sh
               subPath: nova-scheduler.sh
               readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
             - name: nova-etc
               mountPath: /etc/nova/nova.conf
               subPath: nova.conf
diff --git a/nova/templates/deployment-spiceproxy.yaml b/nova/templates/deployment-spiceproxy.yaml
index b026d753ea..4507bde4ce 100644
--- a/nova/templates/deployment-spiceproxy.yaml
+++ b/nova/templates/deployment-spiceproxy.yaml
@@ -94,6 +94,14 @@ spec:
         - name: nova-spiceproxy
 {{ tuple $envAll "nova_spiceproxy" | include "helm-toolkit.snippets.image" | indent 10 }}
 {{ tuple $envAll $envAll.Values.pod.resources.spiceproxy | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
+          readinessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_spice_proxy" "internal" "spice_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
+          livenessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_spice_proxy" "internal" "spice_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
           command:
             - /tmp/nova-console-proxy.sh
           ports: