Fix ceph-client helm test
This patch resolves a helm test problem where the test failed if it found a PG in the "activating" state. The test could also encounter a number of other transient states, such as premerge or unknown, which would likewise cause it to fail. Note that if these transient PG states persist for more than 3 minutes, the helm test fails. Change-Id: I071bcfedf7e4079e085c2f72d2fbab3adc0b027c
This commit is contained in:
parent
43226de6e3
commit
167b9eb1a8
@ -15,6 +15,6 @@ apiVersion: v1
|
|||||||
appVersion: v1.0.0
|
appVersion: v1.0.0
|
||||||
description: OpenStack-Helm Ceph Client
|
description: OpenStack-Helm Ceph Client
|
||||||
name: ceph-client
|
name: ceph-client
|
||||||
version: 0.1.12
|
version: 0.1.13
|
||||||
home: https://github.com/ceph/ceph-client
|
home: https://github.com/ceph/ceph-client
|
||||||
...
|
...
|
||||||
|
@ -261,7 +261,10 @@ function check_pgs() {
|
|||||||
# Not a critical error - yet
|
# Not a critical error - yet
|
||||||
pgs_transitioning=true
|
pgs_transitioning=true
|
||||||
else
|
else
|
||||||
ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v "active" | grep -B1 '"state":' > ${inactive_pgs_file} || true
|
# Examine the PGs that have non-active states. Consider those PGs that
|
||||||
|
# are in a "premerge" state to be similar to active. "premerge" PGs may
|
||||||
|
# stay in that state for several minutes, and this is considered ok.
|
||||||
|
ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v -E "active|premerge" | grep -B1 '"state":' > ${inactive_pgs_file} || true
|
||||||
|
|
||||||
# If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
|
# If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
|
||||||
inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
|
inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
|
||||||
@ -270,6 +273,7 @@ function check_pgs() {
|
|||||||
|
|
||||||
echo "Very likely the cluster is rebalancing or recovering some PG's. Checking..."
|
echo "Very likely the cluster is rebalancing or recovering some PG's. Checking..."
|
||||||
|
|
||||||
|
# Check for PGs that are down. These are critical errors.
|
||||||
down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
|
down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
|
||||||
if [[ ${#down_pgs[*]} -gt 0 ]]; then
|
if [[ ${#down_pgs[*]} -gt 0 ]]; then
|
||||||
# Some PGs could be down. This is really bad situation and test must fail.
|
# Some PGs could be down. This is really bad situation and test must fail.
|
||||||
@ -279,23 +283,32 @@ function check_pgs() {
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
non_peer_recover_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E 'peer|recover' || true`)
|
# Check for PGs that are in some transient state due to rebalancing,
|
||||||
if [[ ${#non_peer_recover_pgs[*]} -gt 0 ]]; then
|
# peering or backfilling. If we see other states which are not in the
|
||||||
|
# following list of states, then we likely have a problem and need to
|
||||||
|
# exit.
|
||||||
|
transient_states='peer|recover|activating|creating|unknown'
|
||||||
|
non_transient_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E "${transient_states}" || true`)
|
||||||
|
if [[ ${#non_transient_pgs[*]} -gt 0 ]]; then
|
||||||
# Some PGs could be inactive and not peering. Better we fail.
|
# Some PGs could be inactive and not peering. Better we fail.
|
||||||
echo "We are unsure what's happening: we don't have down/stuck PGs,"
|
echo "We don't have down/stuck PGs, but we have some inactive pgs that"
|
||||||
echo "but we have some inactive pgs that are not peering/recover: "
|
echo "are not in the list of allowed transient states: "
|
||||||
pg_list=(`sed -n '/recover\|peer/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
|
pg_list=(`sed -n '/peer\|recover\|activating\|creating\|unknown/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
|
||||||
echo ${pg_list[*]}
|
echo ${pg_list[*]}
|
||||||
|
echo ${non_transient_pgs[*]}
|
||||||
# Critical error. Fail/exit the script
|
# Critical error. Fail/exit the script
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
peer_recover_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E 'peer|recover' | awk -F "\"" '/pgid/{print $4}'`)
|
# Check and note which PGs are in a transient state. This script
|
||||||
if [[ ${#peer_recover_pgs[*]} -gt 0 ]]; then
|
# will allow these transient states for a period of time
|
||||||
|
# (time_between_retries * max_retries seconds).
|
||||||
|
transient_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E "${transient_states}" | awk -F "\"" '/pgid/{print $4}'`)
|
||||||
|
if [[ ${#transient_pgs[*]} -gt 0 ]]; then
|
||||||
# Some PGs are not in an active state but peering and/or cluster is recovering
|
# Some PGs are not in an active state but peering and/or cluster is recovering
|
||||||
echo "Some PGs are peering and/or cluster is recovering: "
|
echo "Some PGs are peering and/or cluster is recovering: "
|
||||||
echo ${peer_recover_pgs[*]}
|
echo ${transient_pgs[*]}
|
||||||
echo "This is normal but will wait a while to verify the PGs are not stuck in peering."
|
echo "This is normal but will wait a while to verify the PGs are not stuck in a transient state."
|
||||||
# not critical, just wait
|
# not critical, just wait
|
||||||
pgs_transitioning=true
|
pgs_transitioning=true
|
||||||
fi
|
fi
|
||||||
|
@ -13,4 +13,5 @@ ceph-client:
|
|||||||
- 0.1.10 Separate pool quotas from pg_num calculations
|
- 0.1.10 Separate pool quotas from pg_num calculations
|
||||||
- 0.1.11 enhance logic to enable and disable the autoscaler
|
- 0.1.11 enhance logic to enable and disable the autoscaler
|
||||||
- 0.1.12 Disable autoscaling before pools are created
|
- 0.1.12 Disable autoscaling before pools are created
|
||||||
|
- 0.1.13 Fix ceph-client helm test
|
||||||
...
|
...
|
||||||
|
Loading…
x
Reference in New Issue
Block a user