From e581a79807721bf100c3c8e429474af21c28633e Mon Sep 17 00:00:00 2001 From: Matthew Heler Date: Thu, 27 Dec 2018 10:22:38 -0600 Subject: [PATCH] [CEPH] Cleanup the ceph-osd helm-chart - Split off duplicate code across multiple bash scripts into a common file. - Simplify the way journals are detected for block devices. - Clean up unused portions of the code. - Standardize the syntax across all the code. - Use sgdisk for zapping disks rather than ceph-disk. Change-Id: I13e4a89cab3ee454dd36b5cdedfa2f341bf50b87 --- ceph-osd/templates/bin/osd/_block.sh.tpl | 146 +++-------------- ceph-osd/templates/bin/osd/_common.sh.tpl | 159 +++++++++++++++++++ ceph-osd/templates/bin/osd/_directory.sh.tpl | 110 ++++--------- ceph-osd/templates/bin/osd/_init.sh.tpl | 94 ++--------- ceph-osd/templates/bin/osd/_start.sh.tpl | 13 -- ceph-osd/templates/configmap-bin.yaml | 2 + ceph-osd/templates/daemonset-osd.yaml | 20 +++ 7 files changed, 248 insertions(+), 296 deletions(-) create mode 100644 ceph-osd/templates/bin/osd/_common.sh.tpl diff --git a/ceph-osd/templates/bin/osd/_block.sh.tpl b/ceph-osd/templates/bin/osd/_block.sh.tpl index 6e78e36a1d..9cfd344025 100644 --- a/ceph-osd/templates/bin/osd/_block.sh.tpl +++ b/ceph-osd/templates/bin/osd/_block.sh.tpl @@ -16,19 +16,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/}} +source /tmp/osd-common.sh + set -ex -: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}" -: "${OSD_JOURNAL_UUID:=$(uuidgen)}" -: "${CRUSH_LOCATION:=root=default host=${HOSTNAME}}" -: "${OSD_PATH_BASE:=/var/lib/ceph/osd/${CLUSTER}}" : "${OSD_SOFT_FORCE_ZAP:=1}" -: "${OSD_JOURNAL_PARTITION:=}" - -eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))') -eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))') -eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))') -eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))') +: "${OSD_JOURNAL_DISK:=}" if [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then export OSD_DEVICE="/var/lib/ceph/osd" @@ -42,80 +35,23 @@ else export OSD_JOURNAL=$(readlink -f ${JOURNAL_LOCATION}) fi -if [[ ! -e /etc/ceph/${CLUSTER}.conf ]]; then - echo "ERROR- /etc/ceph/${CLUSTER}.conf must exist; get it from your existing mon" - exit 1 -fi - if [[ -z "${OSD_DEVICE}" ]];then echo "ERROR- You must provide a device to build your OSD ie: /dev/sdb" exit 1 fi if [[ ! -b "${OSD_DEVICE}" ]]; then - echo "ERROR- The device pointed by OSD_DEVICE ($OSD_DEVICE) doesn't exist !" + echo "ERROR- The device pointed by OSD_DEVICE ${OSD_DEVICE} doesn't exist !" exit 1 fi -# Calculate proper device names, given a device and partition number -function dev_part { - local osd_device=${1} - local osd_partition=${2} - - if [[ -L ${osd_device} ]]; then - # This device is a symlink. 
Work out it's actual device - local actual_device - actual_device=$(readlink -f "${osd_device}") - if [[ "${actual_device:0-1:1}" == [0-9] ]]; then - local desired_partition="${actual_device}p${osd_partition}" - else - local desired_partition="${actual_device}${osd_partition}" - fi - # Now search for a symlink in the directory of $osd_device - # that has the correct desired partition, and the longest - # shared prefix with the original symlink - local symdir - symdir=$(dirname "${osd_device}") - local link="" - local pfxlen=0 - for option in ${symdir}/*; do - [[ -e $option ]] || break - if [[ $(readlink -f "$option") == "$desired_partition" ]]; then - local optprefixlen - optprefixlen=$(prefix_length "$option" "$osd_device") - if [[ $optprefixlen > $pfxlen ]]; then - link=$option - pfxlen=$optprefixlen - fi - fi - done - if [[ $pfxlen -eq 0 ]]; then - >&2 echo "Could not locate appropriate symlink for partition ${osd_partition} of ${osd_device}" - exit 1 - fi - echo "$link" - elif [[ "${osd_device:0-1:1}" == [0-9] ]]; then - echo "${osd_device}p${osd_partition}" - else - echo "${osd_device}${osd_partition}" - fi -} - CEPH_DISK_OPTIONS="" CEPH_OSD_OPTIONS="" - DATA_UUID=$(blkid -o value -s PARTUUID ${OSD_DEVICE}*1) -LOCKBOX_UUID=$(blkid -o value -s PARTUUID ${OSD_DEVICE}3 || true) -JOURNAL_PART=$(dev_part ${OSD_DEVICE} 2) # watch the udev event queue, and exit if all current events are handled udevadm settle --timeout=600 -# Wait for a file to exist, regardless of the type -function wait_for_file { - timeout 10 bash -c "while [ ! 
-e ${1} ]; do echo 'Waiting for ${1} to show up' && sleep 1 ; done" -} - DATA_PART=$(dev_part ${OSD_DEVICE} 1) MOUNTED_PART=${DATA_PART} @@ -131,85 +67,41 @@ OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}" OSD_KEYRING="${OSD_PATH}/keyring" # NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing OSD_WEIGHT=0 -function crush_create_or_move { - local crush_location=${1} - ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location} || true -} -function crush_add_and_move { - local crush_failure_domain_type=${1} - local crush_failure_domain_name=${2} - local crush_location=$(echo "root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}") - crush_create_or_move "${crush_location}" - local crush_failure_domain_location_check=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}') - if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ]; then - # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations - # as create-or-move may not appropiately move them. 
- ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true - ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush move "${crush_failure_domain_name}" root=default || true - ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true - fi -} -if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then - if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then - crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}" - elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then - crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))" - else - # NOTE(supamatt): neither variables are defined then we fall back to expected default behavior - crush_create_or_move "${CRUSH_LOCATION}" - fi -else - crush_create_or_move "${CRUSH_LOCATION}" -fi +# NOTE(supamatt): add or move the OSD's CRUSH location +crush_location + if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then if [ -n "${OSD_JOURNAL}" ]; then if [ -b "${OSD_JOURNAL}" ]; then - OSD_JOURNAL_PARTITION="$(echo "${OSD_JOURNAL_PARTITION}" | sed 's/[^0-9]//g')" - if [ -z "${OSD_JOURNAL_PARTITION}" ]; then - # maybe they specified the journal as a /dev path like '/dev/sdc12': - JDEV="$(echo "${OSD_JOURNAL}" | sed 's/\(.*[^0-9]\)[0-9]*$/\1/')" - if [ -d "/sys/block/$(basename "${JDEV}")/$(basename "${OSD_JOURNAL}")" ]; then - OSD_JOURNAL="$(dev_part "${JDEV}" "$(echo "${OSD_JOURNAL}" | sed 's/.*[^0-9]\([0-9]*\)$/\1/')")" - elif [ "${OSD_DEVICE}" == "${OSD_JOURNAL}" ]; then - # journal and osd disk are on the same device. - OSD_JOURNAL="$(dev_part "${OSD_JOURNAL}" 2)" - else - # they likely supplied a bare device and prepare created partition 1. 
- OSD_JOURNAL="$(dev_part "${OSD_JOURNAL}" 1)" - fi + OSD_JOURNAL_DISK="$(readlink -f ${OSD_PATH}/journal)" + if [ -z "${OSD_JOURNAL_DISK}" ]; then + echo "ERROR: Unable to find journal device ${OSD_JOURNAL_DISK}" + exit 1 else - OSD_JOURNAL="$(dev_part "${OSD_JOURNAL}" "${OSD_JOURNAL_PARTITION}")" + OSD_JOURNAL="${OSD_JOURNAL_DISK}" fi fi if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then OSD_JOURNAL="${OSD_JOURNAL}/journal.${OSD_ID}" + wait_for_file "${OSD_JOURNAL}" + chown ceph. "${OSD_JOURNAL}" else if [ ! -b "${OSD_JOURNAL}" ]; then echo "ERROR: Unable to find journal device ${OSD_JOURNAL}" exit 1 else - wait_for_file "${OSD_JOURNAL}" - chown ceph. "${OSD_JOURNAL}" "${DATA_PART}" + chown ceph. "${OSD_JOURNAL}" fi fi else - wait_for_file "${JOURNAL_PART}" - chown ceph. "${JOURNAL_PART}" "${DATA_PART}" - OSD_JOURNAL="${JOURNAL_PART}" + wait_for_file "${OSD_JOURNAL}" + chown ceph. "${OSD_JOURNAL}" fi fi if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then - # NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running - if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then - for PG in $(ls ${OSD_PATH}/current | awk -F'_' '/head/{print $1}'); do - ceph-objectstore-tool --data-path ${OSD_PATH} --op rm-past-intervals --pgid ${PG}; - done - fi + # NOTE(supamatt): This function is a workaround to Ceph upstream bug #21142 + osd_pg_interval_fix fi if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then diff --git a/ceph-osd/templates/bin/osd/_common.sh.tpl b/ceph-osd/templates/bin/osd/_common.sh.tpl new file mode 100644 index 0000000000..58861493e9 --- /dev/null +++ b/ceph-osd/templates/bin/osd/_common.sh.tpl @@ -0,0 +1,159 @@ +#!/bin/bash + +{{/* +Copyright 2017 The Openstack-Helm Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +set -ex + +: "${CRUSH_LOCATION:=root=default host=${HOSTNAME}}" +: "${OSD_PATH_BASE:=/var/lib/ceph/osd/${CLUSTER}}" +: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}" +: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}" +: "${OSD_JOURNAL_UUID:=$(uuidgen)}" + +eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))') +eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))') +eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))') +eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))') + +if [[ $(ceph -v | egrep -q "12.2|luminous"; echo $?) -ne 0 ]]; then + echo "ERROR- need Luminous release" + exit 1 +fi + +if [ -z "${HOSTNAME}" ]; then + echo "HOSTNAME not set; This will prevent to add an OSD into the CRUSH map" + exit 1 +fi + +if [[ ! 
-e ${CEPH_CONF}.template ]]; then + echo "ERROR- ${CEPH_CONF}.template must exist; get it from your existing mon" + exit 1 +else + ENDPOINT=$(kubectl get endpoints ceph-mon -n ${NAMESPACE} -o json | awk -F'"' -v port=${MON_PORT} '/ip/{print $4":"port}' | paste -sd',') + if [[ ${ENDPOINT} == "" ]]; then + # No endpoints are available, just copy ceph.conf as-is + /bin/sh -c -e "cat ${CEPH_CONF}.template | tee ${CEPH_CONF}" || true + else + /bin/sh -c -e "cat ${CEPH_CONF}.template | sed 's/mon_host.*/mon_host = ${ENDPOINT}/g' | tee ${CEPH_CONF}" || true + fi +fi + +# Wait for a file to exist, regardless of the type +function wait_for_file { + timeout 10 bash -c "while [ ! -e ${1} ]; do echo 'Waiting for ${1} to show up' && sleep 1 ; done" +} + +function is_available { + command -v $@ &>/dev/null +} + +function crush_create_or_move { + local crush_location=${1} + ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ + osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location} || true +} + +function crush_add_and_move { + local crush_failure_domain_type=${1} + local crush_failure_domain_name=${2} + local crush_location=$(echo "root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}") + crush_create_or_move "${crush_location}" + local crush_failure_domain_location_check=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}') + if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ]; then + # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations + # as create-or-move may not appropiately move them. 
+ ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ + osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true + ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ + osd crush move "${crush_failure_domain_name}" root=default || true + ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ + osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true + fi +} + +function crush_location { + if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then + if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then + crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}" + elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then + crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))" + else + # NOTE(supamatt): neither variables are defined then we fall back to default behavior + crush_create_or_move "${CRUSH_LOCATION}" + fi + else + crush_create_or_move "${CRUSH_LOCATION}" + fi +} + +# Calculate proper device names, given a device and partition number +function dev_part { + local osd_device=${1} + local osd_partition=${2} + + if [[ -L ${osd_device} ]]; then + # This device is a symlink. 
Work out it's actual device + local actual_device=$(readlink -f "${osd_device}") + local bn=$(basename "${osd_device}") + if [[ "${actual_device:0-1:1}" == [0-9] ]]; then + local desired_partition="${actual_device}p${osd_partition}" + else + local desired_partition="${actual_device}${osd_partition}" + fi + # Now search for a symlink in the directory of $osd_device + # that has the correct desired partition, and the longest + # shared prefix with the original symlink + local symdir=$(dirname "${osd_device}") + local link="" + local pfxlen=0 + for option in ${symdir}/*; do + [[ -e $option ]] || break + if [[ $(readlink -f "${option}") == "${desired_partition}" ]]; then + local optprefixlen=$(prefix_length "${option}" "${bn}") + if [[ ${optprefixlen} > ${pfxlen} ]]; then + link=${symdir}/${option} + pfxlen=${optprefixlen} + fi + fi + done + if [[ $pfxlen -eq 0 ]]; then + >&2 echo "Could not locate appropriate symlink for partition ${osd_partition} of ${osd_device}" + exit 1 + fi + echo "$link" + elif [[ "${osd_device:0-1:1}" == [0-9] ]]; then + echo "${osd_device}p${osd_partition}" + else + echo "${osd_device}${osd_partition}" + fi +} + +function osd_pg_interval_fix { + # NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running + if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then + for PG in $(ls ${OSD_PATH}/current | awk -F'_' '/head/{print $1}'); do + ceph-objectstore-tool --data-path ${OSD_PATH} --op rm-past-intervals --pgid ${PG}; + done + fi +} + +function udev_settle { + partprobe "${OSD_DEVICE}" + # watch the udev event queue, and exit if all current events are handled + udevadm settle --timeout=600 +} + diff --git a/ceph-osd/templates/bin/osd/_directory.sh.tpl b/ceph-osd/templates/bin/osd/_directory.sh.tpl index 417bd61e66..00e504b9f4 100644 --- a/ceph-osd/templates/bin/osd/_directory.sh.tpl +++ b/ceph-osd/templates/bin/osd/_directory.sh.tpl @@ -1,32 +1,27 @@ #!/bin/bash + +{{/* +Copyright 
2017 The Openstack-Helm Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + set -ex export LC_ALL=C -: "${HOSTNAME:=$(uname -n)}" -: "${CRUSH_LOCATION:=root=default host=${HOSTNAME}}" -: "${OSD_PATH_BASE:=/var/lib/ceph/osd/${CLUSTER}}" + +source /tmp/osd-common.sh + : "${JOURNAL_DIR:=/var/lib/ceph/journal}" -: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}" - -eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))') -eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))') -eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))') -eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))') - -function is_available { - command -v $@ &>/dev/null -} -if is_available rpm; then - OS_VENDOR=redhat - source /etc/sysconfig/ceph -elif is_available dpkg; then - OS_VENDOR=ubuntu - source /etc/default/ceph -fi - -if [[ $(ceph -v | egrep -q "12.2|luminous"; echo $?) -ne 0 ]]; then - echo "ERROR- need Luminous release" - exit 1 -fi if [[ ! 
-d /var/lib/ceph/osd ]]; then echo "ERROR- could not find the osd directory, did you bind mount the OSD data directory?" @@ -34,11 +29,6 @@ if [[ ! -d /var/lib/ceph/osd ]]; then exit 1 fi -if [ -z "${HOSTNAME}" ]; then - echo "HOSTNAME not set; This will prevent to add an OSD into the CRUSH map" - exit 1 -fi - # check if anything is present, if not, create an osd and its directory if [[ -n "$(find /var/lib/ceph/osd -prune -empty)" ]]; then echo "Creating osd" @@ -56,14 +46,14 @@ if [[ -n "$(find /var/lib/ceph/osd -prune -empty)" ]]; then OSD_PATH="$OSD_PATH_BASE-$OSD_ID/" if [ -n "${JOURNAL_DIR}" ]; then - OSD_J="${JOURNAL_DIR}/journal.${OSD_ID}" + OSD_JOURNAL="${JOURNAL_DIR}/journal.${OSD_ID}" chown -R ceph. ${JOURNAL_DIR} else if [ -n "${JOURNAL}" ]; then - OSD_J=${JOURNAL} + OSD_JOURNAL=${JOURNAL} chown -R ceph. $(dirname ${JOURNAL_DIR}) else - OSD_J=${OSD_PATH%/}/journal + OSD_JOURNAL=${OSD_PATH%/}/journal fi fi # create the folder and own it @@ -74,57 +64,21 @@ if [[ -n "$(find /var/lib/ceph/osd -prune -empty)" ]]; then ceph-authtool --create-keyring ${OSD_PATH%/}/keyring --name osd.${OSD_ID} --add-key ${OSD_SECRET} OSD_KEYRING="${OSD_PATH%/}/keyring" # init data directory - ceph-osd -i ${OSD_ID} --mkfs --osd-uuid ${UUID} --mkjournal --osd-journal ${OSD_J} --setuser ceph --setgroup ceph + ceph-osd -i ${OSD_ID} --mkfs --osd-uuid ${UUID} --mkjournal --osd-journal ${OSD_JOURNAL} --setuser ceph --setgroup ceph # add the osd to the crush map # NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing OSD_WEIGHT=0 - function crush_create_or_move { - local crush_location=${1} - ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location} || true - } - function crush_add_and_move { - local crush_failure_domain_type=${1} - local crush_failure_domain_name=${2} - local crush_location=$(echo "root=default 
${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}") - crush_create_or_move "${crush_location}" - local crush_failure_domain_location_check=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}') - if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ]; then - # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations - # as create-or-move may not appropiately move them. - ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true - ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush move "${crush_failure_domain_name}" root=default || true - ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true - fi - } - if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then - if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then - crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}" - elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then - crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))" - else - # NOTE(supamatt): neither variables are defined then we fall back to default behavior - crush_create_or_move "${CRUSH_LOCATION}" - fi - else - crush_create_or_move "${CRUSH_LOCATION}" - fi + # NOTE(supamatt): add or move the OSD's CRUSH location + crush_location fi +# NOTE(supamatt): This function is a workaround to Ceph upstream bug #21142 +osd_pg_interval_fix + # create the directory and an empty Procfile mkdir -p /etc/forego/"${CLUSTER}" echo "" > 
/etc/forego/"${CLUSTER}"/Procfile -# NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running -if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then - for PG in $(ls ${OSD_PATH}/current | awk -F'_' '/head/{print $1}'); do - ceph-objectstore-tool --data-path ${OSD_PATH} --op rm-past-intervals --pgid ${PG}; - done -fi - for OSD_ID in $(ls /var/lib/ceph/osd | sed 's/.*-//'); do OSD_PATH="$OSD_PATH_BASE-$OSD_ID/" OSD_KEYRING="${OSD_PATH%/}/keyring" @@ -133,16 +87,16 @@ for OSD_ID in $(ls /var/lib/ceph/osd | sed 's/.*-//'); do chown -R ceph. ${JOURNAL_DIR} else if [ -n "${JOURNAL}" ]; then - OSD_J=${JOURNAL} + OSD_JOURNAL=${JOURNAL} chown -R ceph. $(dirname ${JOURNAL_DIR}) else - OSD_J=${OSD_PATH%/}/journal + OSD_JOURNAL=${OSD_PATH%/}/journal fi fi # log osd filesystem type FS_TYPE=`stat --file-system -c "%T" ${OSD_PATH}` echo "OSD $OSD_PATH filesystem type: $FS_TYPE" - echo "${CLUSTER}-${OSD_ID}: /usr/bin/ceph-osd --cluster ${CLUSTER} -f -i ${OSD_ID} --osd-journal ${OSD_J} -k $OSD_KEYRING" | tee -a /etc/forego/"${CLUSTER}"/Procfile + echo "${CLUSTER}-${OSD_ID}: /usr/bin/ceph-osd --cluster ${CLUSTER} -f -i ${OSD_ID} --osd-journal ${OSD_JOURNAL} -k $OSD_KEYRING" | tee -a /etc/forego/"${CLUSTER}"/Procfile done exec /usr/local/bin/forego start -f /etc/forego/"${CLUSTER}"/Procfile diff --git a/ceph-osd/templates/bin/osd/_init.sh.tpl b/ceph-osd/templates/bin/osd/_init.sh.tpl index 8891333fec..311db4ab04 100644 --- a/ceph-osd/templates/bin/osd/_init.sh.tpl +++ b/ceph-osd/templates/bin/osd/_init.sh.tpl @@ -18,26 +18,12 @@ limitations under the License. set -ex -: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}" -: "${OSD_JOURNAL_UUID:=$(uuidgen)}" +source /tmp/osd-common.sh + : "${OSD_FORCE_ZAP:=1}" -: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}" # We do not want to zap journal disk. Tracking this option seperatly. : "${JOURNAL_FORCE_ZAP:=0}" -if [[ ! 
-e ${CEPH_CONF}.template ]]; then - echo "ERROR- ${CEPH_CONF}.template must exist; get it from your existing mon" - exit 1 -else - ENDPOINT=$(kubectl get endpoints ceph-mon -n ${NAMESPACE} -o json | awk -F'"' -v port=${MON_PORT} '/ip/{print $4":"port}' | paste -sd',') - if [[ ${ENDPOINT} == "" ]]; then - # No endpoints are available, just copy ceph.conf as-is - /bin/sh -c -e "cat ${CEPH_CONF}.template | tee ${CEPH_CONF}" || true - else - /bin/sh -c -e "cat ${CEPH_CONF}.template | sed 's/mon_host.*/mon_host = ${ENDPOINT}/g' | tee ${CEPH_CONF}" || true - fi -fi - if [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then export OSD_DEVICE="/var/lib/ceph/osd" else @@ -50,54 +36,6 @@ else export OSD_JOURNAL=$(readlink -f ${JOURNAL_LOCATION}) fi - -function udev_settle { - partprobe "${OSD_DEVICE}" - # watch the udev event queue, and exit if all current events are handled - udevadm settle --timeout=600 -} - -# Calculate proper device names, given a device and partition number -function dev_part { - local OSD_DEVICE=${1} - local OSD_PARTITION=${2} - - if [[ -L ${OSD_DEVICE} ]]; then - # This device is a symlink. 
Work out it's actual device - local ACTUAL_DEVICE=$(readlink -f ${OSD_DEVICE}) - local BN=$(basename ${OSD_DEVICE}) - if [[ "${ACTUAL_DEVICE:0-1:1}" == [0-9] ]]; then - local DESIRED_PARTITION="${ACTUAL_DEVICE}p${OSD_PARTITION}" - else - local DESIRED_PARTITION="${ACTUAL_DEVICE}${OSD_PARTITION}" - fi - # Now search for a symlink in the directory of $OSD_DEVICE - # that has the correct desired partition, and the longest - # shared prefix with the original symlink - local SYMDIR=$(dirname ${OSD_DEVICE}) - local LINK="" - local PFXLEN=0 - for OPTION in $(ls $SYMDIR); do - if [[ $(readlink -f $SYMDIR/$OPTION) == $DESIRED_PARTITION ]]; then - local OPT_PREFIX_LEN=$(prefix_length $OPTION $BN) - if [[ $OPT_PREFIX_LEN > $PFXLEN ]]; then - LINK=$SYMDIR/$OPTION - PFXLEN=$OPT_PREFIX_LEN - fi - fi - done - if [[ $PFXLEN -eq 0 ]]; then - >&2 log "Could not locate appropriate symlink for partition ${OSD_PARTITION} of ${OSD_DEVICE}" - exit 1 - fi - echo "$LINK" - elif [[ "${OSD_DEVICE:0-1:1}" == [0-9] ]]; then - echo "${OSD_DEVICE}p${OSD_PARTITION}" - else - echo "${OSD_DEVICE}${OSD_PARTITION}" - fi -} - function osd_disk_prepare { if [[ -z "${OSD_DEVICE}" ]];then echo "ERROR- You must provide a device to build your OSD ie: /dev/sdb" @@ -119,7 +57,7 @@ function osd_disk_prepare { if ! parted --script ${OSD_DEVICE} print > /dev/null 2>&1; then if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_ZAP is enabled so we are zapping the device anyway" - ceph-disk -v zap ${OSD_DEVICE} + sgdisk -Z ${OSD_DEVICE} else echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird." echo "It would be too dangerous to destroy it without any notification." 
@@ -134,18 +72,18 @@ function osd_disk_prepare { if [[ "$(parted --script ${OSD_DEVICE} print | egrep '^ 1.*ceph data')" ]]; then if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then if [ -b "${OSD_DEVICE}1" ]; then - local cephFSID=`ceph-conf --lookup fsid` + local cephFSID=$(ceph-conf --lookup fsid) if [ ! -z "${cephFSID}" ]; then - local tmpmnt=`mktemp -d` + local tmpmnt=$(mktemp -d) mount ${OSD_DEVICE}1 ${tmpmnt} if [ -f "${tmpmnt}/ceph_fsid" ]; then - osdFSID=`cat "${tmpmnt}/ceph_fsid"` + osdFSID=$(cat "${tmpmnt}/ceph_fsid") umount ${tmpmnt} if [ ${osdFSID} != ${cephFSID} ]; then echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster." echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}" echo "Because OSD_FORCE_ZAP was set, we will zap this device." - ceph-disk -v zap ${OSD_DEVICE} + sgdisk -Z ${OSD_DEVICE} else echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster." echo "OSD_FORCE_ZAP is set, but will be ignored and the device will not be zapped." @@ -156,7 +94,7 @@ function osd_disk_prepare { umount ${tmpmnt} echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID." echo "Because OSD_FORCE_ZAP was set, we will zap this device." - ceph-disk -v zap ${OSD_DEVICE} + sgdisk -Z ${OSD_DEVICE} fi else echo "Unable to determine the FSID of the current cluster." @@ -182,12 +120,12 @@ function osd_disk_prepare { # we only care about journals for filestore. 
if [ -n "${OSD_JOURNAL}" ]; then if [ -b $OSD_JOURNAL ]; then - OSD_JOURNAL=`readlink -f ${OSD_JOURNAL}` - OSD_JOURNAL_PARTITION=`echo $OSD_JOURNAL_PARTITION | sed 's/[^0-9]//g'` + OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL}) + OSD_JOURNAL_PARTITION=$(echo $OSD_JOURNAL_PARTITION | sed 's/[^0-9]//g') if [ -z "${OSD_JOURNAL_PARTITION}" ]; then # maybe they specified the journal as a /dev path like '/dev/sdc12': - local JDEV=`echo ${OSD_JOURNAL} | sed 's/\(.*[^0-9]\)[0-9]*$/\1/'` - if [ -d /sys/block/`basename $JDEV`/`basename $OSD_JOURNAL` ]; then + local JDEV=$(echo ${OSD_JOURNAL} | sed 's/\(.*[^0-9]\)[0-9]*$/\1/') + if [ -d /sys/block/$(basename ${JDEV})/$(basename ${OSD_JOURNAL}) ]; then OSD_JOURNAL=$(dev_part ${JDEV} `echo ${OSD_JOURNAL} |\ sed 's/.*[^0-9]\([0-9]*\)$/\1/'`) OSD_JOURNAL_PARTITION=${JDEV} @@ -198,8 +136,8 @@ function osd_disk_prepare { fi chown ceph. ${OSD_JOURNAL} else - echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}" - echo "For better performance, consider moving your journal to a separate device" + echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}" + echo "For better performance on HDD, consider moving your journal to a separate device" fi CLI_OPTS="${CLI_OPTS} --filestore" else @@ -212,12 +150,12 @@ function osd_disk_prepare { echo "OSD_FORCE_ZAP is set, so we will erase the journal device ${OSD_JOURNAL}" if [ -z "${OSD_JOURNAL_PARTITION}" ]; then # it's a raw block device. nuke any existing partition table. - parted -s ${OSD_JOURNAL} mklabel msdos + sgdisk -Z ${OSD_JOURNAL} else # we are likely working on a partition. Just make a filesystem on # the device, as other partitions may be in use so nuking the whole # disk isn't safe. 
- mkfs -t xfs -f ${OSD_JOURNAL} + wipefs ${OSD_JOURNAL} fi fi diff --git a/ceph-osd/templates/bin/osd/_start.sh.tpl b/ceph-osd/templates/bin/osd/_start.sh.tpl index d71e445745..7b0e7cf638 100644 --- a/ceph-osd/templates/bin/osd/_start.sh.tpl +++ b/ceph-osd/templates/bin/osd/_start.sh.tpl @@ -17,19 +17,6 @@ limitations under the License. */}} set -ex -: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}" - -if [[ ! -e ${CEPH_CONF}.template ]]; then - echo "ERROR- ${CEPH_CONF}.template must exist; get it from your existing mon" - exit 1 -else - ENDPOINT=$(kubectl get endpoints ceph-mon -n ${NAMESPACE} -o json | awk -F'"' -v port=${MON_PORT} '/ip/{print $4":"port}' | paste -sd',') - if [[ ${ENDPOINT} == "" ]]; then - /bin/sh -c -e "cat ${CEPH_CONF}.template | tee ${CEPH_CONF}" || true - else - /bin/sh -c -e "cat ${CEPH_CONF}.template | sed 's/mon_host.*/mon_host = ${ENDPOINT}/g' | tee ${CEPH_CONF}" || true - fi -fi echo "LAUNCHING OSD: in ${STORAGE_TYPE%-*}:${STORAGE_TYPE#*-} mode" exec "/tmp/osd-${STORAGE_TYPE%-*}.sh" diff --git a/ceph-osd/templates/configmap-bin.yaml b/ceph-osd/templates/configmap-bin.yaml index d9b277dcf4..7da5d63e13 100644 --- a/ceph-osd/templates/configmap-bin.yaml +++ b/ceph-osd/templates/configmap-bin.yaml @@ -38,6 +38,8 @@ data: {{ tuple "bin/osd/_check.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} osd-stop.sh: | {{ tuple "bin/osd/_stop.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} + osd-common.sh: | +{{ tuple "bin/osd/_common.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} init-dirs.sh: | {{ tuple "bin/_init-dirs.sh.tpl" . 
| include "helm-toolkit.utils.template" | indent 4 }} helm-tests.sh: | diff --git a/ceph-osd/templates/daemonset-osd.yaml b/ceph-osd/templates/daemonset-osd.yaml index c8b5bd91b2..a6ac178963 100644 --- a/ceph-osd/templates/daemonset-osd.yaml +++ b/ceph-osd/templates/daemonset-osd.yaml @@ -76,6 +76,14 @@ spec: mountPath: /tmp/init-dirs.sh subPath: init-dirs.sh readOnly: true + - name: ceph-osd-bin + mountPath: /tmp/osd-common.sh + subPath: osd-common.sh + readOnly: true + - name: ceph-osd-etc + mountPath: /etc/ceph/storage.json + subPath: storage.json + readOnly: true - name: pod-var-lib-ceph mountPath: /var/lib/ceph readOnly: false @@ -149,10 +157,18 @@ spec: mountPath: /tmp/osd-init.sh subPath: osd-init.sh readOnly: true + - name: ceph-osd-bin + mountPath: /tmp/osd-common.sh + subPath: osd-common.sh + readOnly: true - name: ceph-osd-etc mountPath: /etc/ceph/ceph.conf.template subPath: ceph.conf readOnly: true + - name: ceph-osd-etc + mountPath: /etc/ceph/storage.json + subPath: storage.json + readOnly: true - name: ceph-bootstrap-osd-keyring mountPath: /var/lib/ceph/bootstrap-osd/ceph.keyring subPath: ceph.keyring @@ -252,6 +268,10 @@ spec: mountPath: /tmp/utils-checkDNS.sh subPath: utils-checkDNS.sh readOnly: true + - name: ceph-osd-bin + mountPath: /tmp/osd-common.sh + subPath: osd-common.sh + readOnly: true - name: ceph-osd-etc mountPath: /etc/ceph/storage.json subPath: storage.json