training-guides/labs/scripts/test/launch_instance.sh
Roger Luethi 6bdf7e2e47 labs: fix HTTP_EXCEPTIONS check
If HTTP_EXCEPTIONS is not 0, it is a string. In that case, the
comparison "-ne 0" results in an error and the script aborts.
Use string comparison instead.

Change-Id: I2e9311e6e0577b213281eaa7b0227ca51d59368d
2015-03-28 09:30:14 +01:00

764 lines
22 KiB
Bash
Executable File

#!/usr/bin/env bash
set -o errexit -o nounset
TOP_DIR=$(cd "$(dirname "$0")/.." && pwd)
source "$TOP_DIR/config/paths"
source "$CONFIG_DIR/credentials"
source "$LIB_DIR/functions.guest"
source "$CONFIG_DIR/demo-openstackrc.sh"
exec_logfile
indicate_current_auto
#------------------------------------------------------------------------------
# Launch a demo instance.
#------------------------------------------------------------------------------
# Packets that the instance VM sends to the Internet carry the instance's
# floating IP address as their sender address. For your instance VM to
# get Internet access, you will probably have to configure masquerading
# on your host computer.
# On Linux, turning on masquerading may look something like this:
# echo "1" > /proc/sys/net/ipv4/ip_forward
# modprobe ip_tables
# modprobe ip_conntrack
# iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE
# iptables -A FORWARD -i eth0 -o vboxnet2 -m state \
# --state RELATED,ESTABLISHED -j ACCEPT
# iptables -A FORWARD -i vboxnet2 -o eth0 -j ACCEPT
# Set this true if you have masquerading enabled to allow instance VMs access
# to the Internet. The value can be preset in the environment; the default
# assigned here is true.
: ${MASQUERADING:=true}
# Set this true if you want the instance to use the Google Public DNS name
# server; set it to false to use dnsmasq running on a node instead.
# Note that the default assigned here is true.
: ${EXT_DNS:=true}
# Name used for every instance this script boots, looks up, and deletes.
DEMO_INSTANCE_NAME=demo-instance1
# Lines starting with "SUM" are printed throughout this script; presumably
# they are collected into a test summary -- confirm with the log consumer.
echo "SUM --- BEGIN"
# Run ssh_no_chk with the osbash key from the current user's ~/.ssh directory.
# All arguments are handed through unchanged after the identity option.
function ssh_no_chk_node {
    local -r node_key="$HOME/.ssh/osbash_key"

    ssh_no_chk -i "$node_key" "$@"
}
# Print the ssh command line, then run ssh with the given arguments while
# strict host key checking and the related messages are switched off.
function ssh_no_chk {
    # Host keys are neither verified nor recorded; only errors are logged.
    local -a relaxed_opts=(
        -o "UserKnownHostsFile /dev/null"
        -o "StrictHostKeyChecking no"
        -o LogLevel=error
    )

    echo "ssh $*"
    ssh "${relaxed_opts[@]}" "$@"
}
# Work around neutron client failing with unsupported locale settings:
# if the output of "neutron --help" is exactly the locale error message,
# export LC_ALL=C for the remainder of this script.
# NOTE(review): the command substitution captures stdout only. If the client
# reports the locale error on stderr, this comparison never matches -- confirm.
if [[ "$(neutron --help)" == "unsupported locale setting" ]]; then
echo "Locale not supported on node, setting LC_ALL=C."
export LC_ALL=C
fi
# Block until a service on a remote node reports "start/running".
#   $1 - node to query through ssh_no_chk_node
#   $2 - name of the service
# The status is polled every 2 seconds. On every 150th unsuccessful poll the
# service is restarted via sudo and "<service>@<node> " is appended to the
# global SERVICE_RESTARTS.
function wait_for_service {
    local host=$1
    local svc=$2
    local polls=0

    echo -n "Node $host, service $svc:"
    while ! ssh_no_chk_node "$host" service "$svc" status \
            | grep -q "start/running"; do
        polls=$((polls + 1))
        if ((polls % 150 == 0)); then
            echo " does not seem to come up. Forcing restart."
            echo
            echo "SUM ERROR $svc on node $host not coming up."
            ssh_no_chk_node "$host" sudo service "$svc" restart
            SERVICE_RESTARTS="${SERVICE_RESTARTS:-}$svc@$host "
        fi
        sleep 2
        echo -n .
    done
    echo " up"
}
echo "Running on host: $(hostname)"

# Each remote node has to answer a single ping before its services are checked.
for node_kind in network compute; do
    echo "Checking network connection to $node_kind node."
    ping -c1 "$node_kind-mgmt"
    echo
done

# Wait for every required service, first on the network node ...
echo "Checking services on network node."
for svc_name in \
        openvswitch-switch \
        neutron-plugin-openvswitch-agent \
        neutron-l3-agent \
        neutron-dhcp-agent \
        neutron-metadata-agent; do
    wait_for_service network-mgmt "$svc_name"
done
echo

# ... then on the compute node.
echo "Checking services on compute node."
for svc_name in \
        nova-compute \
        openvswitch-switch \
        neutron-plugin-openvswitch-agent; do
    wait_for_service compute-mgmt "$svc_name"
done
echo

# The loop variables are not needed beyond this point.
unset node_kind svc_name
# Wait until "nova-manage service list" shows nova-compute as alive (":-)").
#
# While the listing contains XXX, poll every 5 seconds. If the nova-compute
# service on the compute node is not running, restart it and count the restart
# in the global NOVA_COMPUTE_RESTART.
#   $1 - optional: number of polls after which a running but still XXX
#        service is considered fatal (default: 300)
# Returns 0 once the listing no longer contains XXX. Exits the script with
# status 1 if the service is running but remains XXX after the poll limit.
function wait_for_nova_compute {
    local max_polls=${1:-300}
    local cnt=0

    if sudo nova-manage service list --service nova-compute | \
            grep -q ":-)"; then
        return 0
    fi

    echo " Waiting for nova-compute to switch from XXX to :-)."
    if ssh_no_chk_node compute-mgmt service nova-compute status | \
            grep -q "start/running"; then
        echo -n " Service is up, waiting (may take a few minutes)."
    fi

    while sudo nova-manage service list --service nova-compute | \
            grep -q XXX; do
        cnt=$((cnt + 1))
        sleep 5
        if ssh_no_chk_node compute-mgmt service nova-compute status | \
                grep -q "start/running"; then
            # -ge rather than -eq: this check is skipped on iterations where
            # the service is down, so the counter can move past the limit
            # without ever being compared while equal to it.
            if [ "$cnt" -ge "$max_polls" ]; then
                # This should never happen.
                echo
                echo "SUM ERROR nova-compute remains XXX while up."
                echo "Aborting."
                # Explicit status: a bare "exit" would pass on the success
                # status of the echo above.
                exit 1
            fi
            echo -n .
        else
            echo
            echo "SUM ERROR nova-compute on compute node has died."
            echo "Restarting nova-compute on compute node."
            ssh_no_chk_node compute-mgmt \
                sudo service nova-compute restart
            NOVA_COMPUTE_RESTART=$((${NOVA_COMPUTE_RESTART:-0} + 1))
        fi
    done
    echo
}
# Wait until nova-manage lists all nova services as alive and print how many
# seconds that took.
# Order of checks: controller services (without nova-compute), presence of
# nova-compute in the list, then wait_for_nova_compute.
function wait_for_nova_services {
    local began=$(date +%s)
    local elapsed

    echo "Checking services in sudo nova-manage service list."
    echo -n " Waiting for controller services to switch from XXX to :-)."
    # nova-compute is filtered out here, even if a custom config runs it on
    # the controller; it is checked separately below.
    until ! sudo nova-manage service list --host controller \
            | grep -v nova-compute \
            | grep -q XXX; do
        sleep 2
        echo -n .
    done
    echo

    if ! sudo nova-manage service list | grep -q nova-compute; then
        echo -n " Waiting for nova-compute to turn up in list."
        while ! sudo nova-manage service list | grep -q nova-compute; do
            sleep 2
            echo -n .
        done
        echo
    fi

    wait_for_nova_compute
    echo
    elapsed=$(($(date +%s) - began))
    echo "SUM wait for nova services: $elapsed"
}
# Wait first, report afterwards: up to this point NOVA_COMPUTE_RESTART is only
# ever incremented inside wait_for_nova_compute, which is reached through
# wait_for_nova_services. Checking the counter before the wait could never
# report a restart.
wait_for_nova_services
if [ "${NOVA_COMPUTE_RESTART:-0}" -ne 0 ]; then
    echo "SUM ERROR nova-compute restarts: $NOVA_COMPUTE_RESTART"
fi
echo "All services are ready:"
sudo nova-manage service list
echo
# Print the instance list with the current credentials, followed by the host
# list and the description of host "compute" obtained with the admin
# credentials.
function show_compute_resource_usage {
    printf '%s\n' "nova list:"
    nova list

    # The admin rc file is sourced inside a subshell, so whatever it sets
    # stays confined to that subshell.
    (
        source "$CONFIG_DIR/admin-openstackrc.sh"

        printf '%s\n' "As admin user, nova host-list:"
        nova host-list

        printf '%s\n' "As admin user, nova host-describe compute:"
        nova host-describe compute
    )
}
# Wait until "neutron agent-list" (run with admin credentials) shows no agent
# marked " xxx ", printing agent lines as they appear or change, and finally
# print the number of seconds spent waiting.
# Sorted snapshots of the listing are kept in $LOG_DIR/test-agent.list and
# $LOG_DIR/test-agent.list.new.
function wait_for_neutron_agents {
    local snapshot=$LOG_DIR/test-agent.list
    local began=$(date +%s)
    local fresh_lines

    echo -n "Waiting for agents in neutron agent-list."
    # Subshell: settings from the admin rc file do not reach the caller.
    (
        source "$CONFIG_DIR/admin-openstackrc.sh"

        neutron agent-list | sort > "$snapshot"
        # Show the agents that are already alive; no match is not an error.
        fresh_lines=$(grep " :-) " "$snapshot" || true)
        if [[ -n $fresh_lines ]]; then
            echo
            echo "$fresh_lines"
        fi

        while true; do
            neutron agent-list | sort > "$snapshot.new"
            # Lines that exist only in the newer snapshot.
            fresh_lines=$(comm -13 "$snapshot" "$snapshot.new")
            if [[ -n $fresh_lines ]]; then
                echo
                echo "$fresh_lines"
            fi
            # Finished once the previous snapshot lists no dead agent.
            grep -q " xxx " "$snapshot" || break
            mv "$snapshot.new" "$snapshot"
            sleep 1
            echo -n .
        done

        echo
        echo "All agents are ready."
        neutron agent-list
        echo
    )
    echo "SUM wait for neutron agents: $(($(date +%s) - began))"
}
# Do not continue until "neutron agent-list" shows no dead (xxx) agents.
wait_for_neutron_agents
# Poll the network node until an interface with the given name prefix shows up
# inside a network namespace, then print the summary line for that wait.
#   $1 - namespace to inspect
#   $2 - interface name prefix to look for (e.g. qr-)
#   $3 - interface label used in the messages (e.g. qr-*)
#   $4 - namespace label used in the progress message (router or DHCP)
function _wait_for_ns_interface {
    local ns=$1
    local prefix=$2
    local label=$3
    local ns_label=$4
    local cnt=0

    echo -n "Waiting for interface $label in $ns_label namespace."
    # grep -o prints the interface name it found, which ends the dot line.
    until ssh_no_chk_node network-mgmt \
            sudo ip netns exec "$ns" ip addr | \
            grep -Po "(?<=: )${prefix}.*(?=:)"; do
        cnt=$((cnt + 1))
        sleep 1
        echo -n "."
    done
    echo "SUM wait for interface $label: $cnt"
}

# Wait for the router and DHCP namespaces on the network node and for the
# qr-*, qg-* (router) and tap* (DHCP) interfaces inside them.
# Prints one "SUM wait for ..." line per item with the number of retries.
# If the DHCP namespace is still missing on the 10th retry, neutron-dhcp-agent
# is restarted once on the network node.
function check_namespaces {
    local cnt
    local nsrouter
    local nsdhcp

    echo -n "Getting router namespace."
    cnt=0
    # The namespace name is captured by the wait itself, so no second ssh
    # round trip is needed (and the name cannot change in between).
    until nsrouter=$(ssh_no_chk_node network-mgmt ip netns | grep qrouter); do
        cnt=$((cnt + 1))
        sleep 1
        echo -n "."
    done
    echo "$nsrouter"
    echo "SUM wait for router namespace: $cnt"

    echo -n "Getting DHCP namespace."
    cnt=0
    until nsdhcp=$(ssh_no_chk_node network-mgmt ip netns | grep qdhcp); do
        cnt=$((cnt + 1))
        if [ "$cnt" -eq 10 ]; then
            echo
            echo "SUM ERROR No DHCP namespace, restarting neutron-dhcp-agent."
            echo "Restarting neutron-dhcp-agent on network node."
            ssh_no_chk_node network-mgmt \
                sudo service neutron-dhcp-agent restart
        fi
        sleep 1
        echo -n "."
    done
    echo "$nsdhcp"
    echo "SUM wait for DHCP namespace: $cnt"

    _wait_for_ns_interface "$nsrouter" "qr-" "qr-*" router
    _wait_for_ns_interface "$nsrouter" "qg-" "qg-*" router
    _wait_for_ns_interface "$nsdhcp" "tap" "tap*" DHCP
}
check_namespaces

# Create a key pair unless ~/.ssh/id_rsa already exists as a regular file.
# For training cluster: no password protection on keys to make scripting
# easier
[[ -f ~/.ssh/id_rsa ]] || {
    echo "Generating an ssh key pair (saved to ~/.ssh/id_rsa*)."
    ssh-keygen -f ~/.ssh/id_rsa -N ""
}
# Make sure a 'demo-key' key pair stored in OpenStack matches the local public
# key in ~/.ssh/id_rsa.pub.
# If 'demo-key' exists but holds a different key, it is deleted. Nothing is
# added here. Reads the key material (second field) of the local public key
# file and compares it with the "Public key:" line of "nova keypair-show".
function check_demo_key {
    # Keep both keys out of the global namespace.
    local ssh_key
    local stored_key

    echo -n "Checking if 'demo-key' is already in our OpenStack environment: "
    if ! nova keypair-show demo-key >/dev/null 2>&1; then
        echo "no."
        return 0
    fi
    echo "yes."

    echo -n "Checking if the 'demo-key' key pair matches our ssh key: "
    ssh_key=$(awk '{print $2}' ~/.ssh/id_rsa.pub)
    stored_key=$(nova keypair-show demo-key | \
        awk '/^Public key: ssh-rsa/ {print $4}')
    if [ "$ssh_key" != "$stored_key" ]; then
        echo "no."
        echo "Removing the 'demo-key' from the OpenStack environment."
        nova keypair-delete demo-key
    else
        echo "yes."
    fi
}
# Drop a stored 'demo-key' that does not match our local public key.
check_demo_key
# Upload the local public key unless a 'demo-key' is (still) present.
# On success, keypair-show prints the key details to stdout here.
if ! nova keypair-show demo-key 2>/dev/null; then
echo "Adding the public key to our OpenStack environment."
nova keypair-add --pub-key ~/.ssh/id_rsa.pub demo-key
fi
echo "Verifying addition of the public key."
nova keypair-list
# The listings below are informational output only.
echo "Listing available flavors."
nova flavor-list
echo "Listing available images."
nova image-list
# Poll once per second until "neutron net-list" succeeds.
echo -n "Waiting for neutron to start."
until neutron net-list >/dev/null 2>&1; do
    sleep 1
    # -n keeps the progress dots on the line of the message above, like the
    # other wait loops in this script do.
    echo -n .
done
echo
echo "Listing available networks."
neutron net-list
# Take the second whitespace-separated field of the row containing " demo-net "
# in the net-list table as the network ID.
DEMO_NET_ID=$(neutron net-list | awk '/ demo-net / {print $2}')
echo "ID for demo-net tenant network: $DEMO_NET_ID"
echo "Listing available security groups."
nova secgroup-list
# Configure which DNS name server the subnet passes to booting instances:
# 8.8.4.4 if EXT_DNS is true, none otherwise.
if [ "$EXT_DNS" = true ]; then
echo "Setting DNS name server for subnet (passed to booting instance VMs)."
neutron subnet-update demo-subnet --dns_nameservers list=true 8.8.4.4
echo
else
echo "Clearing DNS name server for subnet (passed to booting instance VMs)."
neutron subnet-update demo-subnet --dns_nameservers action=clear
fi
echo "Settings for demo-subnet:"
neutron subnet-show demo-subnet
echo
nova list
# Delete every existing instance whose row in "nova list" contains the demo
# instance name surrounded by spaces; the second field of such a row is
# passed to "nova delete".
nova list | awk " / $DEMO_INSTANCE_NAME / {print \$2}" | while read instance; do
echo "Removing instance $DEMO_INSTANCE_NAME ($instance)."
nova delete "$instance"
done
# Poll once per second until the name no longer shows up in "nova list".
echo -n "Waiting for removed instances to disappear (may take > 1 min)."
while nova list|grep -q "$DEMO_INSTANCE_NAME"; do
sleep 1
echo -n .
done
echo
echo "There should be no $DEMO_INSTANCE_NAME instances left:"
nova list
# Log files that request_instance copies to /tmp before each launch.
NOVA_SCHED_LOG=/var/log/upstart/nova-scheduler.log
NOVA_API_LOG=/var/log/upstart/nova-api.log
# Number of "nova boot" requests made so far; incremented by request_instance.
VM_LAUNCHES=0
# Ask nova to boot a new demo instance.
# Before the request, the current nova-scheduler and nova-api logs are copied
# to /tmp. The output of "nova boot" is written to the file named by the
# global instance_info, which is set (and announced) on the first call and
# removed before being rewritten on later calls. Increments the global
# VM_LAUNCHES.
function request_instance {
    # Keep a copy of current state of nova-scheduler.log
    sudo cp -vf "$NOVA_SCHED_LOG" "$NOVA_API_LOG" /tmp

    if [ -z "${instance_info:-}" ]; then
        instance_info=$LOG_DIR/test-instance.info
        echo "Instance info: $instance_info"
    else
        rm -f "$instance_info"
    fi

    # The image name is the last component of CIRROS_URL without "-disk.img".
    local image=$(basename "$CIRROS_URL" -disk.img)
    local -a boot_args=(
        --flavor m1.tiny
        --image "$image"
        --nic net-id="$DEMO_NET_ID"
        --security-group default
        --key-name demo-key
    )

    echo "Requesting an instance."
    nova boot "${boot_args[@]}" "$DEMO_INSTANCE_NAME" > "$instance_info"
    VM_LAUNCHES=$((VM_LAUNCHES + 1))
}
# File that save_boot_log overwrites with the output of "nova console-log";
# the wait loops further down grep it for boot progress.
BOOT_LOG=$LOG_DIR/test-instance.boot
echo "Boot log: $BOOT_LOG"
# Replace $BOOT_LOG with the current output (stdout and stderr) of
# "nova console-log" for the demo instance.
# Returns the exit status of "nova console-log"; a non-zero status is also
# reported on stderr.
function save_boot_log {
    local console_rc

    rm -f "$BOOT_LOG"
    if nova console-log "$DEMO_INSTANCE_NAME" >"$BOOT_LOG" 2>&1; then
        return 0
    else
        # In the else branch, $? still holds the status of the condition.
        console_rc=$?
    fi

    echo "nova console-log returned error status $console_rc" >&2
    return $console_rc
}
# Print a fixed explanation of why launching an instance may fail shortly
# after another instance was deleted.
function explain_instance_failure {
    # Quoted delimiter: the text is printed literally, "$" signs included.
    cat << 'INSTANCE_FAILURE_TEXT'
After deleting an instance, it can take nova up to a minute to realize that
the compute node is free. Under tight space constraints, this becomes a
common source of failure.
As an admin, we could list hosts (including compute hosts):
$ nova host-list
And check resource usage in description of host 'compute':
$ nova host-describe compute
As a regular user, we would have to keep trying for up to a minute and hope
it works soon.
The fastest way to update the database, however, is to restart nova-compute
on the compute node.
INSTANCE_FAILURE_TEXT
}
# Look for the cause of an HTTP 409 console status and, where possible, wait
# for it to go away.
#
# The current nova-scheduler and nova-api logs are compared with the copies
# that request_instance saved to /tmp; "comm -13" prints the lines found only
# in the current log. The first matching cause decides what happens:
#   - scheduler lost contact with nova-compute: report it
#   - RamFilter returned 0 hosts: report it, explain, show resource usage
#   - nova-api logged an HTTP exception: poll console_status_409 up to 5
#     times, 2 seconds apart; the outcome is appended to the global
#     HTTP_EXCEPTIONS
#   - anything else: print both log differences and exit the script
#
# Returns 0 if the 409 status cleared (the current instance can be used),
# 1 if a new instance has to be requested. Exits with status 1 if no known
# cause was found.
function status_409_fixed {
echo "Checking log files for cause of failure."
if sudo comm -13 /tmp/nova-scheduler.log $NOVA_SCHED_LOG |
grep "has not been heard from in a while"; then
echo
echo "SUM ERROR Missing connection with nova-compute on compute node."
echo "(Did controller node boot after compute node?)"
echo
elif sudo comm -13 /tmp/nova-scheduler.log $NOVA_SCHED_LOG |
grep "Filter RamFilter returned 0 hosts"; then
echo "SUM ERROR Filter RamFilter returned 0 hosts"
explain_instance_failure
show_compute_resource_usage
elif sudo comm -13 /tmp/nova-api.log $NOVA_API_LOG |
grep "HTTP exception thrown:"; then
# Just waiting should be enough to fix this
echo -n "Waiting for HTTP status 409 to cure itself."
local cnt=0
until [ $cnt -eq 5 ]; do
# console_status_409 re-reads the console log on every call.
if ! console_status_409; then
# Record after how many polls the status cleared.
HTTP_EXCEPTIONS="${HTTP_EXCEPTIONS:-""}$cnt "
echo "okay"
# We can continue with this instance
return 0
fi
cnt=$((cnt + 1))
sleep 2
echo -n .
done
# Still 409 after all polls: record the failure.
HTTP_EXCEPTIONS="${HTTP_EXCEPTIONS:-""}${cnt}-fail "
echo "failed"
else
echo "Unknown reason. See for yourself."
echo "nova-scheduler.log:"
sudo comm -13 /tmp/nova-scheduler.log $NOVA_SCHED_LOG
echo "nova-api.log:"
sudo comm -13 /tmp/nova-api.log $NOVA_API_LOG
echo "SUM ABORT Unknown 409 error"
exit 1
fi
# Not fixed, need to try with new VM
return 1
}
# Succeed only if fetching the console log fails and the saved output
# contains the "is not ready (HTTP 409)" message.
function console_status_409 {
    if save_boot_log 2>/dev/null; then
        # The console log could be fetched, so there is no error status.
        return 1
    fi
    grep -q "is not ready (HTTP 409)" "$BOOT_LOG"
}
# Succeed only if fetching the console log fails and the saved output
# contains the "Unable to get console (HTTP 404)" message.
function console_status_404 {
    if save_boot_log 2>/dev/null; then
        # The console log could be fetched, so there is no error status.
        return 1
    fi
    grep -q "Unable to get console (HTTP 404)" "$BOOT_LOG"
}
# Print the sixth whitespace-separated field (the status column of the
# "nova list" table) of every row matching the demo instance name.
# The name is handed to awk as a variable and used as a regular expression;
# it is no longer pasted into the awk program text, where a "/" in the name
# would end the pattern early and break the program.
function instance_status {
    nova list | awk -v name="$DEMO_INSTANCE_NAME" '$0 ~ name {print $6}'
}
# Succeed if a "nova list" row that matches the demo instance name also
# matches the given status.
#   $1 - status to look for (e.g. BUILD, ACTIVE, ERROR)
function instance_status_is {
    local -r wanted=$1

    nova list \
        | grep "$DEMO_INSTANCE_NAME" \
        | grep -q "$wanted"
}
# Main launch loop: keep requesting instances until one reaches status ACTIVE.
while [ : ]; do
echo "Launching an instance VM."
# The messages printed by request_instance are discarded here.
request_instance > /dev/null
# Case 1: the console log cannot be fetched yet (HTTP 409).
if console_status_409; then
echo "nova console-log returned:"
cat "$BOOT_LOG"
echo
if ! status_409_fixed; then
# The 409 status did not clear: drop this instance and retry with new ones.
echo "Instance build failed."
echo "Deleting failed instance VM."
nova delete "$DEMO_INSTANCE_NAME"
echo "Checking nova-compute on the compute node."
wait_for_nova_compute
echo -n "Requesting new instance VMs until it works."
cnt=0
while [ : ]; do
request_instance >/dev/null
if console_status_409; then
nova delete "$DEMO_INSTANCE_NAME"
cnt=$((cnt + 1))
# On the fifth consecutive 409, restart nova-compute once and keep trying.
if [ $cnt -eq 5 ]; then
echo
echo "SUM ERROR console status remains 409."
echo "Restarting nova-compute on compute node."
ssh_no_chk_node compute-mgmt \
sudo service nova-compute restart
NOVA_COMPUTE_RESTART=$((${NOVA_COMPUTE_RESTART:-0} + 1))
fi
sleep 2
echo -n .
else
# Either no error or a different error
echo
break
fi
done
fi
fi
# Case 2: the console does not exist yet (HTTP 404).
if console_status_404; then
echo "nova console-log returned:"
cat "$BOOT_LOG"
echo
echo -n "Waiting for console."
# Console status 404 may persist after instance status becomes ERROR.
while console_status_404 && instance_status_is BUILD; do
sleep 1
echo -n .
done
echo
if ! console_status_404; then
echo "Console status is no longer 404."
fi
fi
echo -n "Waiting for instance to get out of BUILD status."
while instance_status_is BUILD; do
sleep 1
echo -n .
done
echo
# ERROR: delete the instance and go around again. ACTIVE: done.
# NOTE(review): any other status falls through and the loop requests another
# instance without deleting the current one -- confirm this is intended.
if instance_status_is ERROR; then
echo "Instance VM status: ERROR"
echo "Deleting failed instance VM."
nova delete "$DEMO_INSTANCE_NAME"
elif instance_status_is ACTIVE; then
echo "Instance VM status: ACTIVE."
break
fi
done
# HTTP_EXCEPTIONS is either unset or a string built by status_409_fixed, so it
# is compared as a string rather than as a number.
if [ "${HTTP_EXCEPTIONS:-0}" != "0" ]; then
echo "SUM ERROR HTTP exceptions: ${HTTP_EXCEPTIONS:-0}"
fi
# Each loop below refreshes the boot log every 2 seconds until the expected
# message appears in it.
echo -n "Waiting for DHCP discover."
until grep -q "Sending discover..." "$BOOT_LOG"; do
sleep 2
echo -n .
save_boot_log
done
echo
echo -n "Waiting for DHCP success."
until grep -q "^Lease of" "$BOOT_LOG"; do
# Count the polls spent waiting for a lease; reported in the summary line.
DHCP_WAIT=$((${DHCP_WAIT:-0} + 1))
# The instance reports that it got no lease: abort the script.
if grep "No lease, failing" "$BOOT_LOG"; then
echo "SUM ABORT DHCP wait: fail (${DHCP_WAIT:-0})"
echo "Aborting."
exit 1
fi
sleep 2
echo -n .
save_boot_log
done
echo
echo "SUM DHCP wait: ${DHCP_WAIT:-0}"
echo
echo -n "Waiting for metadata success."
until grep -q "successful after" "$BOOT_LOG"; do
# The instance reports that it could not read the metadata: abort the script.
if grep "failed to read iid from metadata" "$BOOT_LOG"; then
echo "SUM ABORT failed to get metadata"
echo "Aborting."
exit 1
fi
sleep 2
echo -n .
save_boot_log
done
echo
echo -n "Waiting for login prompt."
until grep -q "$DEMO_INSTANCE_NAME login:" "$BOOT_LOG"; do
sleep 2
echo -n .
save_boot_log
done
echo
echo "Obtaining a VNC session URL for our instance."
nova get-vnc-console "$DEMO_INSTANCE_NAME" novnc
echo

# Adding a rule may fail (the script treats that as "rule already there");
# the status is captured so that errexit does not abort the script.
echo "Permitting ICMP (ping) to our instances."
rc=0
nova secgroup-add-rule default icmp -1 -1 0.0.0.0/0 2>/dev/null || rc=$?
if [ "$rc" -ne 0 ]; then
    echo "Rule was already there."
fi
echo

echo "Permitting secure shell (SSH) access to our instances."
# Start from a clean status; otherwise a failure of the ICMP rule above would
# be reported a second time for the SSH rule.
rc=0
nova secgroup-add-rule default tcp 22 22 0.0.0.0/0 2>/dev/null || rc=$?
if [ "$rc" -ne 0 ]; then
    echo "Rule was already there."
fi
echo

echo "Verifying security-group rules."
nova secgroup-list-rules default
echo "Creating a floating IP address on the ext-net external network."
floating_ip_id=$(neutron floatingip-create ext-net | awk '/ id / {print $4}')
neutron floatingip-show "$floating_ip_id"
floating_ip=$(neutron floatingip-show "$floating_ip_id" |
awk '/ floating_ip_address / {print $4}')
echo
echo "Associating the floating IP address with our instance."
nova floating-ip-associate "$DEMO_INSTANCE_NAME" "$floating_ip"
echo
echo "Checking the status of your floating IP address."
nova list
echo
echo "Verifying network connectivity to instance VM."
ping -c1 "$floating_ip"
echo
echo "Accessing our instance using SSH from the controller node."
ssh_no_chk "cirros@$floating_ip" uptime
echo
echo "Pinging our own floating IP from inside the instance."
ssh_no_chk "cirros@$floating_ip" ping -c1 "$floating_ip"
echo
echo "Pinging IP address of controller-api."
ssh_no_chk "cirros@$floating_ip" ping -c1 "$(hostname_to_ip controller-api)"
if [ "$EXT_DNS" = true ]; then
echo "Skipping tests of dnsmasq /etc/hosts."
else
# Works only with dnsmasq using the node's /etc/hosts
echo
echo "Pinging controller-api (test local DNS name resolution)."
ssh_no_chk "cirros@$floating_ip" ping -c1 controller-api
echo
echo "Pinging network-api."
ssh_no_chk "cirros@$floating_ip" ping -c1 network-api
fi
if [ "$MASQUERADING" = true -a "$EXT_DNS" = false ]; then
echo
echo "This may work thanks to masquerading."
ssh_no_chk "cirros@$floating_ip" ping -c1 network-mgmt
echo
ssh_no_chk "cirros@$floating_ip" ping -c1 network-data
fi
# Test Internet access from inside the instance (only if MASQUERADING is true).
#
# Pings 8.8.8.8 through ssh, with up to 3 attempts one second apart, then
# pings openstack.org to test DNS name resolution.
# Summary output:
#   SUM ping Internet: 1                  - first attempt succeeded
#   SUM ERROR ping Internet: <n>          - succeeded on attempt n > 1
#   SUM ERROR ping Internet: failed (3)   - all attempts failed
# Always returns 0 when the ping to 8.8.8.8 is given up; a failing DNS ping
# is passed on to the caller as non-zero status of the ssh command.
function test_internet {
    # Number of ping attempts made; stays 0 if the test is skipped.
    local ext_ping=0

    if [ "$MASQUERADING" = true ]; then
        ext_ping=1
        echo
        echo "Pinging Google Public DNS name server."
        until ssh_no_chk "cirros@$floating_ip" ping -c1 8.8.8.8; do
            if [ "$ext_ping" -eq 3 ]; then
                echo "Failed. Giving up."
                echo "SUM ERROR ping Internet: failed ($ext_ping)"
                return 0
            fi
            echo
            echo "Trying again in 1 s."
            sleep 1
            ext_ping=$((ext_ping + 1))
        done
        echo
        echo "Testing DNS name resolution within instance VM."
        ssh_no_chk "cirros@$floating_ip" ping -c1 openstack.org
    fi

    # Only a ping that needed a retry counts as an error; a success on the
    # first attempt is reported without the ERROR tag.
    if [ "$ext_ping" -gt 1 ]; then
        echo "SUM ERROR ping Internet: $ext_ping"
    elif [ "$ext_ping" -eq 1 ]; then
        echo "SUM ping Internet: $ext_ping"
    fi
}
# Check Internet access from inside the instance (skipped without masquerading).
test_internet
# Undo the name server setting that was applied to demo-subnet when EXT_DNS
# is true.
if [ "$EXT_DNS" = true ]; then
echo
echo "Removing DNS name servers from subnet."
neutron subnet-update demo-subnet --dns_nameservers action=clear
fi
echo
echo "Summary"
echo "======="
# SERVICE_RESTARTS is only set if wait_for_service had to restart something;
# "-" is printed otherwise.
echo "SUM service restarts: ${SERVICE_RESTARTS:--}"
echo "SUM instance launches: $VM_LAUNCHES"
echo "SUM END"
echo
# Final hint for the user on how to reach the instance that was launched.
echo "Try this, it should work:"
echo "Command: 'ssh cirros@$floating_ip' [ password: 'cubswin:)' ]"