swift/swift/obj/auditor.py
2010-08-20 00:42:38 +00:00

230 lines
9.5 KiB
Python

# Copyright (c) 2010 OpenStack, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import socket
import time
from hashlib import md5
from random import choice, random
from urllib import quote
from eventlet import Timeout
from swift.obj import server as object_server
from swift.common.bufferedhttp import http_connect
from swift.common.exceptions import ConnectionTimeout
from swift.common.ring import Ring
from swift.common.utils import get_logger, renamer
from swift.common.exceptions import AuditException
class ObjectAuditor(object):
"""Audit objects."""
def __init__(self, conf):
self.logger = get_logger(conf)
self.devices = conf.get('devices', '/srv/node')
self.mount_check = conf.get('mount_check', 'true').lower() in \
('true', 't', '1', 'on', 'yes', 'y')
self.interval = int(conf.get('interval', 1800))
swift_dir = conf.get('swift_dir', '/etc/swift')
self.container_ring_path = os.path.join(swift_dir, 'container.ring.gz')
self.container_ring = None
self.node_timeout = int(conf.get('node_timeout', 10))
self.conn_timeout = float(conf.get('conn_timeout', 0.5))
self.passes = 0
self.quarantines = 0
self.errors = 0
self.container_passes = 0
self.container_failures = 0
self.container_errors = 0
def get_container_ring(self):
"""
Get the container ring, loading it if neccesary.
:returns: container ring
"""
if not self.container_ring:
self.logger.debug(
'Loading container ring from %s' % self.container_ring_path)
self.container_ring = Ring(self.container_ring_path)
return self.container_ring
def audit_forever(self): # pragma: no cover
"""Run the object audit until stopped."""
reported = time.time()
time.sleep(random() * self.interval)
while True:
begin = time.time()
# read from container ring to ensure it's fresh
self.get_container_ring().get_nodes('')
for device in os.listdir(self.devices):
if self.mount_check and not \
os.path.ismount(os.path.join(self.devices, device)):
self.logger.debug(
'Skipping %s as it is not mounted' % device)
continue
self.object_audit(device)
if time.time() - reported >= 3600: # once an hour
self.logger.info(
'Since %s: Locally: %d passed audit, %d quarantined, %d '
'errors Remote audits with containers: %s passed audit, '
'%s failed audit, %s errors' %
(time.ctime(reported), self.passes, self.quarantines,
self.errors, self.container_passes,
self.container_failures, self.container_errors))
reported = time.time()
self.passes = 0
self.quarantines = 0
self.errors = 0
self.container_passes = 0
self.container_failures = 0
self.container_errors = 0
elapsed = time.time() - begin
if elapsed < self.interval:
time.sleep(self.interval - elapsed)
def audit_once(self):
"""Run the object audit once."""
self.logger.info('Begin object audit "once" mode')
begin = time.time()
for device in os.listdir(self.devices):
if self.mount_check and \
not os.path.ismount(os.path.join(self.devices, device)):
self.logger.debug(
'Skipping %s as it is not mounted' % device)
continue
self.object_audit(device)
elapsed = time.time() - begin
self.logger.info(
'Object audit "once" mode completed: %.02fs' % elapsed)
def object_audit(self, device):
"""Walk the device, and audit any objects found."""
datadir = os.path.join(self.devices, device, object_server.DATADIR)
if not os.path.exists(datadir):
return
name = None
partition = None
attempts = 100
while not name and attempts:
attempts -= 1
try:
partition = choice(os.listdir(datadir))
fpath = os.path.join(datadir, partition)
if not os.path.isdir(fpath):
continue
suffix = choice(os.listdir(fpath))
fpath = os.path.join(fpath, suffix)
if not os.path.isdir(fpath):
continue
hsh = choice(os.listdir(fpath))
fpath = os.path.join(fpath, hsh)
if not os.path.isdir(fpath):
continue
except IndexError:
continue
for fname in sorted(os.listdir(fpath), reverse=True):
if fname.endswith('.ts'):
break
if fname.endswith('.data'):
name = object_server.read_metadata(
os.path.join(fpath, fname))['name']
break
if not name:
return
_, account, container, obj = name.split('/', 3)
df = object_server.DiskFile(self.devices, device, partition, account,
container, obj, keep_data_fp=True)
try:
if os.path.getsize(df.data_file) != \
int(df.metadata['Content-Length']):
raise AuditException('Content-Length of %s does not match '
'file size of %s' % (int(df.metadata['Content-Length']),
os.path.getsize(df.data_file)))
etag = md5()
for chunk in df:
etag.update(chunk)
etag = etag.hexdigest()
if etag != df.metadata['ETag']:
raise AuditException("ETag of %s does not match file's md5 of "
"%s" % (df.metadata['ETag'], etag))
except AuditException, err:
self.quarantines += 1
self.logger.error('ERROR Object %s failed audit and will be '
'quarantined: %s' % (df.datadir, err))
renamer(df.datadir, os.path.join(self.devices, device,
'quarantined', 'objects', os.path.basename(df.datadir)))
return
except:
self.errors += 1
self.logger.exception('ERROR Trying to audit %s' % df.datadir)
return
self.passes += 1
found = False
good_response = False
results = []
part, nodes = self.get_container_ring().get_nodes(account, container)
for node in nodes:
try:
with ConnectionTimeout(self.conn_timeout):
conn = http_connect(node['ip'], node['port'],
node['device'], part, 'GET',
'/%s/%s' % (account, container),
query_string='prefix=%s' % quote(obj))
with Timeout(self.node_timeout):
resp = conn.getresponse()
body = resp.read()
if 200 <= resp.status <= 299:
for oname in body.split('\n'):
if oname == obj:
found = True
break
if found:
break
else:
results.append('%s:%s/%s %s %s = %s' % (node['ip'],
node['port'], node['device'], resp.status,
resp.reason, repr(body)))
else:
results.append('%s:%s/%s %s %s' %
(node['ip'], node['port'], node['device'],
resp.status, resp.reason))
except socket.error, err:
results.append('%s:%s/%s Socket Error: %s' % (node['ip'],
node['port'], node['device'], err))
except ConnectionTimeout:
results.append('%(ip)s:%(port)s/%(device)s ConnectionTimeout' %
node)
except Timeout:
results.append('%(ip)s:%(port)s/%(device)s Timeout' % node)
except Exception, err:
self.logger.exception('ERROR With remote server '
'%(ip)s:%(port)s/%(device)s' % node)
results.append('%s:%s/%s Exception: %s' % (node['ip'],
node['port'], node['device'], err))
if found:
self.container_passes += 1
self.logger.debug('Audit passed for %s %s' % (name, df.datadir))
else:
if good_response:
self.container_failures += 1
else:
self.container_errors += 1
self.logger.error('ERROR Could not find object %s %s on any of '
'the primary container servers it should be on: %s' % (name,
df.datadir, results))