Add dynamic retention policies to curator

The curator retention policies will now query the storage nodes within
a given deployment and set a suitable index retention policy based on
the total amount of storage each index is assumed to produce every day.
To minimize the storage required and optimize search performance,
several actions are now taken:

* Indexes will be shrunk after a quarter of their retention time.
* Indexes will be deleted should they exceed the retention time.

Change-Id: I8bf548620b5404d25deaadba8fda93452ef64fa0
Signed-off-by: Kevin Carter <kevin.carter@rackspace.com>
This commit is contained in:
Kevin Carter 2018-07-09 13:58:23 -05:00 committed by Kevin Carter (cloudnull)
parent 316f527243
commit b6a9a6fc7a
7 changed files with 193 additions and 165 deletions

View File

@ -16,6 +16,10 @@
body: "{{ item.index_options | to_json }}"
status_code: 200,400
body_format: json
register: elk_indexes
until: elk_indexes is success
retries: 3
delay: 5
with_items:
- name: "osprofiler-notifications"
index_options:

View File

@ -16,6 +16,7 @@
- name: Install Curator
hosts: "elastic-logstash"
become: true
gather_facts: false
vars:
haproxy_ssl: false
@ -25,6 +26,47 @@
environment: "{{ deployment_environment_variables | default({}) }}"
pre_tasks:
- include_tasks: common_task_data_node_hosts.yml
tags:
- always
# Fetch filesystem statistics for every data node from the local Elasticsearch
# API. Nodes are addressed by their ansible_host value. The request runs once
# for the whole play and is retried up to three times, five seconds apart.
# NOTE(review): data_nodes is presumably provided by
# common_task_data_node_hosts.yml -- confirm.
- name: Query es storage
  run_once: true
  uri:
    method: GET
    url: "http://127.0.0.1:9200/_nodes/{{ (data_nodes | map('extract', hostvars, 'ansible_host') | list) | join(',') }}/stats/fs"
  register: elk_data
  retries: 3
  delay: 5
  until: elk_data is success
# Sum fs.total.total_in_bytes across all nodes in the stats response, then
# reduce the byte count with two integer divisions by 1024.
- name: Set available storage fact
  set_fact:
    es_total_available_storage: "{{ ((elk_data['json']['nodes'].values() | list) | map(attribute='fs.total.total_in_bytes') | list | sum) // 1024 // 1024 }}"
# Reserve a quarter of the total storage as a buffer (rounded to an integer).
- name: Set assumed buffer storage fact
  set_fact:
    es_assumed_buffer_storage: "{{ ((es_total_available_storage | int) * 0.25) | round | int }}"
# es_expected_storage: sum of all per-beat host counts multiplied by the
#   storage constant.
# es_usable_buffer_storage: total storage minus the reserved buffer.
- name: Set usable buffer storage fact(s)
  set_fact:
    es_expected_storage: "{{ ((elastic_beat_retention_policy_hosts.values() | map('int') | list) | sum) * (elastic_beat_storage_constant | int) }}"
    es_usable_buffer_storage: "{{ (es_total_available_storage | int) - (es_assumed_buffer_storage | int) }}"
# Split the usable storage evenly across the data nodes (integer division).
- name: Set buffer storage fact
  set_fact:
    es_assumed_usable_storage_per_node: "{{ (es_usable_buffer_storage | int) // (data_nodes | length | int) }}"
# Pick the multiplier used when computing retention: when the usable storage
# is smaller than the expected storage the storage constant is doubled,
# otherwise the constant is used unchanged.
- name: Set the storage multiplier
  set_fact:
    es_storage_multiplier: "{{ ((es_usable_buffer_storage | int) < (es_expected_storage | int)) | ternary(((elastic_beat_storage_constant | int) * 2), elastic_beat_storage_constant | int) }}"
# Generate an "elastic_<beat>_retention" fact (in days) for every beat listed
# in elastic_beat_retention_policy_hosts that has no retention value defined
# for this host already.
#
#   retention = usable storage per node // (host count * storage multiplier)
#
# The host count is clamped to at least 1 so a beat with zero hosts cannot
# cause a division by zero, and the result is clamped to at least 1 day so the
# integer division can never produce a zero-day retention.
- name: Set retention facts
  set_fact:
    "elastic_{{ item.key }}_retention": "{{ [((es_assumed_usable_storage_per_node | int) // (([(item.value | int), 1] | max) * (es_storage_multiplier | int))), 1] | max }}"
  when:
    - hostvars[inventory_hostname]["elastic_" + item.key + "_retention"] is undefined
  with_dict: "{{ elastic_beat_retention_policy_hosts }}"
- name: Ensure virtualenv is installed
apt:
name: "{{ item }}"

View File

@ -14,8 +14,8 @@
# limitations under the License.
# Top-level deployment: imports each component playbook in the order listed.
- import_playbook: installElastic.yml
# NOTE(review): installCurator.yml is imported here and again after
# installLogstash.yml below -- presumably only one import is intended; confirm
# which position is correct before removing the other.
- import_playbook: installCurator.yml
- import_playbook: installLogstash.yml
- import_playbook: installCurator.yml
- import_playbook: installKibana.yml
- import_playbook: installAPMserver.yml
- import_playbook: createElasticIndexes.yml

View File

@ -13,156 +13,112 @@
# See the License for the specific language governing permissions and
# limitations under the License.
{#
  Static curator delete actions, one per index prefix.
  Each (prefix, retention) pair renders a numbered delete_indices action that
  removes indices whose name-embedded date is older than the retention value
  (in days). The description is rendered from the same variable as unit_count
  so the two can never disagree.
#}
{% set delete_policies = [
     ('logstash', elastic_logstash_retention),
     ('apm', elastic_apm_retention),
     ('auditbeat', elastic_auditbeat_retention),
     ('filebeat', elastic_filebeat_retention),
     ('heartbeat', elastic_heartbeat_retention),
     ('journalbeat', elastic_journalbeat_retention),
     ('metricbeat', elastic_metricbeat_retention),
     ('packetbeat', elastic_packetbeat_retention)
   ] %}
actions:
{% for prefix, retention in delete_policies %}
  {{ loop.index }}:
    action: delete_indices
    description: >-
      Delete indices older than {{ retention }} days (based on index name), for
      {{ prefix }}- prefixed indices. Ignore the error if the filter does not
      result in an actionable list of indices (ignore_empty_list) and exit
      cleanly.
    options:
      ignore_empty_list: true
      disable_action: false
    filters:
      - filtertype: pattern
        kind: prefix
        value: {{ prefix }}-
      - filtertype: age
        source: name
        direction: older
        timestring: '%Y.%m.%d'
        unit: days
        unit_count: {{ retention }}
{% endfor %}
{#
  Render the curator action file.

  For every beat in elastic_beat_retention_policy_hosts two actions are
  generated, in this order:
    1. delete_indices - prune indices whose name-embedded date is older than
       the beat's retention (elastic_<beat>_retention, in days).
    2. shrink - shrink indices created more than a quarter of the retention
       ago down to a single shard using best_compression.
  The actions are then numbered from 1 and dumped as YAML under "actions".

  Every dict/list literal is built separately for each action on purpose:
  reusing one object in two places could make the YAML dumper emit
  anchors/aliases instead of plain mappings.
-#}
{% set action_items = [] -%}
{% for key in elastic_beat_retention_policy_hosts.keys() -%}
{%   set index_retention = hostvars[inventory_hostname]['elastic_' + key + '_retention'] | int -%}
{#   A quarter of a retention below 4 days floors to 0, which would match
     every index including the one currently being written; clamp to 1. -#}
{%   set shrink_age = [index_retention // 4, 1] | max -%}
{#   Delete action: the description reports the same value used by the filter. -#}
{%   set delete_indices = {
       'action': 'delete_indices',
       'description': 'Prune indices for ' + key + ' after ' ~ index_retention ~ ' days.',
       'options': {
         'ignore_empty_list': true,
         'disable_action': false
       },
       'filters': [
         {
           'filtertype': 'pattern',
           'kind': 'prefix',
           'value': key + '-'
         },
         {
           'filtertype': 'age',
           'source': 'name',
           'direction': 'older',
           'timestring': '%Y.%m.%d',
           'unit': 'days',
           'unit_count': index_retention
         }
       ]
     }
-%}
{%   set _ = action_items.append(delete_indices) -%}
{#   Shrink action: masters are permitted as shrink targets only when there are
     fewer master nodes than data nodes; kibana hosts are always excluded
     (an absent kibana group is treated as empty). -#}
{%   set shrink_indices = {
       'action': 'shrink',
       'description': 'Shrink ' + key + ' indices older than ' ~ shrink_age ~ ' days',
       'options': {
         'disable_action': false,
         'ignore_empty_list': true,
         'shrink_node': 'DETERMINISTIC',
         'node_filters': {
           'permit_masters': ((master_nodes | length) < (data_nodes | length)),
           'exclude_nodes': (groups['kibana'] | default([]) | map('extract', hostvars, 'ansible_host') | list)
         },
         'number_of_shards': 1,
         'number_of_replicas': 1,
         'shrink_suffix': '-shrink',
         'copy_aliases': true,
         'delete_after': true,
         'post_allocation': {
           'allocation_type': 'include',
           'key': 'node_tag',
           'value': 'cold'
         },
         'wait_for_active_shards': 1,
         'extra_settings': {
           'settings': {
             'index.codec': 'best_compression'
           }
         },
         'wait_for_completion': true,
         'wait_for_rebalance': true,
         'wait_interval': 9,
         'max_wait': -1
       },
       'filters': [
         {
           'filtertype': 'pattern',
           'kind': 'prefix',
           'value': key + '-'
         },
         {
           'filtertype': 'age',
           'source': 'creation_date',
           'direction': 'older',
           'unit': 'days',
           'unit_count': shrink_age
         }
       ]
     }
-%}
{%   set _ = action_items.append(shrink_indices) -%}
{% endfor -%}
{# Number the collected actions starting at 1, as curator expects. -#}
{% set actions = {} -%}
{% for action_item in action_items -%}
{%   set _ = actions.update({loop.index: action_item}) -%}
{% endfor -%}
{# Render all actions -#}
{% set curator_actions = {'actions': actions} -%}
{{ curator_actions | to_nice_yaml(indent=2) }}

View File

@ -28,7 +28,7 @@ appender.rolling.strategy.action.basepath = ${sys:es.logs.base_path}
# Rolled log files matching the cluster-name glob are selected by name and then
# by last-modified age.
appender.rolling.strategy.action.condition.type = IfFileName
appender.rolling.strategy.action.condition.glob = ${sys:es.logs.cluster_name}-*
appender.rolling.strategy.action.condition.nested_condition.type = IfLastModified
# Age threshold in days. Only one definition of this key is kept: it falls back
# to 1 day when elastic_logstash_retention is not defined, so rendering cannot
# fail on an undefined variable.
appender.rolling.strategy.action.condition.nested_condition.age = {{ elastic_logstash_retention | default(1) }}D
rootLogger.level = info

View File

@ -1011,8 +1011,11 @@ filebeat.prospectors:
# Make sure no file is defined twice as this can lead to unexpected behaviour.
paths:
- /var/log/beats/*.log
- /openstack/log/*/beats/*.log
- /var/log/curator/curator
- /openstack/log/*/curator/curator
- /var/log/elasticsearch/*.log
- /openstack/log/*/elasticsearch/*.log
# Optional additional fields. These fields can be freely picked
# to add additional information to the crawled log files for filtering

View File

@ -13,16 +13,39 @@ elastic_hap_port: 9201
cluster_name: openstack_elk
node_name: ${HOSTNAME}
# Elastic curator variables.
# Static retention values, one per index prefix. All values are in days.
elastic_logstash_retention: 14
elastic_apm_retention: 3
elastic_auditbeat_retention: 7
elastic_filebeat_retention: 7
elastic_heartbeat_retention: 7
elastic_journalbeat_retention: 14
elastic_metricbeat_retention: 3
elastic_packetbeat_retention: 3
### Elastic curator variables
## Default retention policy options. All retention options are in days.
# elastic_logstash_retention: 1
# elastic_apm_retention: 1
# elastic_auditbeat_retention: 1
# elastic_filebeat_retention: 1
# elastic_heartbeat_retention: 1
# elastic_journalbeat_retention: 1
# elastic_metricbeat_retention: 1
# elastic_packetbeat_retention: 1
# This is used to calculate the storage a beat could generate per node, per day.
# This constant is used as a multiplier. If the expected storage is larger than
# the actual available storage after the buffer is calculated, the multiplier
# will be doubled, thereby cutting the potential storage days in half.
elastic_beat_storage_constant: 512
## If any retention policy option is undefined, a dynamic fact will be generated
## for it. The fact is calculated per node, per index, using the storage
## constant and the number of hosts where a given collector is expected to be
## deployed. The equation takes the total storage reported by the ES data
## nodes, subtracts a 25% buffer, and divides the remainder by the total number
## of data nodes. That per-node value is then divided by the number of hosts
## assumed to be a beat target multiplied by the storage constant.
# Number of hosts assumed to run each collector, keyed by index prefix. The
# keys are also used to build the "elastic_<key>_retention" variable names.
# A group that is missing or empty counts as a single host, so the value is
# always at least 1 and can safely be used as a divisor.
elastic_beat_retention_policy_hosts:
  logstash: "{{ [(groups['elastic-logstash'] | default([]) | length), 1] | max }}"
  apm: "{{ [(groups['apm-server'] | default([]) | length), 1] | max }}"
  auditbeat: "{{ [(groups['hosts'] | default([]) | length), 1] | max }}"
  filebeat: "{{ [(groups['hosts'] | default([]) | length), 1] | max }}"
  heartbeat: "{{ [(groups['utility_all'] | default([]) | length), 1] | max }}"
  journalbeat: "{{ [(groups['all'] | default([]) | length), 1] | max }}"
  metricbeat: "{{ [(groups['all'] | default([]) | length), 1] | max }}"
  packetbeat: "{{ [(groups['hosts'] | default([]) | length), 1] | max }}"
# This is the URL external services can use to communicate with the
# elasticsearch cluster.