Add dynamic retention policies to curator
The curator retention policies will now query the storage nodes within a given deployment and set a suitable index retention policy based on the total amount of storage each index is assumed to produce every day. To minimize the storage required and optimize search performance, the following actions are now taken: * Indexes will be shrunk after a quarter of their retention time. * Indexes will be deleted once they exceed the retention time. Change-Id: I8bf548620b5404d25deaadba8fda93452ef64fa0 Signed-off-by: Kevin Carter <kevin.carter@rackspace.com>
This commit is contained in:
parent
316f527243
commit
b6a9a6fc7a
@ -16,6 +16,10 @@
|
||||
body: "{{ item.index_options | to_json }}"
|
||||
status_code: 200,400
|
||||
body_format: json
|
||||
register: elk_indexes
|
||||
until: elk_indexes is success
|
||||
retries: 3
|
||||
delay: 5
|
||||
with_items:
|
||||
- name: "osprofiler-notifications"
|
||||
index_options:
|
||||
|
@ -16,6 +16,7 @@
|
||||
- name: Install Curator
|
||||
hosts: "elastic-logstash"
|
||||
become: true
|
||||
gather_facts: false
|
||||
vars:
|
||||
haproxy_ssl: false
|
||||
|
||||
@ -25,6 +26,47 @@
|
||||
environment: "{{ deployment_environment_variables | default({}) }}"
|
||||
|
||||
pre_tasks:
|
||||
- include_tasks: common_task_data_node_hosts.yml
|
||||
tags:
|
||||
- always
|
||||
|
||||
- name: Query es storage
|
||||
uri:
|
||||
url: "http://127.0.0.1:9200/_nodes/{{ (data_nodes | map('extract', hostvars, 'ansible_host') | list) | join(',') }}/stats/fs"
|
||||
method: GET
|
||||
register: elk_data
|
||||
until: elk_data is success
|
||||
retries: 3
|
||||
delay: 5
|
||||
run_once: true
|
||||
|
||||
- name: Set available storage fact
|
||||
set_fact:
|
||||
es_total_available_storage: "{{ ((elk_data['json']['nodes'].values() | list) | map(attribute='fs.total.total_in_bytes') | list | sum) // 1024 // 1024 }}"
|
||||
|
||||
- name: Set assumed buffer storage fact
|
||||
set_fact:
|
||||
es_assumed_buffer_storage: "{{ ((es_total_available_storage | int) * 0.25) | round | int }}"
|
||||
|
||||
- name: Set usable buffer storage fact(s)
|
||||
set_fact:
|
||||
es_usable_buffer_storage: "{{ (es_total_available_storage | int) - (es_assumed_buffer_storage | int) }}"
|
||||
es_expected_storage: "{{ ((elastic_beat_retention_policy_hosts.values() | map('int') | list) | sum) * (elastic_beat_storage_constant | int) }}"
|
||||
|
||||
- name: Set buffer storage fact
|
||||
set_fact:
|
||||
es_assumed_usable_storage_per_node: "{{ (es_usable_buffer_storage | int) // (data_nodes | length | int) }}"
|
||||
|
||||
- name: Set storage the mulitplier
|
||||
set_fact:
|
||||
es_storage_multiplier: "{{ ((es_usable_buffer_storage | int) < (es_expected_storage | int)) | ternary(((elastic_beat_storage_constant | int) * 2), elastic_beat_storage_constant | int) }}"
|
||||
|
||||
- name: Set retention facts
|
||||
set_fact: "elastic_{{ item.key }}_retention={{ (es_assumed_usable_storage_per_node | int) // ((item.value | int) * (es_storage_multiplier | int)) }}"
|
||||
when:
|
||||
- hostvars[inventory_hostname]["elastic_" + item.key + "_retention"] is undefined
|
||||
with_dict: "{{ elastic_beat_retention_policy_hosts }}"
|
||||
|
||||
- name: Ensure virtualenv is installed
|
||||
apt:
|
||||
name: "{{ item }}"
|
||||
|
@ -14,8 +14,8 @@
|
||||
# limitations under the License.
|
||||
|
||||
- import_playbook: installElastic.yml
|
||||
- import_playbook: installCurator.yml
|
||||
- import_playbook: installLogstash.yml
|
||||
- import_playbook: installCurator.yml
|
||||
- import_playbook: installKibana.yml
|
||||
- import_playbook: installAPMserver.yml
|
||||
- import_playbook: createElasticIndexes.yml
|
||||
|
@ -13,156 +13,112 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
actions:
|
||||
1:
|
||||
action: delete_indices
|
||||
description: >-
|
||||
Delete indices older than 60 days (based on index name), for logstash-
|
||||
prefixed indices. Ignore the error if the filter does not result in an
|
||||
actionable list of indices (ignore_empty_list) and exit cleanly.
|
||||
options:
|
||||
ignore_empty_list: True
|
||||
disable_action: False
|
||||
filters:
|
||||
- filtertype: pattern
|
||||
kind: prefix
|
||||
value: logstash-
|
||||
- filtertype: age
|
||||
source: name
|
||||
direction: older
|
||||
timestring: '%Y.%m.%d'
|
||||
unit: days
|
||||
unit_count: {{ elastic_logstash_retention }}
|
||||
2:
|
||||
action: delete_indices
|
||||
description: >-
|
||||
Delete indices older than 10 days (based on index name), for apm-
|
||||
prefixed indices. Ignore the error if the filter does not result in an
|
||||
actionable list of indices (ignore_empty_list) and exit cleanly.
|
||||
options:
|
||||
ignore_empty_list: True
|
||||
disable_action: False
|
||||
filters:
|
||||
- filtertype: pattern
|
||||
kind: prefix
|
||||
value: apm-
|
||||
- filtertype: age
|
||||
source: name
|
||||
direction: older
|
||||
timestring: '%Y.%m.%d'
|
||||
unit: days
|
||||
unit_count: {{ elastic_apm_retention }}
|
||||
3:
|
||||
action: delete_indices
|
||||
description: >-
|
||||
Delete indices older than 15 days (based on index name), for auditbeat-
|
||||
prefixed indices. Ignore the error if the filter does not result in an
|
||||
actionable list of indices (ignore_empty_list) and exit cleanly.
|
||||
options:
|
||||
ignore_empty_list: True
|
||||
disable_action: False
|
||||
filters:
|
||||
- filtertype: pattern
|
||||
kind: prefix
|
||||
value: auditbeat-
|
||||
- filtertype: age
|
||||
source: name
|
||||
direction: older
|
||||
timestring: '%Y.%m.%d'
|
||||
unit: days
|
||||
unit_count: {{ elastic_auditbeat_retention }}
|
||||
4:
|
||||
action: delete_indices
|
||||
description: >-
|
||||
Delete indices older than 15 days (based on index name), for filebeat-
|
||||
prefixed indices. Ignore the error if the filter does not result in an
|
||||
actionable list of indices (ignore_empty_list) and exit cleanly.
|
||||
options:
|
||||
ignore_empty_list: True
|
||||
disable_action: False
|
||||
filters:
|
||||
- filtertype: pattern
|
||||
kind: prefix
|
||||
value: filebeat-
|
||||
- filtertype: age
|
||||
source: name
|
||||
direction: older
|
||||
timestring: '%Y.%m.%d'
|
||||
unit: days
|
||||
unit_count: {{ elastic_filebeat_retention }}
|
||||
5:
|
||||
action: delete_indices
|
||||
description: >-
|
||||
Delete indices older than 10 days (based on index name), for heartbeat-
|
||||
prefixed indices. Ignore the error if the filter does not result in an
|
||||
actionable list of indices (ignore_empty_list) and exit cleanly.
|
||||
options:
|
||||
ignore_empty_list: True
|
||||
disable_action: False
|
||||
filters:
|
||||
- filtertype: pattern
|
||||
kind: prefix
|
||||
value: heartbeat-
|
||||
- filtertype: age
|
||||
source: name
|
||||
direction: older
|
||||
timestring: '%Y.%m.%d'
|
||||
unit: days
|
||||
unit_count: {{ elastic_heartbeat_retention }}
|
||||
6:
|
||||
action: delete_indices
|
||||
description: >-
|
||||
Delete indices older than 15 days (based on index name), for journalbeat-
|
||||
prefixed indices. Ignore the error if the filter does not result in an
|
||||
actionable list of indices (ignore_empty_list) and exit cleanly.
|
||||
options:
|
||||
ignore_empty_list: True
|
||||
disable_action: False
|
||||
filters:
|
||||
- filtertype: pattern
|
||||
kind: prefix
|
||||
value: journalbeat-
|
||||
- filtertype: age
|
||||
source: name
|
||||
direction: older
|
||||
timestring: '%Y.%m.%d'
|
||||
unit: days
|
||||
unit_count: {{ elastic_journalbeat_retention }}
|
||||
7:
|
||||
action: delete_indices
|
||||
description: >-
|
||||
Delete indices older than 10 days (based on index name), for metricbeat-
|
||||
prefixed indices. Ignore the error if the filter does not result in an
|
||||
actionable list of indices (ignore_empty_list) and exit cleanly.
|
||||
options:
|
||||
ignore_empty_list: True
|
||||
disable_action: False
|
||||
filters:
|
||||
- filtertype: pattern
|
||||
kind: prefix
|
||||
value: metricbeat-
|
||||
- filtertype: age
|
||||
source: name
|
||||
direction: older
|
||||
timestring: '%Y.%m.%d'
|
||||
unit: days
|
||||
unit_count: {{ elastic_metricbeat_retention }}
|
||||
8:
|
||||
action: delete_indices
|
||||
description: >-
|
||||
Delete indices older than 5 days (based on index name), for packetbeat-
|
||||
prefixed indices. Ignore the error if the filter does not result in an
|
||||
actionable list of indices (ignore_empty_list) and exit cleanly.
|
||||
options:
|
||||
ignore_empty_list: True
|
||||
disable_action: False
|
||||
filters:
|
||||
- filtertype: pattern
|
||||
kind: prefix
|
||||
value: packetbeat-
|
||||
- filtertype: age
|
||||
source: name
|
||||
direction: older
|
||||
timestring: '%Y.%m.%d'
|
||||
unit: days
|
||||
unit_count: {{ elastic_packetbeat_retention }}
|
||||
{% set action_items = [] -%}
|
||||
{# Delete index loop #}
|
||||
{% for key in elastic_beat_retention_policy_hosts.keys() -%}
|
||||
{% set delete_indices = {} -%}
|
||||
{% set index_retention = hostvars[inventory_hostname]['elastic_' + key + '_retention'] -%}
|
||||
{% set _ = delete_indices.update(
|
||||
{
|
||||
'action': 'delete_indices',
|
||||
'description': 'Prune indices for ' + key + ' after ' ~ ((index_retention | int) * 2) ~ ' days.',
|
||||
'options': {
|
||||
'ignore_empty_list': true,
|
||||
'disable_action': false
|
||||
}
|
||||
}
|
||||
)
|
||||
-%}
|
||||
{# add the filter loop #}
|
||||
{% set filters = [] -%}
|
||||
{% set _ = filters.append(
|
||||
{
|
||||
'filtertype': 'pattern',
|
||||
'kind': 'prefix',
|
||||
'value': key + '-'
|
||||
}
|
||||
)
|
||||
-%}
|
||||
{% set _ = filters.append(
|
||||
{
|
||||
'filtertype': 'age',
|
||||
'source': 'name',
|
||||
'direction': 'older',
|
||||
'timestring': '%Y.%m.%d',
|
||||
'unit': 'days',
|
||||
'unit_count': (index_retention | int)
|
||||
}
|
||||
)
|
||||
-%}
|
||||
{% set _ = delete_indices.update({'filters': filters}) -%}
|
||||
{% set _ = action_items.append(delete_indices) -%}
|
||||
|
||||
{# Set shrink curator options #}
|
||||
{% set shrink_indices = {} -%}
|
||||
{% set _ = shrink_indices.update(
|
||||
{
|
||||
'action': 'shrink',
|
||||
'description': 'Shrink ' + key + ' indices older than ' ~ (index_retention | int) // 4 ~ ' days',
|
||||
'options': {
|
||||
"disable_action": false,
|
||||
"ignore_empty_list": true,
|
||||
"shrink_node": "DETERMINISTIC",
|
||||
"node_filters": {
|
||||
"permit_masters": ((master_nodes | length) < (data_nodes | length)) | ternary(true, false),
|
||||
"exclude_nodes": (groups['kibana'] | map('extract', hostvars, 'ansible_host') | list)
|
||||
},
|
||||
"number_of_shards": 1,
|
||||
"number_of_replicas": 1,
|
||||
"shrink_suffix": '-shrink',
|
||||
"copy_aliases": true,
|
||||
"delete_after": true,
|
||||
"post_allocation": {
|
||||
"allocation_type": "include",
|
||||
"key": "node_tag",
|
||||
"value": "cold"
|
||||
},
|
||||
"wait_for_active_shards": 1,
|
||||
"extra_settings": {
|
||||
"settings": {
|
||||
"index.codec": "best_compression"
|
||||
}
|
||||
},
|
||||
"wait_for_completion": true,
|
||||
"wait_for_rebalance": true,
|
||||
"wait_interval": 9,
|
||||
"max_wait": -1
|
||||
}
|
||||
}
|
||||
)
|
||||
-%}
|
||||
{% set filters = [] -%}
|
||||
{% set _ = filters.append(
|
||||
{
|
||||
'filtertype': 'pattern',
|
||||
'kind': 'prefix',
|
||||
'value': key + '-'
|
||||
}
|
||||
)
|
||||
-%}
|
||||
{% set _ = filters.append(
|
||||
{
|
||||
'filtertype': 'age',
|
||||
'source': 'creation_date',
|
||||
'direction': 'older',
|
||||
'unit': 'days',
|
||||
'unit_count': (index_retention | int) // 4
|
||||
}
|
||||
)
|
||||
-%}
|
||||
{% set _ = shrink_indices.update({'filters': filters}) -%}
|
||||
{% set _ = action_items.append(shrink_indices) -%}
|
||||
{% endfor -%}
|
||||
|
||||
{% set actions = {} -%}
|
||||
{% for action_item in action_items -%}
|
||||
{% set _ = actions.update({loop.index: action_item}) -%}
|
||||
{% endfor -%}
|
||||
|
||||
{# Render all actions #}
|
||||
{% set curator_actions = {'actions': actions} -%}
|
||||
{{ curator_actions | to_nice_yaml(indent=2) }}
|
||||
|
@ -28,7 +28,7 @@ appender.rolling.strategy.action.basepath = ${sys:es.logs.base_path}
|
||||
appender.rolling.strategy.action.condition.type = IfFileName
|
||||
appender.rolling.strategy.action.condition.glob = ${sys:es.logs.cluster_name}-*
|
||||
appender.rolling.strategy.action.condition.nested_condition.type = IfLastModified
|
||||
appender.rolling.strategy.action.condition.nested_condition.age = {{ elastic_logstash_retention }}D
|
||||
appender.rolling.strategy.action.condition.nested_condition.age = {{ elastic_logstash_retention | default(1) }}D
|
||||
|
||||
|
||||
rootLogger.level = info
|
||||
|
@ -1011,8 +1011,11 @@ filebeat.prospectors:
|
||||
# Make sure no file is defined twice as this can lead to unexpected behaviour.
|
||||
paths:
|
||||
- /var/log/beats/*.log
|
||||
- /openstack/log/*/beats/*.log
|
||||
- /var/log/curator/curator
|
||||
- /openstack/log/*/curator/curator
|
||||
- /var/log/elasticsearch/*.log
|
||||
- /openstack/log/*/elasticsearch/*.log
|
||||
|
||||
# Optional additional fields. These fields can be freely picked
|
||||
# to add additional information to the crawled log files for filtering
|
||||
|
@ -13,16 +13,39 @@ elastic_hap_port: 9201
|
||||
cluster_name: openstack_elk
|
||||
node_name: ${HOSTNAME}
|
||||
|
||||
# elastic curator vars
|
||||
# all retention options are in days
|
||||
elastic_logstash_retention: 14
|
||||
elastic_apm_retention: 3
|
||||
elastic_auditbeat_retention: 7
|
||||
elastic_filebeat_retention: 7
|
||||
elastic_heartbeat_retention: 7
|
||||
elastic_journalbeat_retention: 14
|
||||
elastic_metricbeat_retention: 3
|
||||
elastic_packetbeat_retention: 3
|
||||
### Elastic curator variables
|
||||
## Default retention policy options. All retention options are in days.
|
||||
# elastic_logstash_retention: 1
|
||||
# elastic_apm_retention: 1
|
||||
# elastic_auditbeat_retention: 1
|
||||
# elastic_filebeat_retention: 1
|
||||
# elastic_heartbeat_retention: 1
|
||||
# elastic_journalbeat_retention: 1
|
||||
# elastic_metricbeat_retention: 1
|
||||
# elastic_packetbeat_retention: 1
|
||||
|
||||
# This is used to calculate the storage a beat could generate per node, per day.
|
||||
# This constant is used as a multiplier. If the expected storage is larger than
|
||||
# the actual available storage after the buffer is calculated the multiplier
|
||||
# will be doubled, thereby cutting the potential storage days in half.
|
||||
elastic_beat_storage_constant: 512
|
||||
|
||||
## If any retention policy option is undefined a dynamic fact will be generated.
|
||||
## Fact will be generated for the general retention using the storage constant
|
||||
## per node, per index, where a given collector is expected to be deployed. The
|
||||
## equation used will take the total available storage from the ES data nodes,
|
||||
## subtract 25%, and divide by the total number of data nodes. That is then divided
|
||||
## by the number of hosts assumed to be a beat target, multiplied by the
|
||||
## storage constant.
|
||||
elastic_beat_retention_policy_hosts:
|
||||
logstash: "{{ groups['elastic-logstash'] | default([null]) | length }}"
|
||||
apm: "{{ groups['apm-server'] | default([null]) | length }}"
|
||||
auditbeat: "{{ groups['hosts'] | default([null]) | length }}"
|
||||
filebeat: "{{ groups['hosts'] | default([null]) | length }}"
|
||||
heartbeat: "{{ groups['utility_all'] | default([null]) | length }}"
|
||||
journalbeat: "{{ groups['all'] | default([null]) | length }}"
|
||||
metricbeat: "{{ groups['all'] | default([null]) | length }}"
|
||||
packetbeat: "{{ groups['hosts'] | default([null]) | length }}"
|
||||
|
||||
# This is the URL external services can use to communicate with the
|
||||
# elasticsearch cluster.
|
||||
|
Loading…
x
Reference in New Issue
Block a user