From b6a9a6fc7a69ae0bbf47f285d3d1323bf3cdecc6 Mon Sep 17 00:00:00 2001 From: Kevin Carter Date: Mon, 9 Jul 2018 13:58:23 -0500 Subject: [PATCH] Add dynamic retention policies to curator The curator retention policies will now query the storage nodes within a given deployment and set a suitable index retention policy based on the total amount of storage each index is assumed to produce every day. To ensure we're minimizing the storage required and optimizing search performance several actions are now being taken: * Indexes will be shrunk after a quarter of their retention time. * Indexes will be deleted should they exceed the retention time. Change-Id: I8bf548620b5404d25deaadba8fda93452ef64fa0 Signed-off-by: Kevin Carter --- elk_metrics_6x/createElasticIndexes.yml | 4 + elk_metrics_6x/installCurator.yml | 42 +++ elk_metrics_6x/site-elka.yml | 2 +- .../templates/curator-actions.yml.j2 | 262 ++++++++---------- .../templates/es-log4j2.properties.j2 | 2 +- elk_metrics_6x/templates/filebeat.yml.j2 | 3 + elk_metrics_6x/vars/variables.yml | 43 ++- 7 files changed, 193 insertions(+), 165 deletions(-) diff --git a/elk_metrics_6x/createElasticIndexes.yml b/elk_metrics_6x/createElasticIndexes.yml index 7e4528b7..c02a8463 100644 --- a/elk_metrics_6x/createElasticIndexes.yml +++ b/elk_metrics_6x/createElasticIndexes.yml @@ -16,6 +16,10 @@ body: "{{ item.index_options | to_json }}" status_code: 200,400 body_format: json + register: elk_indexes + until: elk_indexes is success + retries: 3 + delay: 5 with_items: - name: "osprofiler-notifications" index_options: diff --git a/elk_metrics_6x/installCurator.yml b/elk_metrics_6x/installCurator.yml index 5294bcb8..7573d02b 100644 --- a/elk_metrics_6x/installCurator.yml +++ b/elk_metrics_6x/installCurator.yml @@ -16,6 +16,7 @@ - name: Install Curator hosts: "elastic-logstash" become: true + gather_facts: false vars: haproxy_ssl: false @@ -25,6 +26,47 @@ environment: "{{ deployment_environment_variables | default({}) }}" pre_tasks: + - 
include_tasks: common_task_data_node_hosts.yml + tags: + - always + + - name: Query es storage + uri: + url: "http://127.0.0.1:9200/_nodes/{{ (data_nodes | map('extract', hostvars, 'ansible_host') | list) | join(',') }}/stats/fs" + method: GET + register: elk_data + until: elk_data is success + retries: 3 + delay: 5 + run_once: true + + - name: Set available storage fact + set_fact: + es_total_available_storage: "{{ ((elk_data['json']['nodes'].values() | list) | map(attribute='fs.total.total_in_bytes') | list | sum) // 1024 // 1024 }}" + + - name: Set assumed buffer storage fact + set_fact: + es_assumed_buffer_storage: "{{ ((es_total_available_storage | int) * 0.25) | round | int }}" + + - name: Set usable buffer storage fact(s) + set_fact: + es_usable_buffer_storage: "{{ (es_total_available_storage | int) - (es_assumed_buffer_storage | int) }}" + es_expected_storage: "{{ ((elastic_beat_retention_policy_hosts.values() | map('int') | list) | sum) * (elastic_beat_storage_constant | int) }}" + + - name: Set buffer storage fact + set_fact: + es_assumed_usable_storage_per_node: "{{ (es_usable_buffer_storage | int) // (data_nodes | length | int) }}" + + - name: Set the storage multiplier + set_fact: + es_storage_multiplier: "{{ ((es_usable_buffer_storage | int) < (es_expected_storage | int)) | ternary(((elastic_beat_storage_constant | int) * 2), elastic_beat_storage_constant | int) }}" + + - name: Set retention facts + set_fact: "elastic_{{ item.key }}_retention={{ [(es_assumed_usable_storage_per_node | int) // (([(item.value | int), 1] | max) * (es_storage_multiplier | int)), 1] | max }}" + when: + - hostvars[inventory_hostname]["elastic_" + item.key + "_retention"] is undefined + with_dict: "{{ elastic_beat_retention_policy_hosts }}" + - name: Ensure virtualenv is installed apt: name: "{{ item }}" diff --git a/elk_metrics_6x/site-elka.yml b/elk_metrics_6x/site-elka.yml index f2e33ede..423a9342 100644 --- a/elk_metrics_6x/site-elka.yml +++ b/elk_metrics_6x/site-elka.yml @@ -14,8 +14,8 @@ # 
limitations under the License. - import_playbook: installElastic.yml -- import_playbook: installCurator.yml - import_playbook: installLogstash.yml +- import_playbook: installCurator.yml - import_playbook: installKibana.yml - import_playbook: installAPMserver.yml - import_playbook: createElasticIndexes.yml diff --git a/elk_metrics_6x/templates/curator-actions.yml.j2 b/elk_metrics_6x/templates/curator-actions.yml.j2 index e8791b14..42667be5 100644 --- a/elk_metrics_6x/templates/curator-actions.yml.j2 +++ b/elk_metrics_6x/templates/curator-actions.yml.j2 @@ -13,156 +13,112 @@ # See the License for the specific language governing permissions and # limitations under the License. -actions: - 1: - action: delete_indices - description: >- - Delete indices older than 60 days (based on index name), for logstash- - prefixed indices. Ignore the error if the filter does not result in an - actionable list of indices (ignore_empty_list) and exit cleanly. - options: - ignore_empty_list: True - disable_action: False - filters: - - filtertype: pattern - kind: prefix - value: logstash- - - filtertype: age - source: name - direction: older - timestring: '%Y.%m.%d' - unit: days - unit_count: {{ elastic_logstash_retention }} - 2: - action: delete_indices - description: >- - Delete indices older than 10 days (based on index name), for apm- - prefixed indices. Ignore the error if the filter does not result in an - actionable list of indices (ignore_empty_list) and exit cleanly. - options: - ignore_empty_list: True - disable_action: False - filters: - - filtertype: pattern - kind: prefix - value: apm- - - filtertype: age - source: name - direction: older - timestring: '%Y.%m.%d' - unit: days - unit_count: {{ elastic_apm_retention }} - 3: - action: delete_indices - description: >- - Delete indices older than 15 days (based on index name), for auditbeat- - prefixed indices. Ignore the error if the filter does not result in an - actionable list of indices (ignore_empty_list) and exit cleanly. 
- options: - ignore_empty_list: True - disable_action: False - filters: - - filtertype: pattern - kind: prefix - value: auditbeat- - - filtertype: age - source: name - direction: older - timestring: '%Y.%m.%d' - unit: days - unit_count: {{ elastic_auditbeat_retention }} - 4: - action: delete_indices - description: >- - Delete indices older than 15 days (based on index name), for filebeat- - prefixed indices. Ignore the error if the filter does not result in an - actionable list of indices (ignore_empty_list) and exit cleanly. - options: - ignore_empty_list: True - disable_action: False - filters: - - filtertype: pattern - kind: prefix - value: filebeat- - - filtertype: age - source: name - direction: older - timestring: '%Y.%m.%d' - unit: days - unit_count: {{ elastic_filebeat_retention }} - 5: - action: delete_indices - description: >- - Delete indices older than 10 days (based on index name), for heartbeat- - prefixed indices. Ignore the error if the filter does not result in an - actionable list of indices (ignore_empty_list) and exit cleanly. - options: - ignore_empty_list: True - disable_action: False - filters: - - filtertype: pattern - kind: prefix - value: heartbeat- - - filtertype: age - source: name - direction: older - timestring: '%Y.%m.%d' - unit: days - unit_count: {{ elastic_heartbeat_retention }} - 6: - action: delete_indices - description: >- - Delete indices older than 15 days (based on index name), for journalbeat- - prefixed indices. Ignore the error if the filter does not result in an - actionable list of indices (ignore_empty_list) and exit cleanly. 
- options: - ignore_empty_list: True - disable_action: False - filters: - - filtertype: pattern - kind: prefix - value: journalbeat- - - filtertype: age - source: name - direction: older - timestring: '%Y.%m.%d' - unit: days - unit_count: {{ elastic_journalbeat_retention }} - 7: - action: delete_indices - description: >- - Delete indices older than 10 days (based on index name), for metricbeat- - prefixed indices. Ignore the error if the filter does not result in an - actionable list of indices (ignore_empty_list) and exit cleanly. - options: - ignore_empty_list: True - disable_action: False - filters: - - filtertype: pattern - kind: prefix - value: metricbeat- - - filtertype: age - source: name - direction: older - timestring: '%Y.%m.%d' - unit: days - unit_count: {{ elastic_metricbeat_retention }} - 8: - action: delete_indices - description: >- - Delete indices older than 5 days (based on index name), for packetbeat- - prefixed indices. Ignore the error if the filter does not result in an - actionable list of indices (ignore_empty_list) and exit cleanly. 
- options: - ignore_empty_list: True - disable_action: False - filters: - - filtertype: pattern - kind: prefix - value: packetbeat- - - filtertype: age - source: name - direction: older - timestring: '%Y.%m.%d' - unit: days - unit_count: {{ elastic_packetbeat_retention }} +{% set action_items = [] -%} +{# Delete index loop #} +{% for key in elastic_beat_retention_policy_hosts.keys() -%} +{% set delete_indices = {} -%} +{% set index_retention = [hostvars[inventory_hostname]['elastic_' + key + '_retention'] | int, 1] | max -%} +{% set _ = delete_indices.update( + { + 'action': 'delete_indices', + 'description': 'Prune indices for ' + key + ' after ' ~ (index_retention | int) ~ ' days.', + 'options': { + 'ignore_empty_list': true, + 'disable_action': false + } + } + ) +-%} +{# Add the delete filters #} +{% set filters = [] -%} +{% set _ = filters.append( + { + 'filtertype': 'pattern', + 'kind': 'prefix', + 'value': key + '-' + } + ) +-%} +{% set _ = filters.append( + { + 'filtertype': 'age', + 'source': 'name', + 'direction': 'older', + 'timestring': '%Y.%m.%d', + 'unit': 'days', + 'unit_count': (index_retention | int) + } + ) +-%} +{% set _ = delete_indices.update({'filters': filters}) -%} +{% set _ = action_items.append(delete_indices) -%} + +{# Set shrink curator options #} +{% set shrink_indices = {} -%} +{% set _ = shrink_indices.update( + { + 'action': 'shrink', + 'description': 'Shrink ' + key + ' indices older than ' ~ ([(index_retention | int) // 4, 1] | max) ~ ' days', + 'options': { + "disable_action": false, + "ignore_empty_list": true, + "shrink_node": "DETERMINISTIC", + "node_filters": { + "permit_masters": ((master_nodes | length) < (data_nodes | length)) | ternary(true, false), + "exclude_nodes": (groups['kibana'] | map('extract', hostvars, 'ansible_host') | list) + }, + "number_of_shards": 1, + "number_of_replicas": 1, + "shrink_suffix": '-shrink', + "copy_aliases": true, + "delete_after": true, + "post_allocation": { + "allocation_type": "include", + "key": "node_tag", + 
"value": "cold" + }, + "wait_for_active_shards": 1, + "extra_settings": { + "settings": { + "index.codec": "best_compression" + } + }, + "wait_for_completion": true, + "wait_for_rebalance": true, + "wait_interval": 9, + "max_wait": -1 + } + } + ) +-%} +{% set filters = [] -%} +{% set _ = filters.append( + { + 'filtertype': 'pattern', + 'kind': 'prefix', + 'value': key + '-' + } + ) +-%} +{% set _ = filters.append( + { + 'filtertype': 'age', + 'source': 'creation_date', + 'direction': 'older', + 'unit': 'days', + 'unit_count': [(index_retention | int) // 4, 1] | max + } + ) +-%} +{% set _ = shrink_indices.update({'filters': filters}) -%} +{% set _ = action_items.append(shrink_indices) -%} +{% endfor -%} + +{% set actions = {} -%} +{% for action_item in action_items -%} +{% set _ = actions.update({loop.index: action_item}) -%} +{% endfor -%} + +{# Render all actions #} +{% set curator_actions = {'actions': actions} -%} +{{ curator_actions | to_nice_yaml(indent=2) }} diff --git a/elk_metrics_6x/templates/es-log4j2.properties.j2 b/elk_metrics_6x/templates/es-log4j2.properties.j2 index 7091d480..8a33fe3b 100644 --- a/elk_metrics_6x/templates/es-log4j2.properties.j2 +++ b/elk_metrics_6x/templates/es-log4j2.properties.j2 @@ -28,7 +28,7 @@ appender.rolling.strategy.action.basepath = ${sys:es.logs.base_path} appender.rolling.strategy.action.condition.type = IfFileName appender.rolling.strategy.action.condition.glob = ${sys:es.logs.cluster_name}-* appender.rolling.strategy.action.condition.nested_condition.type = IfLastModified -appender.rolling.strategy.action.condition.nested_condition.age = {{ elastic_logstash_retention }}D +appender.rolling.strategy.action.condition.nested_condition.age = {{ elastic_logstash_retention | default(1) }}D rootLogger.level = info diff --git a/elk_metrics_6x/templates/filebeat.yml.j2 b/elk_metrics_6x/templates/filebeat.yml.j2 index f6c95430..6812fd34 100644 --- a/elk_metrics_6x/templates/filebeat.yml.j2 +++ b/elk_metrics_6x/templates/filebeat.yml.j2 @@ 
-1011,8 +1011,11 @@ filebeat.prospectors: # Make sure not file is defined twice as this can lead to unexpected behaviour. paths: - /var/log/beats/*.log + - /openstack/log/*/beats/*.log - /var/log/curator/curator + - /openstack/log/*/curator/curator - /var/log/elasticsearch/*.log + - /openstack/log/*/elasticsearch/*.log # Optional additional fields. These fields can be freely picked # to add additional information to the crawled log files for filtering diff --git a/elk_metrics_6x/vars/variables.yml b/elk_metrics_6x/vars/variables.yml index ae51514f..75c88243 100644 --- a/elk_metrics_6x/vars/variables.yml +++ b/elk_metrics_6x/vars/variables.yml @@ -13,16 +13,39 @@ elastic_hap_port: 9201 cluster_name: openstack_elk node_name: ${HOSTNAME} -# elastic curator vars -# all retention options are in days -elastic_logstash_retention: 14 -elastic_apm_retention: 3 -elastic_auditbeat_retention: 7 -elastic_filebeat_retention: 7 -elastic_heartbeat_retention: 7 -elastic_journalbeat_retention: 14 -elastic_metricbeat_retention: 3 -elastic_packetbeat_retention: 3 +### Elastic curator variables +## Default retention policy options. All retention options are in days. +# elastic_logstash_retention: 1 +# elastic_apm_retention: 1 +# elastic_auditbeat_retention: 1 +# elastic_filebeat_retention: 1 +# elastic_heartbeat_retention: 1 +# elastic_journalbeat_retention: 1 +# elastic_metricbeat_retention: 1 +# elastic_packetbeat_retention: 1 + +# Assumed storage, in MiB, that one beat generates per node, per day. It is used +# as a multiplier. If the expected storage is larger than the usable storage +# left once the 25% buffer is subtracted, the multiplier is doubled, thereby +# cutting the calculated retention days in half. +elastic_beat_storage_constant: 512 + +## If any retention policy option is undefined, a dynamic fact will be generated. 
+## The fact is calculated from the storage constant, per node, per index, for +## every host where a given collector is expected to be deployed. The equation +## takes the total available storage from the ES data nodes, subtracts 25% as +## a buffer, and divides the result by the total number of data nodes. That is +## then divided by the number of hosts assumed to be a beat target multiplied +## by the storage constant. +elastic_beat_retention_policy_hosts: + logstash: "{{ groups['elastic-logstash'] | default([null]) | length }}" + apm: "{{ groups['apm-server'] | default([null]) | length }}" + auditbeat: "{{ groups['hosts'] | default([null]) | length }}" + filebeat: "{{ groups['hosts'] | default([null]) | length }}" + heartbeat: "{{ groups['utility_all'] | default([null]) | length }}" + journalbeat: "{{ groups['all'] | default([null]) | length }}" + metricbeat: "{{ groups['all'] | default([null]) | length }}" + packetbeat: "{{ groups['hosts'] | default([null]) | length }}" # This is the URL external services can use to communicate with the # elasticsearch cluster.