From 6cd74330869af6b000366bcb4ffdce7dd3d3c1b1 Mon Sep 17 00:00:00 2001
From: Ian Wienand <iwienand@redhat.com>
Date: Tue, 28 Jun 2022 15:12:05 +1000
Subject: [PATCH] graphite: fix xFilesFactor

When we migrated this to ansible I missed that we didn't bring across
the storage-aggregation.conf file.

This has had the unfortunate effect of regressing the xFilesFactor set
for every newly created graphite stat since the migration.  This
setting is a fraction (0-1 float) of how much of a "bucket" needs to
be non-null to keep the value when rolling up changes.  We want this
to be zero due to the sporadic nature of data (see the original change
I5f416e798e7abedfde776c9571b6fc8cea5f3a33).

This only affected newly created statistics, as graphite doesn't
modify this setting once it creates the whisper file.  This probably
helped us overlook this for so long, as longer-existing stats were
operating correctly, but newer ones were dropping data when zoomed out.

Restore this setting, and double-check it in testinfra for the future.
For simplicity and to get this back to the prior state I will manually
update the on-disk .wsp files to this value once this change is applied.

Change-Id: I57873403c4ca9783b1851ba83bfba038f4b90715
---
 playbooks/roles/graphite/tasks/main.yaml      | 19 +++++++++---
 .../graphite/templates/docker-compose.yaml.j2 |  9 +++---
 .../templates/storage-aggregation.conf.j2     | 29 +++++++++++++++++++
 testinfra/test_graphite.py                    | 15 ++++++++++
 zuul.d/system-config-run.yaml                 |  1 +
 5 files changed, 65 insertions(+), 8 deletions(-)
 create mode 100644 playbooks/roles/graphite/templates/storage-aggregation.conf.j2

diff --git a/playbooks/roles/graphite/tasks/main.yaml b/playbooks/roles/graphite/tasks/main.yaml
index e78aea1ec0..c8cc41e58a 100644
--- a/playbooks/roles/graphite/tasks/main.yaml
+++ b/playbooks/roles/graphite/tasks/main.yaml
@@ -1,7 +1,11 @@
-- name: Ensure docker-compose directory exists
+- name: Ensure docker-compose directories exists
   file:
     state: directory
-    path: /etc/graphite-docker
+    path: '{{ item }}'
+  loop:
+    - /etc/graphite-docker
+    - /etc/graphite-docker/graphite
+    - /etc/graphite-docker/graphite/conf
 
 - name: Write settings file
   template:
@@ -11,7 +15,12 @@
 - name: Write storage config file
   template:
     src: storage-schemas.conf.j2
-    dest: /etc/graphite-docker/storage-schemas.conf
+    dest: /etc/graphite-docker/graphite/conf/storage-schemas.conf
+
+- name: Write storage aggregation config file
+  template:
+    src: storage-aggregation.conf.j2
+    dest: /etc/graphite-docker/graphite/conf/storage-aggregation.conf
 
 - name: Write nginx override config
   template:
@@ -47,11 +56,13 @@
   shell:
     cmd: docker image prune -f
 
-# This is handy to have on the host for checking stat ingestion
+# netcat is handy to have on the host for checking stat ingestion
+# whisper utils are handy for inspecting .wsp files for debugging
 - name: Install netcat
   package:
     name:
       - netcat
+      - python3-whisper
     state: present
 
 # Removes files not updated for ~9 months, and clears out empty directories
diff --git a/playbooks/roles/graphite/templates/docker-compose.yaml.j2 b/playbooks/roles/graphite/templates/docker-compose.yaml.j2
index 75e107e83c..34fdb9258b 100644
--- a/playbooks/roles/graphite/templates/docker-compose.yaml.j2
+++ b/playbooks/roles/graphite/templates/docker-compose.yaml.j2
@@ -8,9 +8,10 @@ services:
     image: docker.io/graphiteapp/graphite-statsd
     network_mode: host
     volumes:
-      - /etc/graphite-docker/graphite-statsd.conf:/etc/nginx/sites-enabled/graphite-statsd.conf
-      - /etc/graphite-docker/statsd.js:/opt/statsd/config/udp.js
-      - /etc/graphite-docker/storage-schemas.conf:/opt/graphite/conf/storage-schemas.conf
-      - /etc/letsencrypt-certs:/etc/letsencrypt-certs
+      - /etc/graphite-docker/graphite-statsd.conf:/etc/nginx/sites-enabled/graphite-statsd.conf:ro
+      - /etc/graphite-docker/statsd.js:/opt/statsd/config/udp.js:ro
+      - /etc/graphite-docker/graphite/conf/storage-schemas.conf:/opt/graphite/conf/storage-schemas.conf:ro
+      - /etc/graphite-docker/graphite/conf/storage-aggregation.conf:/opt/graphite/conf/storage-aggregation.conf:ro
+      - /etc/letsencrypt-certs:/etc/letsencrypt-certs:ro
       - /opt/graphite/storage:/opt/graphite/storage
       - /var/log/graphite:/var/log/
diff --git a/playbooks/roles/graphite/templates/storage-aggregation.conf.j2 b/playbooks/roles/graphite/templates/storage-aggregation.conf.j2
new file mode 100644
index 0000000000..bf60170a1f
--- /dev/null
+++ b/playbooks/roles/graphite/templates/storage-aggregation.conf.j2
@@ -0,0 +1,29 @@
+[min]
+pattern = \.lower$
+xFilesFactor = 0
+aggregationMethod = min
+
+[max]
+pattern = \.upper(_\d+)?$
+xFilesFactor = 0
+aggregationMethod = max
+
+[sum]
+pattern = \.sum$
+xFilesFactor = 0
+aggregationMethod = sum
+
+[count]
+pattern = \.count$
+xFilesFactor = 0
+aggregationMethod = sum
+
+[count_legacy]
+pattern = ^stats_counts.*
+xFilesFactor = 0
+aggregationMethod = sum
+
+[default_average]
+pattern = .*
+xFilesFactor = 0
+aggregationMethod = average
diff --git a/testinfra/test_graphite.py b/testinfra/test_graphite.py
index 351187943b..fb0a18ba4c 100644
--- a/testinfra/test_graphite.py
+++ b/testinfra/test_graphite.py
@@ -56,3 +56,18 @@ def test_graphite_data(host):
                 found_value = True
 
     assert found_value
+
+def test_graphite_wsp(host):
+    # seed some timer data
+    cmd = ('timeout 20 bash -c '
+           '\'while true; do echo -n "example:$((RANDOM % 1000))|ms" '
+           '| nc -6 -w 1 -u localhost 8125; done\'')
+    host.run(cmd)
+
+    wsp_file = '/opt/graphite/storage/whisper/stats/timers/example/mean.wsp'
+
+    wsp = host.file(wsp_file)
+    assert wsp.exists
+
+    cmd = host.run('whisper-info %s' % wsp_file)
+    assert 'xFilesFactor: 0.0' in cmd.stdout
diff --git a/zuul.d/system-config-run.yaml b/zuul.d/system-config-run.yaml
index 80863fbc6a..e739cb057f 100644
--- a/zuul.d/system-config-run.yaml
+++ b/zuul.d/system-config-run.yaml
@@ -692,6 +692,7 @@
       graphite02.opendev.org:
         host_copy_output:
           '/var/log/graphite': logs
+          '/etc/graphite-docker': logs
     files:
       - playbooks/bootstrap-bridge.yaml
       - playbooks/letsencrypt.yaml