Ian Wienand 52850ddb63 404 periodic job: minor fixes
* Remove a stray trailing ' from the key
* update the key url to use https
* fix the log path to scrape

Change-Id: I580b63f08147494a937d44f4f6637947221c8937
2020-02-26 14:28:03 +11:00

43 lines
1.6 KiB
YAML

# Play 1: make static.opendev.org reachable from this (localhost) run by
# adding it to the in-memory inventory and pre-seeding its SSH host key,
# so the second play can connect without an interactive host-key prompt.
- hosts: localhost
  tasks:
    - name: Add static.opendev.org to inventory
      add_host:
        name: static.opendev.org
        ansible_connection: ssh
        ansible_host: static.opendev.org
        ansible_port: 22
        ansible_user: zuul

    # Pin the server's ECDSA key in known_hosts (hostname + IPv4 + IPv6
    # aliases) so ssh trusts it on first connect.
    - name: Add static.opendev.org host key
      known_hosts:
        name: static.opendev.org
        key: static.opendev.org,23.253.245.150,2001:4800:7818:101:be76:4eff:fe04:7c28 ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBMu3PnnkNhPS2d5Z2uPju3Qqcbbc0lwHA1j9MgHlLnbK3bx1O2Kfez6RJUGl2i6nshdzkKwPBvN2vehQKiw1oSk=
# NOTE(ianw): 2020-02-25 just for initial testing run this for one log
# in a dumb way. We can scrape a few more sites. Overall, we expect
# this to be replaced with a better analysis tool, see
# https://review.opendev.org/709236
#
# Play 2: on the static server, extract GET paths that returned 404 from
# the docs.openstack.org access log (current file plus the most recent
# rotation, roughly the last day), count and rank them, and print only
# entries that look like pages (contain "html" or end in "/").
- hosts: static.opendev.org
  tasks:
    - name: Run 404 scraping script
      become: true
      shell: |
        SOURCE_FILE=/var/log/apache2/docs.openstack.org_access.log
        INTERMEDIATE_FILE=$(mktemp)
        # Get just the lines with 404s in them
        grep ' 404 ' $SOURCE_FILE | sed -n -e 's/.*"GET \(\/.*\) HTTP\/1\.." 404 .*/\1/p' > $INTERMEDIATE_FILE
        if [ -f "$SOURCE_FILE.1" ] ; then
            # We get roughly the last days worth of logs by looking at the last two
            # log files.
            grep ' 404 ' $SOURCE_FILE.1 | sed -n -e 's/.*"GET \(\/.*\) HTTP\/1\.." 404 .*/\1/p' >> $INTERMEDIATE_FILE
        fi
        # Process those 404s to count them and return sorted by count
        sort $INTERMEDIATE_FILE | uniq -c | sort -rn | grep '\(html\|\/$\)'
        rm ${INTERMEDIATE_FILE}
      args:
        executable: /bin/bash