#!/usr/bin/env bash
# single-pass apache log analyzer for .gz files
# outputs: year,total_hits,unique_ips
# counts only log lines containing both: socware and .zip
# excludes obvious bots/crawlers based on User-Agent

set -euo pipefail

#######################################
# Pass through the stdin lines that contain a fixed string.
# Input is always treated as text (grep -a, C locale) so that stray bytes in
# a log line cannot make grep collapse its output into a single
# "Binary file ... matches" notice and silently drop hits.
# Arguments: $1 - literal substring to look for (not a regex)
# Outputs:   matching lines on stdout
# Returns:   0 when grep ran cleanly, even if nothing matched;
#            the grep status (>1) on a real grep error
#######################################
keep_lines_with() {
  local status=0
  LC_ALL=C grep -a -F -e "$1" || status=$?
  # grep reports "no match" as status 1; under pipefail that would turn a
  # perfectly valid empty result into a failed run.
  if (( status > 1 )); then
    return "$status"
  fi
}

#######################################
# Summarise download hits per year from gzipped access logs.
# A line is counted when it contains both "socware" and ".zip" and its
# User-Agent (6th field when the line is split on double quotes) does not
# look like a bot/crawler. The client is the first space-delimited token
# and the year comes from the [dd/Mon/yyyy: timestamp before the first quote.
# Arguments: $1 - directory searched recursively for *.gz (default: .)
# Outputs:   CSV on stdout: header, then one "year,hits,unique_ips" row per
#            year in ascending order
# Returns:   non-zero if find, gzip, grep, awk or sort fails
#######################################
analyze_logs() {
  local dir=${1:-.}

  printf '%s\n' 'year,total_hits,unique_ips'

  # find -exec ... {} + never launches gzip when there are no files, unlike
  # GNU xargs without -r. gzip diagnostics stay suppressed on purpose
  # (best-effort decompression); its failure still shows in the exit status.
  find "$dir" -type f -name '*.gz' \
    -exec sh -c 'gzip -dc -- "$@" 2>/dev/null' sh {} + \
    | keep_lines_with 'socware' \
    | keep_lines_with '.zip' \
    | LC_ALL=C awk -F'"' '
{
  # Case-insensitive bot filter via tolower(): works in any POSIX awk.
  if (tolower($6) ~ /(bot|crawl|spider|slurp|archive|heritrix|wget|curl|python-requests|python|scrapy|perl|java|libwww|feedfetcher)/) next

  prefix = $1

  # Client address = everything up to the first space; skip the line when
  # that token is empty.
  sp = index(prefix, " ")
  if (sp == 1 || prefix == "") next
  ip = (sp > 0) ? substr(prefix, 1, sp - 1) : prefix

  # Timestamp looks like [dd/Mon/yyyy: ; intervals are spelled out because
  # not every awk supports {n}. The year starts 8 characters after "[".
  if (!match(prefix, /\[[0-9][0-9]\/[A-Za-z][A-Za-z][A-Za-z]\/[0-9][0-9][0-9][0-9]:/)) next
  year = substr(prefix, RSTART + 8, 4)

  hits[year]++
  if (!((year, ip) in seen)) {
    seen[year, ip] = 1
    uniq[year]++
  }
}
END {
  # Emitted unordered; the sort stage downstream orders rows by year.
  for (y in hits) printf "%s,%d,%d\n", y, hits[y], uniq[y] + 0
}
' \
    | LC_ALL=C sort -t, -k1,1n
}

DIR="${1:-.}"

analyze_logs "$DIR"
