#!/usr/bin/env bash
# single-pass apache log analyzer for .gz files
# outputs: year,total_hits,unique_ips
# counts only log lines containing both: socware and .zip

set -euo pipefail

# Usage: <script> [PATH]
#   PATH  file or directory searched recursively for *.gz logs (default: .)
DIR="${1:-.}"

#######################################
# Decompress every *.gz file found under a path into one stream.
# Arguments: $1 - file or directory handed to find
# Outputs:   concatenated decompressed data on stdout
# Returns:   non-zero if find or any gzip invocation reports a failure
#######################################
stream_gz_logs() {
  # -r: do not launch gzip at all when find matched nothing; GNU xargs
  #     would otherwise run it once with no operands and fail.
  # gzip diagnostics are discarded on purpose (best-effort read of the
  # archives); find diagnostics still reach stderr.
  find "$1" -type f -name '*.gz' -print0 \
    | xargs -0 -r gzip -dc -- 2>/dev/null
}

#######################################
# Aggregate access-log lines read from stdin into a per-year CSV report.
# Only lines containing both "socware" and ".zip" are counted.
# Outputs:   header "year,total_hits,unique_ips" followed by one row per
#            year, ascending; header only when nothing matched
# Returns:   0 on success, including the "nothing matched" case
#######################################
summarize_hits() {
  printf '%s\n' 'year,total_hits,unique_ips'

  # Filtering happens inside awk instead of separate grep stages: grep
  # exits 1 on "no match", which pipefail would report as a script failure.
  # NOTE: the awk program is single-quoted; keep apostrophes out of it.
  awk -F'[][]' '
  /socware/ && /\.zip/ {
    # With "[" and "]" as separators, $2 is the bracketed timestamp.
    stamp = $2
    if (stamp == "") next

    # The year is the third "/"-separated piece of the text before the
    # first ":" (e.g. 10/Oct/2021:13:55:36 -> 2021).
    split(stamp, clock, ":")
    split(clock[1], dmy, "/")
    year = dmy[3]
    # Four explicit digit classes: {4} intervals are not honored by
    # every awk implementation.
    if (year !~ /^[0-9][0-9][0-9][0-9]$/) next

    # $1 is everything before the first "[" (host ident user). Key on
    # the first token only so one address seen with different ident or
    # user values is still a single client.
    ip = (split($1, lead, " ") > 0) ? lead[1] : ""

    hits[year]++
    if (!((year, ip) in seen)) {
      seen[year, ip] = 1
      uniq[year]++
    }
  }
  END {
    for (year in hits)
      printf "%s,%d,%d\n", year, hits[year], uniq[year] + 0
  }
  ' | LC_ALL=C sort -t, -k1,1n
}

#######################################
# Entry point: report on every *.gz log under DIR.
# Globals:   DIR (read)
# Outputs:   CSV report on stdout
#######################################
main() {
  stream_gz_logs "$DIR" | summarize_hits
}

main
