#!/usr/bin/env bash
#
# Report retention for a pages site that exposes /.git-pages/manifest.json.
#
# Phases:
#   1. Collect every "<owner>/<repo>/.../reports/.../.meta" entry from the
#      manifest and read its branch and published_at fields.
#   2. If Gitea credentials are configured, mark reports whose branch no
#      longer exists (HTTP 404) for deletion.
#   3. Apply the per-branch rules from the retention config (maxAgeDays,
#      keepMin) to the remaining reports.
#   4. Rebuild the site without the deleted report directories and PUT it back.
#
# Environment:
#   PAGES_HOST          (required) value sent in the Host header
#   PAGES_URL           base URL of the pages server (default http://localhost:3000)
#   RETENTION_CONFIG    path to the retention JSON (default /etc/retention/retention.json)
#   GITEA_API_URL       Gitea base URL; phase 2 is skipped when empty
#   GITEA_TOKEN         Gitea API token; phase 2 is skipped when empty
#   GITEA_MAX_ATTEMPTS  attempts per branch lookup on API errors (default 3)
#   GITEA_RETRY_DELAY   seconds to sleep between attempts (default 10)
set -eo pipefail

PAGES_URL="${PAGES_URL:-http://localhost:3000}"
PAGES_HOST="${PAGES_HOST:?PAGES_HOST is required}"
CONFIG="${RETENTION_CONFIG:-/etc/retention/retention.json}"
GITEA_API_URL="${GITEA_API_URL:-}"
GITEA_TOKEN="${GITEA_TOKEN:-}"
GITEA_MAX_ATTEMPTS="${GITEA_MAX_ATTEMPTS:-3}"
GITEA_RETRY_DELAY="${GITEA_RETRY_DELAY:-10}"

# Run curl against the pages server with the configured Host header.
# Arguments: any additional curl arguments.
curl_with_host() {
  curl -sS -H "Host: ${PAGES_HOST}" "$@"
}

[ -f "$CONFIG" ] || {
  echo "ERROR: config missing: $CONFIG" >&2
  exit 1
}

# Lookup cache keyed by owner/repo/branch: 1 = exists (or kept fail-safe),
# 0 = confirmed missing (HTTP 404).
declare -A BRANCH_CACHE=()

# Check whether a branch still exists in Gitea.
# Arguments: $1 owner, $2 repo, $3 branch
# Returns:   0 if the branch exists, if Gitea is not configured, or if the API
#            kept failing (fail-safe: the report is kept); 1 only on HTTP 404.
# NOTE(review): a 404 is taken to mean "branch deleted"; confirm the API does
# not also answer 404 for repositories the token cannot see.
branch_exists() {
  local owner="$1" repo="$2" branch="$3"
  # Built in a separate statement: inside a single `local`, all words are
  # expanded before any of the assignments take effect.
  local key="${owner}/${repo}/${branch}"
  local status="000" attempt

  [ -z "$GITEA_API_URL" ] && return 0
  [ -z "$GITEA_TOKEN" ] && return 0

  case "${BRANCH_CACHE[$key]:-}" in
    1) return 0 ;;
    0) return 1 ;;
  esac

  for ((attempt = 1; attempt <= GITEA_MAX_ATTEMPTS; attempt++)); do
    # On a transport failure curl already prints "000" via -w, so the code is
    # assigned here instead of echoing a second copy into the substitution.
    status=$(curl -sS -o /dev/null -w "%{http_code}" \
      -H "Authorization: token ${GITEA_TOKEN}" \
      "${GITEA_API_URL}/api/v1/repos/${owner}/${repo}/branches/${branch}" 2>/dev/null) || status="000"

    if [ "$status" = "200" ]; then
      BRANCH_CACHE["$key"]=1
      return 0
    fi
    if [ "$status" = "404" ]; then
      BRANCH_CACHE["$key"]=0
      return 1
    fi

    # API error - wait before the next attempt, but not after the last one
    if ((attempt < GITEA_MAX_ATTEMPTS)); then
      sleep "$GITEA_RETRY_DELAY"
    fi
  done

  # All attempts failed - keep report (fail-safe)
  echo " WARN: Gitea API error for ${owner}/${repo}/${branch} (status ${status}) after ${GITEA_MAX_ATTEMPTS} attempts - KEEPING report"
  BRANCH_CACHE["$key"]=1
  return 0
}

default_max_age=$(jq -r '.branches.default.maxAgeDays // 90' "$CONFIG")
default_keep_min=$(jq -r '.branches.default.keepMin // 5' "$CONFIG")

# Print maxAgeDays for a branch, falling back to the configured default.
# Arguments: $1 branch name
rule_max_age() {
  local branch="$1" v
  v=$(jq -r --arg b "$branch" '.branches[$b].maxAgeDays // empty' "$CONFIG")
  if [ -n "$v" ]; then
    echo "$v"
  else
    echo "$default_max_age"
  fi
}

# Print keepMin for a branch, falling back to the configured default.
# Arguments: $1 branch name
rule_keep_min() {
  local branch="$1" v
  v=$(jq -r --arg b "$branch" '.branches[$b].keepMin // empty' "$CONFIG")
  if [ -n "$v" ]; then
    echo "$v"
  else
    echo "$default_keep_min"
  fi
}

# Print the age in whole days of a timestamp understood by `date -d`.
# Arguments: $1 timestamp
# Outputs:   the age in days, or 99999 when the timestamp cannot be parsed.
# NOTE(review): 99999 exceeds any realistic maxAgeDays, so a report with an
# unparseable published_at ends up deleted in phase 3 - confirm this is intended.
age_days() {
  local published="$1" epoch_pub now
  epoch_pub=$(date -u -d "$published" +%s 2>/dev/null) || epoch_pub=0
  if [ "$epoch_pub" -eq 0 ]; then
    echo 99999
    return
  fi
  now=$(date -u +%s)
  echo $(((now - epoch_pub) / 86400))
}

# Split "<owner>/<repo>/..." into its first two path components.
# Arguments: $1 relative path
# Globals:   OWNER, REPO (written)
parse_path() {
  local rel="$1" rest
  OWNER="${rel%%/*}"
  rest="${rel#*/}"
  REPO="${rest%%/*}"
}

# Tell whether a manifest path is a deleted report directory or lies inside one.
# Matching is literal, so characters such as "." or "+" in directory names
# cannot match unrelated paths.
# Arguments: $1 manifest path
# Globals:   TO_DELETE (read)
# Returns:   0 if the path is to be dropped, 1 otherwise
path_is_deleted() {
  local path="$1" deleted_dir
  for deleted_dir in "${TO_DELETE[@]}"; do
    if [[ "$path" == "$deleted_dir" || "$path" == "$deleted_dir"/* ]]; then
      return 0
    fi
  done
  return 1
}

echo "Fetching manifest from ${PAGES_URL}/.git-pages/manifest.json"
# -f: an HTTP error must abort the run instead of being parsed as a manifest.
MANIFEST=$(curl_with_host -f "${PAGES_URL}/.git-pages/manifest.json") || {
  echo "ERROR: could not fetch manifest from ${PAGES_URL}/.git-pages/manifest.json" >&2
  exit 1
}
jq -e 'type == "object"' <<< "$MANIFEST" > /dev/null 2>&1 || {
  echo "ERROR: manifest is not a JSON object" >&2
  exit 1
}
echo "Manifest loaded"

META_PATHS=$(jq -r '(.contents // {}) | to_entries[]
  | select(.key | test("/reports/"))
  | select(.key | endswith("/.meta"))
  | .key' <<< "$MANIFEST") || {
  echo "ERROR: could not read .contents from manifest" >&2
  exit 1
}

if [ -z "$META_PATHS" ]; then
  echo "No .meta files found under /reports/ — nothing to clean"
  exit 0
fi

echo ""
echo "=== Phase 1: collect reports ==="

declare -A SEEN_REPORTS=()
declare -a REPORTS=()

while IFS= read -r meta_path; do
  report_dir=$(dirname -- "$meta_path")

  # Skip duplicates - same report dir already processed
  [ -z "${SEEN_REPORTS[$report_dir]:-}" ] || continue
  SEEN_REPORTS["$report_dir"]=1

  parse_path "$report_dir"

  # Best effort: a report whose .meta cannot be fetched or parsed is skipped
  # (and therefore kept), so curl/jq errors are deliberately silenced here.
  meta_content=$(curl_with_host -f "${PAGES_URL}/${meta_path}" 2>/dev/null || true)
  [ -n "$meta_content" ] || {
    echo " WARN: could not fetch $meta_path"
    continue
  }

  branch=$(jq -r '.branch // empty' <<< "$meta_content" 2>/dev/null || true)
  published=$(jq -r '.published_at // empty' <<< "$meta_content" 2>/dev/null || true)
  [ -n "$branch" ] || {
    echo " WARN: no branch in $meta_path"
    continue
  }
  [ -n "$published" ] || {
    echo " WARN: no published_at in $meta_path"
    continue
  }

  days=$(age_days "$published")
  REPORTS+=("${report_dir}|${OWNER}|${REPO}|${branch}|${days}")
  echo " ${OWNER}/${REPO} branch=${branch} age=${days}d"
done <<< "$META_PATHS"

[ "${#REPORTS[@]}" -eq 0 ] && {
  echo "No actionable reports"
  exit 0
}

echo ""
echo "=== Phase 2: check branches in Gitea ==="

declare -a TO_DELETE=()
declare -a KEEP=()

for entry in "${REPORTS[@]}"; do
  IFS='|' read -r dir owner repo branch days <<< "$entry"
  if [ -n "$GITEA_API_URL" ] && [ -n "$GITEA_TOKEN" ]; then
    if branch_exists "$owner" "$repo" "$branch"; then
      echo " BRANCH EXISTS: ${owner}/${repo}/${branch}"
      KEEP+=("${dir}|${owner}|${repo}|${branch}|${days}")
    else
      echo " BRANCH DELETED: ${owner}/${repo}/${branch} -> DELETE"
      TO_DELETE+=("$dir")
    fi
  else
    KEEP+=("${dir}|${owner}|${repo}|${branch}|${days}")
  fi
done

echo ""
echo "=== Phase 3: apply retention rules to remaining reports ==="

# Number of reports kept so far, per owner/repo/branch.
declare -A BRANCH_COUNTS=()

if [ "${#KEEP[@]}" -gt 0 ]; then
  # Newest first within each owner/repo/branch, so that the keepMin cap
  # retains the most recent reports and drops the older surplus.
  # NOTE(review): keepMin is applied as an upper bound on reports kept per
  # branch (as in the previous implementation) - confirm that is the intent.
  while IFS='|' read -r dir owner repo branch days; do
    max_age=$(rule_max_age "$branch")
    keep_min=$(rule_keep_min "$branch")

    if [ "$days" -gt "$max_age" ]; then
      echo " DELETE: ${dir} (age ${days}d > maxAge ${max_age}d, branch ${branch})"
      TO_DELETE+=("$dir")
      continue
    fi

    # Count per repository branch; a bare branch name would make e.g. "main"
    # of unrelated repositories share one quota.
    key="${owner}/${repo}/${branch}"
    count="${BRANCH_COUNTS[$key]:-0}"
    count=$((count + 1))
    BRANCH_COUNTS["$key"]=$count

    if [ "$count" -gt "$keep_min" ]; then
      echo " DELETE: ${dir} (kept ${keep_min}/${count}, exceeds keepMin, branch ${branch})"
      TO_DELETE+=("$dir")
    fi
  done < <(printf '%s\n' "${KEEP[@]}" | LC_ALL=C sort -t'|' -k2,2 -k3,3 -k4,4 -k5,5n)
fi

if [ "${#TO_DELETE[@]}" -eq 0 ]; then
  echo "Nothing to delete"
  exit 0
fi

echo ""
echo "=== Phase 4: full site rebuild ==="
echo "Rebuilding site (${#TO_DELETE[@]} report(s) to delete)..."

ARCHIVE_FILE=""
SITE_DIR=""
NEW_TAR=""

# Remove the phase 4 temporary files; installed before they are created so an
# early failure cannot leak them.
cleanup_phase4() {
  local tmp
  for tmp in "$ARCHIVE_FILE" "$NEW_TAR"; do
    [ -z "$tmp" ] || rm -f -- "$tmp"
  done
  [ -z "$SITE_DIR" ] || rm -rf -- "$SITE_DIR"
}
trap cleanup_phase4 EXIT

ARCHIVE_FILE=$(mktemp)
SITE_DIR=$(mktemp -d)
NEW_TAR=$(mktemp)

# Try archive.tar first
echo "Downloading archive.tar..."
HTTP_CODE=$(curl_with_host -o "$ARCHIVE_FILE" -w "%{http_code}" -sS "${PAGES_URL}/.git-pages/archive.tar")

if [ "$HTTP_CODE" = "200" ] && tar -tf "$ARCHIVE_FILE" > /dev/null 2>&1; then
  echo "Extracting archive..."
  tar -xf "$ARCHIVE_FILE" -C "$SITE_DIR"

  for dir in "${TO_DELETE[@]}"; do
    if [ -d "$SITE_DIR/$dir" ]; then
      echo " Removing: $dir"
      # :? aborts instead of widening the rm if either part is ever empty
      rm -rf -- "${SITE_DIR:?}/${dir:?}"
    fi
  done
else
  echo "archive.tar failed (HTTP ${HTTP_CODE}) - falling back to manifest-based rebuild"

  ALL_PATHS=$(jq -r '(.contents // {}) | keys[]' <<< "$MANIFEST" 2>/dev/null || true)
  if [ -z "$ALL_PATHS" ]; then
    echo "ERROR: no files in manifest - cannot rebuild" >&2
    exit 1
  fi

  declare -a KEEP_PATHS=()
  while IFS= read -r path; do
    [ -n "$path" ] || continue
    path_is_deleted "$path" || KEEP_PATHS+=("$path")
  done <<< "$ALL_PATHS"

  if [ "${#KEEP_PATHS[@]}" -eq 0 ]; then
    echo "No files to keep - site will be empty"
    mkdir -p "$SITE_DIR/__placeholder__"
    echo "placeholder" > "$SITE_DIR/__placeholder__/index.html"
  else
    echo "Downloading ${#KEEP_PATHS[@]} file(s)..."
    failed_downloads=0
    for path in "${KEEP_PATHS[@]}"; do
      target_dir=$(dirname -- "$SITE_DIR/$path")
      mkdir -p "$target_dir"
      # -f: never store an HTTP error body as if it were the page content
      curl_with_host -f -o "$SITE_DIR/$path" "${PAGES_URL}/${path}" || {
        echo " WARN: failed to download ${path}"
        failed_downloads=$((failed_downloads + 1))
      }
    done
    # The PUT below replaces the whole site, so uploading a partial copy would
    # permanently drop every file that failed to download.
    if [ "$failed_downloads" -gt 0 ]; then
      echo "ERROR: ${failed_downloads} file(s) could not be downloaded - aborting rebuild, site left unchanged" >&2
      exit 1
    fi
  fi
fi

if [ -z "$(ls -A "$SITE_DIR" 2>/dev/null)" ]; then
  echo "Site is empty - creating placeholder"
  mkdir -p "$SITE_DIR/__placeholder__"
  echo "placeholder" > "$SITE_DIR/__placeholder__/index.html"
fi

tar -cf "$NEW_TAR" -C "$SITE_DIR" .

echo "PUT: replacing site contents..."
HTTP_CODE=$(curl_with_host -X PUT "${PAGES_URL}/" \
  -H "Content-Type: application/x-tar" \
  --data-binary @"${NEW_TAR}" \
  -w "%{http_code}" \
  -o /dev/null)

echo "HTTP ${HTTP_CODE}"
case "$HTTP_CODE" in
  200 | 201 | 204)
    echo "Site rebuild completed."
    ;;
  *)
    echo "ERROR: PUT HTTP ${HTTP_CODE}" >&2
    exit 1
    ;;
esac