feat(hg): revset-based author query, group discovery, one-shot ingest script

Rewrites the hg worker to use json-log?rev=author() which matches the
changeset author (not the pusher), capturing commits landed by sheriffs.
Repos are discovered within configured groups plus individually listed
repos. The worker skips entirely after the first successful backfill.

Adds script/hg-ingest.sh for offline ingestion via local hg clones —
clones one repo at a time, caches extracted changesets to .tsv, inserts
via psql, and sets poller_state when done.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-05 13:45:33 +03:00
parent 1bbe55dc84
commit 88fbbba60b
4 changed files with 284 additions and 112 deletions

141
script/hg-ingest.sh Executable file
View File

@@ -0,0 +1,141 @@
#!/usr/bin/env bash
#
# One-shot hg changeset ingestion via local clones.
#
# Clones each hg repo without a working copy (hg clone --noupdate), one at a
# time, extracts changesets matching the author terms, and inserts them into
# the moments database. Sets poller_state so the worker won't re-scan.
#
# Requirements: hg (mercurial), psql, jq
#
# Usage:
# DATABASE_URL="postgres://..." ./script/hg-ingest.sh
#
# Abort on command failure, on use of an unset variable, and when any stage
# of a pipeline fails.
set -euo pipefail

# Connection string for the moments database; override via the environment.
# The default is a libpq URI, in which the port is part of the host
# (host:5432/dbname) rather than a suffix of the database name. The client
# certificate and key paths are derived from this machine's FQDN, and
# `hostname -f` is only evaluated when DATABASE_URL is not already set.
DATABASE_URL="${DATABASE_URL:-postgres://moments_rw@magrathea.kosherinata.internal:5432/moments?sslmode=verify-full&sslrootcert=/etc/pki/ca-trust/source/anchors/root-internal.pem&sslcert=/etc/pki/tls/misc/$(hostname -f).pem&sslkey=/etc/pki/tls/private/$(hostname -f).pem}"

# Mercurial server the repositories are cloned from; override with HG_HOST.
HG_HOST="${HG_HOST:-hg-edge.mozilla.org}"
# Scratch directory that holds the temporary clone and the per-repo cache
# files; override with HG_WORK_DIR. The default is spelled with $HOME because
# a tilde inside double quotes is not expanded and would otherwise create a
# directory literally named "~" under the current working directory.
WORK_DIR="${HG_WORK_DIR:-$HOME/hg}"
# Repositories to ingest, given as paths relative to https://$HG_HOST/.
# Groups are written out repo by repo rather than discovered at run time.
REPOS=(
  # Main tree and integration branches
  mozilla-central
  integration/mozilla-inbound integration/autoland
  integration/fx-team integration/b2g-inbound

  # build/ group
  build/puppet build/tools
  build/buildbot build/buildbot-configs
  build/slave_health build/mozharness
  build/braindump build/cloud-tools
  build/compare-locales build/nagios-core
  build/partner-repacks build/preproduction
  build/rpm-sources build/talos
  build/tupperware build/ash-mozharness
  build/autoland build/opsi-package-sources
)

# Author terms: each one becomes an author() revset predicate, which matches
# case-insensitively against the changeset author field.
AUTHOR_TERMS=(rthijssen grenade)
# Refuse to run without a database connection string.
: "${DATABASE_URL:?DATABASE_URL must be set}"

# Layout under the work directory:
#   clone/  the single repository clone that exists at any one time
#   cache/  one .tsv of extracted changesets per repository
CLONE_DIR="$WORK_DIR/clone"
CACHE_DIR="$WORK_DIR/cache"
mkdir -p "$WORK_DIR"
mkdir -p "$CACHE_DIR"

# Running total of changesets processed across all repositories.
total=0
# Build the revset once: author('term1') or author('term2') ...
# It depends only on AUTHOR_TERMS, so it is identical for every repo.
revset=""
for term in "${AUTHOR_TERMS[@]}"; do
  revset+="${revset:+ or }author('$term')"
done

# Insert statement fed to psql on stdin. The values are supplied as psql
# variables and expanded with :'name', which quotes them as SQL literals, so
# commit text can never terminate the literal early (hand-built $$...$$
# quoting breaks as soon as a description contains "$$"). psql does not
# interpolate variables in -c commands, hence stdin instead of -c.
insert_sql="INSERT INTO events (id, source, action, occurred_at, public, payload)
VALUES (:'event_id', 'hg', 'Commit', :'occurred_at', true, :'payload'::jsonb)
ON CONFLICT (id) DO NOTHING;"

# Number of repos skipped because cloning or extraction failed.
skipped=0

# For each repo: reuse its cached .tsv if present; otherwise clone it, extract
# the matching changesets into the cache and delete the clone. Then load the
# cached rows into the events table.
for repo in "${REPOS[@]}"; do
  cache_file="$CACHE_DIR/${repo//\//_}.tsv"

  # Skip repos already cached (re-run safe)
  if [[ -f "$cache_file" ]]; then
    echo "[hg-ingest] $repo: using cached results"
  else
    # Remove any previous clone to keep only one on disk
    rm -rf -- "${CLONE_DIR:?}"
    echo "[hg-ingest] cloning $repo"
    # A failed clone is not fatal: report it and move on to the next repo.
    if ! hg clone --noupdate "https://$HG_HOST/$repo" "$CLONE_DIR" 2>/dev/null; then
      echo "[hg-ingest] clone failed: $repo (skipping)" >&2
      skipped=$((skipped + 1))
      continue
    fi

    # Extract matching changesets. Write to a temporary file and rename it
    # only on success, so a failed or interrupted `hg log` never leaves an
    # empty or partial cache file that later runs would treat as complete.
    partial_file="$cache_file.partial"
    if hg log -R "$CLONE_DIR" -r "$revset" \
      --template '{node}\t{author}\t{date|hgdate}\t{desc|firstline}\n' \
      > "$partial_file"; then
      mv -f -- "$partial_file" "$cache_file"
      # Free disk immediately
      rm -rf -- "${CLONE_DIR:?}"
    else
      echo "[hg-ingest] hg log failed: $repo (skipping, nothing cached)" >&2
      rm -f -- "$partial_file"
      rm -rf -- "${CLONE_DIR:?}"
      skipped=$((skipped + 1))
      continue
    fi
  fi

  # Ingest cached results into the database
  count=0
  while IFS=$'\t' read -r node author date_raw desc; do
    [[ -n "$node" ]] || continue

    # {date|hgdate} outputs "timestamp offset" — take just the timestamp
    date_ts="${date_raw%% *}"

    # Build an ISO-8601 UTC timestamp from the unix epoch. Try the GNU date
    # syntax first, then the BSD one; abort with a message if both fail.
    if ! occurred_at=$(date -u -d "@${date_ts}" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null ||
      date -u -r "${date_ts}" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null); then
      echo "[hg-ingest] $repo: cannot convert date '$date_raw' for changeset $node" >&2
      exit 1
    fi

    event_id="hg:${repo}:${node}"

    # Build payload JSON (jq handles all escaping); -c keeps it on one line.
    payload=$(jq -cn \
      --arg node "$node" \
      --arg user "$author" \
      --arg desc "$desc" \
      --arg repo "$repo" \
      --arg host "$HG_HOST" \
      '{node: $node, user: $user, desc: $desc, _repo: $repo, _host: $host}')

    # Insert the event; an existing row with the same id is left untouched.
    # ON_ERROR_STOP makes psql exit non-zero on an SQL error when reading
    # from stdin, so set -e still aborts the run on a failed insert.
    psql -q -v ON_ERROR_STOP=1 \
      -v event_id="$event_id" \
      -v occurred_at="$occurred_at" \
      -v payload="$payload" \
      "$DATABASE_URL" <<<"$insert_sql"

    count=$((count + 1))
  done < "$cache_file"

  if ((count > 0)); then
    echo "[hg-ingest] $repo: $count changesets ingested"
  fi
  total=$((total + count))
done

# Make gaps visible: skipped repos contributed no changesets to this run.
if ((skipped > 0)); then
  echo "[hg-ingest] warning: $skipped repo(s) skipped (clone or extraction failed)" >&2
fi
# Record a fresh last_fetched timestamp for the 'hg' source in poller_state,
# inserting the row if it does not exist yet. Per the script header, this is
# what keeps the worker from re-scanning hg.
mark_done_sql="
INSERT INTO poller_state (source, last_fetched)
VALUES ('hg', now())
ON CONFLICT (source) DO UPDATE SET last_fetched = now();
"
psql "$DATABASE_URL" -q -c "$mark_done_sql"

# Final summary line.
printf '%s\n' "[hg-ingest] done. total: $total changesets"