feat(hg): revset-based author query, group discovery, one-shot ingest script
Rewrites the hg worker to use json-log?rev=author() which matches the changeset author (not the pusher), capturing commits landed by sheriffs. Repos are discovered within configured groups plus individually listed repos. The worker skips entirely after the first successful backfill. Adds script/hg-ingest.sh for offline ingestion via local hg clones — clones one repo at a time, caches extracted changesets to .tsv, inserts via psql, and sets poller_state when done. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
141
script/hg-ingest.sh
Executable file
141
script/hg-ingest.sh
Executable file
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# One-shot hg changeset ingestion via local clones.
|
||||
#
|
||||
# Clones each hg repo without a working copy (hg clone --noupdate), extracts changesets matching author terms,
|
||||
# and inserts them into the moments database. Sets poller_state so the
|
||||
# worker won't re-scan.
|
||||
#
|
||||
# Requirements: hg (mercurial), psql, jq
|
||||
#
|
||||
# Usage:
|
||||
# DATABASE_URL="postgres://..." ./script/hg-ingest.sh
|
||||
#
|
||||
set -euo pipefail
|
||||
|
||||
# Connection settings. Each value can be overridden from the environment.
#
# DATABASE_URL  libpq connection URI for the moments database. The default is
#               only built when the caller did not supply one, so hostname(1)
#               is not invoked otherwise. URI layout is host[:port]/dbname, so
#               the port must precede the database name.
# HG_HOST       Mercurial server to clone from.
# HG_WORK_DIR   Scratch directory for the clone and the per-repo cache files.
if [[ -z "${DATABASE_URL:-}" ]]; then
  # Client certificate and key paths are derived from this machine's FQDN.
  host_fqdn="$(hostname -f)"
  DATABASE_URL="postgres://moments_rw@magrathea.kosherinata.internal:5432/moments?sslmode=verify-full&sslrootcert=/etc/pki/ca-trust/source/anchors/root-internal.pem&sslcert=/etc/pki/tls/misc/${host_fqdn}.pem&sslkey=/etc/pki/tls/private/${host_fqdn}.pem"
fi

HG_HOST="${HG_HOST:-hg-edge.mozilla.org}"

# A tilde inside double quotes is never expanded, so spell the default with
# $HOME; "~/hg" would create a directory literally named "~".
WORK_DIR="${HG_WORK_DIR:-$HOME/hg}"
|
||||
|
||||
# Repositories to clone, as paths relative to https://$HG_HOST/.
# Groups are written out in full rather than discovered at run time.
REPOS=()

# Main tree and integration branches
REPOS+=(
  mozilla-central
  integration/mozilla-inbound
  integration/autoland
  integration/fx-team
  integration/b2g-inbound
)

# build/* group
REPOS+=(
  build/puppet
  build/tools
  build/buildbot
  build/buildbot-configs
  build/slave_health
  build/mozharness
  build/braindump
  build/cloud-tools
  build/compare-locales
  build/nagios-core
  build/partner-repacks
  build/preproduction
  build/rpm-sources
  build/talos
  build/tupperware
  build/ash-mozharness
  build/autoland
  build/opsi-package-sources
)
|
||||
|
||||
# Substrings looked for in each changeset's author field; every term becomes
# one author('...') clause of the revset, which matches case-insensitively.
AUTHOR_TERMS=(
  "rthijssen"
  "grenade"
)
|
||||
|
||||
# Abort early when no connection string is available.
: "${DATABASE_URL:?DATABASE_URL must be set}"

# Running total of changesets processed across all repos.
total=0

# Scratch layout under the work directory:
#   clone/  the single repository currently being examined
#   cache/  one .tsv of extracted changesets per repository
CLONE_DIR="${WORK_DIR}/clone"
CACHE_DIR="${WORK_DIR}/cache"

mkdir -p "${WORK_DIR}"
mkdir -p "${CACHE_DIR}"
|
||||
|
||||
#######################################
# Join author terms into a Mercurial revset:
#   author('a') or author('b') ...
# Arguments: zero or more author substrings
# Outputs:   the revset on stdout (empty string when no terms are given)
#######################################
hg_author_revset() {
  local expr="" term
  for term in "$@"; do
    if [[ -z "$expr" ]]; then
      expr="author('$term')"
    else
      expr="$expr or author('$term')"
    fi
  done
  printf '%s' "$expr"
}

#######################################
# Convert a unix epoch to an ISO-8601 UTC timestamp.
# Tries the GNU date syntax first and falls back to the BSD one.
# Arguments: $1 - seconds since the epoch
# Outputs:   YYYY-MM-DDTHH:MM:SSZ on stdout
# Returns:   non-zero when neither date invocation succeeds
#######################################
hg_epoch_to_iso() {
  local epoch=$1
  date -u -d "@${epoch}" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null \
    || date -u -r "${epoch}" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null
}

#######################################
# Insert one hg commit into the events table; existing ids are left alone.
# Values are handed to psql as variables and interpolated with :'name', so
# psql performs the literal quoting and commit text (which may contain quotes
# or "$$") cannot terminate the literal. psql does not interpolate variables
# in -c commands, hence the statement is fed on stdin.
# Globals:   DATABASE_URL (read)
# Arguments: $1 - event id, $2 - occurred_at timestamp, $3 - payload JSON
# Returns:   psql's exit status (ON_ERROR_STOP makes SQL errors non-zero)
#######################################
hg_upsert_event() {
  local event_id=$1 occurred_at=$2 payload=$3
  psql "$DATABASE_URL" -q -v ON_ERROR_STOP=1 \
    -v event_id="$event_id" \
    -v occurred_at="$occurred_at" \
    -v payload="$payload" <<'SQL'
INSERT INTO events (id, source, action, occurred_at, public, payload)
VALUES (:'event_id', 'hg', 'Commit', :'occurred_at', true, CAST(:'payload' AS jsonb))
ON CONFLICT (id) DO NOTHING;
SQL
}

# The revset depends only on AUTHOR_TERMS, so build it once for all repos.
revset=$(hg_author_revset "${AUTHOR_TERMS[@]}")

for repo in "${REPOS[@]}"; do
  # Cache file name is the repo path with '/' replaced by '_'.
  cache_file="$CACHE_DIR/${repo//\//_}.tsv"

  # Skip repos already cached (re-run safe)
  if [[ -f "$cache_file" ]]; then
    echo "[hg-ingest] $repo: using cached results"
  else
    # Remove any previous clone to keep only one on disk
    rm -rf -- "${CLONE_DIR:?}"

    echo "[hg-ingest] cloning $repo"
    # A failed clone skips the repo rather than aborting the run; hg's stderr
    # is discarded, as in the original.
    if ! hg clone --noupdate "https://$HG_HOST/$repo" "$CLONE_DIR" 2>/dev/null; then
      echo "[hg-ingest] clone failed: $repo (skipping)" >&2
      continue
    fi

    # Extract matching changesets. Write to a temporary name and rename only
    # on success, so a failed extraction never leaves behind an empty or
    # partial cache file that a later run would treat as complete.
    partial_file="${cache_file}.partial"
    if hg log -R "$CLONE_DIR" -r "$revset" \
      --template '{node}\t{author}\t{date|hgdate}\t{desc|firstline}\n' \
      > "$partial_file" 2>/dev/null; then
      mv -- "$partial_file" "$cache_file"
    else
      echo "[hg-ingest] hg log failed: $repo (skipping)" >&2
      rm -f -- "$partial_file"
      rm -rf -- "${CLONE_DIR:?}"
      continue
    fi

    # Free disk immediately
    rm -rf -- "${CLONE_DIR:?}"
  fi

  # Ingest cached results into the database. The cache is read on fd 3 so
  # that commands run inside the loop cannot consume it through stdin.
  # Tab is an IFS whitespace character, so empty middle fields would collapse.
  count=0
  while IFS=$'\t' read -r -u 3 node author date_raw desc || [[ -n "$node" ]]; do
    [[ -z "$node" ]] && continue

    # {date|hgdate} outputs "timestamp offset" — take just the timestamp
    date_ts="${date_raw%% *}"
    occurred_at=$(hg_epoch_to_iso "$date_ts")

    event_id="hg:${repo}:${node}"

    # Build payload JSON (jq handles all escaping)
    payload=$(jq -n \
      --arg node "$node" \
      --arg user "$author" \
      --arg desc "$desc" \
      --arg repo "$repo" \
      --arg host "$HG_HOST" \
      '{node: $node, user: $user, desc: $desc, _repo: $repo, _host: $host}')

    hg_upsert_event "$event_id" "$occurred_at" "$payload"

    # Counts rows processed, including ones that already existed.
    count=$((count + 1))
  done 3< "$cache_file"

  if [[ "$count" -gt 0 ]]; then
    echo "[hg-ingest] $repo: $count changesets ingested"
  fi
  total=$((total + count))
done
|
||||
|
||||
# Upsert the poller_state row for 'hg' with the current time so that the
# worker skips hg.
mark_sql="
INSERT INTO poller_state (source, last_fetched)
VALUES ('hg', now())
ON CONFLICT (source) DO UPDATE SET last_fetched = now();
"
psql "$DATABASE_URL" -q -c "$mark_sql"

# Final summary line.
printf '%s\n' "[hg-ingest] done. total: $total changesets"
|
||||
Reference in New Issue
Block a user