Files
moments/script/hg-ingest.sh
rob thijssen a71b4e6b84 feat(github): per-repo commit enumeration for full history backfill
Adds a new github-repo EventSource that enumerates all repos via
/user/repos and walks each repo's /commits?author= endpoint, which
has no 1000-result cap unlike the Search API. Events use the same
github-commit:{sha} ID scheme as github_search for dedup. Per-repo
poller state enables full backfill on first run, page-1-only on
subsequent polls. Weekly poll interval by default.

Closes #1

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-05 14:59:26 +03:00

142 lines
3.9 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# One-shot hg changeset ingestion via local clones.
#
# Clones each hg repo without a working copy (hg clone --noupdate),
# extracts changesets matching author terms, and inserts them into the
# moments database. Sets poller_state so the worker won't re-scan.
#
# Requirements: hg (mercurial), psql, jq, GNU date (for `date -d @<epoch>`)
#
# Usage:
# DATABASE_URL="postgres://..." ./script/hg-ingest.sh
#
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail
# Connection string for the moments database. When DATABASE_URL is unset or
# empty, fall back to a default whose client certificate and key paths embed
# this machine's FQDN (hostname -f is only run when the default is needed).
DATABASE_URL="${DATABASE_URL:-postgres://moments_rw@magrathea.kosherinata.internal:5432/moments?sslmode=verify-full&sslrootcert=/etc/pki/ca-trust/source/anchors/root-internal.pem&sslcert=/etc/pki/tls/misc/$(hostname -f).pem&sslkey=/etc/pki/tls/private/$(hostname -f).pem}"
# Mercurial server the repos are cloned from (override with HG_HOST).
HG_HOST="${HG_HOST:-hg-edge.mozilla.org}"
# Scratch directory for the temporary clone and the per-repo cache
# (override with HG_WORK_DIR).
WORK_DIR="${HG_WORK_DIR:-$HOME/hg}"
# Repos to clone, as paths relative to https://$HG_HOST/ (groups are
# expanded inline)
REPOS=(
mozilla-central
integration/mozilla-inbound
integration/autoland
integration/fx-team
integration/b2g-inbound
build/puppet
build/tools
build/buildbot
build/buildbot-configs
build/slave_health
build/mozharness
build/braindump
build/cloud-tools
build/compare-locales
build/nagios-core
build/partner-repacks
build/preproduction
build/rpm-sources
build/talos
build/tupperware
build/ash-mozharness
build/autoland
build/opsi-package-sources
)
# Author terms — matched case-insensitively against changeset author fields
AUTHOR_TERMS=("rthijssen" "grenade")
# NOTE(review): DATABASE_URL has already been given a non-empty default above,
# so this guard can never fire as written; it only becomes meaningful if that
# default is removed.
: "${DATABASE_URL:?DATABASE_URL must be set}"
mkdir -p "$WORK_DIR"
# Running count of changesets processed across all repos.
total=0
# Single clone location, reused (and deleted) for each repo in turn.
CLONE_DIR="$WORK_DIR/clone"
# One <repo-with-slashes-as-underscores>.tsv of extracted changesets per repo.
CACHE_DIR="$WORK_DIR/cache"
mkdir -p "$CACHE_DIR"
cd "$WORK_DIR"
# ---------------------------------------------------------------------------
# Per-repo extraction and ingestion.
#
# For every entry in REPOS: reuse the cached TSV of matching changesets if one
# exists, otherwise clone the repo and extract it; then insert each cached row
# into the events table and add the row count to `total`.
# ---------------------------------------------------------------------------

# SQL for a single event row. The values are supplied as psql variables and
# expanded with :'name', which makes psql quote them as SQL literals. This
# keeps commit text (which may contain quotes or "$$") from ever being parsed
# as SQL.
HG_INGEST_INSERT_SQL="INSERT INTO events (id, source, action, occurred_at, public, payload)
VALUES (:'event_id', 'hg', 'Commit', :'occurred_at', true, CAST(:'payload' AS jsonb))
ON CONFLICT (id) DO NOTHING;"

#######################################
# Build the revset "author('t1') or author('t2') ...".
# Arguments: author terms, one per argument
# Outputs:   the revset on stdout (empty when no terms are given)
#######################################
hg_ingest_revset() {
  local revset="" term
  for term in "$@"; do
    # ${revset:+ or } adds the separator only after the first term.
    revset+="${revset:+ or }author('${term}')"
  done
  printf '%s' "$revset"
}

#######################################
# Clone one repo and write its matching changesets to a cache file.
# The cache file is only created when `hg log` succeeds, so a failed or
# interrupted extraction is retried on the next run instead of being mistaken
# for a finished (empty) result.
# Globals:   HG_HOST, CLONE_DIR (read)
# Arguments: $1 - repo path on the hg host
#            $2 - cache file to create
#            $3.. - author terms
# Returns:   0 on success, 1 when the clone or the log extraction failed
#######################################
hg_ingest_extract() {
  local repo=$1 cache_file=$2
  shift 2
  local partial_file="${cache_file}.partial"
  local revset log_status=0
  revset=$(hg_ingest_revset "$@")

  # Remove any previous clone to keep only one on disk
  rm -rf -- "${CLONE_DIR:?}"
  echo "[hg-ingest] cloning $repo"
  if ! hg clone --noupdate "https://$HG_HOST/$repo" "$CLONE_DIR"; then
    echo "[hg-ingest] clone failed: $repo (skipping)"
    rm -rf -- "${CLONE_DIR:?}"
    return 1
  fi

  # Extract matching changesets. Write to a side file first; it is renamed to
  # the real cache name only after hg reports success.
  hg log -R "$CLONE_DIR" -r "$revset" \
    --template '{node}\t{author}\t{date|hgdate}\t{desc|firstline}\n' \
    > "$partial_file" || log_status=$?

  # Free disk immediately
  rm -rf -- "${CLONE_DIR:?}"

  if (( log_status != 0 )); then
    echo "[hg-ingest] log failed: $repo (skipping)"
    rm -f -- "$partial_file"
    return 1
  fi
  # This function is called from an `if`, where `set -e` is suspended, so the
  # rename's status is returned explicitly as the function result.
  mv -- "$partial_file" "$cache_file"
}

#######################################
# Insert one changeset into the events table (no-op if the id exists).
# Globals:   DATABASE_URL, HG_HOST, HG_INGEST_INSERT_SQL (read)
# Arguments: $1 - repo, $2 - node hash, $3 - author,
#            $4 - "{date|hgdate}" value ("<epoch> <offset>"), $5 - summary
# Returns:   non-zero if date, jq or psql fails
#######################################
hg_ingest_insert() {
  local repo=$1 node=$2 author=$3 date_raw=$4 desc=$5
  local date_ts occurred_at event_id payload

  # {date|hgdate} outputs "timestamp offset" — take just the timestamp
  date_ts="${date_raw%% *}"
  # Build ISO timestamp from unix epoch
  occurred_at=$(date -u -d "@${date_ts}" '+%Y-%m-%dT%H:%M:%SZ') || return
  event_id="hg:${repo}:${node}"

  # Build payload JSON (jq handles all escaping)
  payload=$(jq -n \
    --arg node "$node" \
    --arg user "$author" \
    --arg desc "$desc" \
    --arg repo "$repo" \
    --arg host "$HG_HOST" \
    '{node: $node, user: $user, desc: $desc, _repo: $repo, _host: $host}') || return

  # The SQL goes in on stdin because psql does not interpolate variables in
  # -c commands. ON_ERROR_STOP makes a failed INSERT produce a non-zero exit,
  # as it did with -c.
  psql "$DATABASE_URL" -q \
    -v ON_ERROR_STOP=1 \
    -v event_id="$event_id" \
    -v occurred_at="$occurred_at" \
    -v payload="$payload" \
    <<<"$HG_INGEST_INSERT_SQL"
}

for repo in "${REPOS[@]}"; do
  cache_file="$CACHE_DIR/${repo//\//_}.tsv"

  # Skip extraction for repos already cached (re-run safe)
  if [[ -f "$cache_file" ]]; then
    echo "[hg-ingest] $repo: using cached results"
  elif ! hg_ingest_extract "$repo" "$cache_file" "${AUTHOR_TERMS[@]}"; then
    continue
  fi

  # Ingest cached results into the database
  count=0
  while IFS=$'\t' read -r node author date_raw desc; do
    [[ -z "$node" ]] && continue
    hg_ingest_insert "$repo" "$node" "$author" "$date_raw" "$desc"
    count=$((count + 1))
  done < "$cache_file"

  if [[ "$count" -gt 0 ]]; then
    echo "[hg-ingest] $repo: $count changesets ingested"
  fi
  total=$((total + count))
done
# Record an up-to-date last_fetched for the 'hg' source (insert, or refresh
# the existing row) so the worker skips hg, then print the overall summary.
poller_state_sql="
INSERT INTO poller_state (source, last_fetched)
VALUES ('hg', now())
ON CONFLICT (source) DO UPDATE SET last_fetched = now();
"
psql "$DATABASE_URL" --quiet --command="$poller_state_sql"
printf '%s\n' "[hg-ingest] done. total: $total changesets"