#!/usr/bin/env bash
#
# One-shot hg changeset ingestion via local clones.
#
# Clones each hg repo without a working copy (--noupdate), extracts the
# changesets matching the author terms, and inserts them into the moments
# database. Sets poller_state so the worker won't re-scan.
#
# Extraction results are cached per repo as TSV files under
# $HG_WORK_DIR/cache, so the script is safe to re-run: cached repos are not
# cloned again, and inserts use ON CONFLICT DO NOTHING.
#
# Requirements: hg (mercurial), psql, jq, date with -d support (GNU coreutils)
#
# Usage:
#   DATABASE_URL="postgres://..." ./script/hg-ingest.sh
#
# Optional environment:
#   HG_HOST      host serving the repositories (default: hg-edge.mozilla.org)
#   HG_WORK_DIR  scratch directory for the clone and cache (default: $HOME/hg)
#

set -euo pipefail

DATABASE_URL="${DATABASE_URL:-postgres://moments_rw@magrathea.kosherinata.internal:5432/moments?sslmode=verify-full&sslrootcert=/etc/pki/ca-trust/source/anchors/root-internal.pem&sslcert=/etc/pki/tls/misc/$(hostname -f).pem&sslkey=/etc/pki/tls/private/$(hostname -f).pem}"
HG_HOST="${HG_HOST:-hg-edge.mozilla.org}"
WORK_DIR="${HG_WORK_DIR:-$HOME/hg}"

# Repos to clone (groups are expanded inline)
REPOS=(
  integration/mozilla-inbound
  integration/autoland
  integration/fx-team
  integration/b2g-inbound
  build/puppet
  build/tools
  build/buildbot
  build/buildbot-configs
  build/slave_health
  build/mozharness
  build/braindump
  build/cloud-tools
  build/compare-locales
  build/nagios-core
  build/partner-repacks
  build/preproduction
  build/rpm-sources
  build/talos
  build/tupperware
  build/ash-mozharness
  build/autoland
  build/opsi-package-sources
)

# Author terms — matched case-insensitively against changeset author fields
AUTHOR_TERMS=("rthijssen" "grenade")

# Defensive check; cannot trigger while the default above is non-empty.
: "${DATABASE_URL:?DATABASE_URL must be set}"

# Insert statement fed to psql on stdin. Values arrive as psql variables and
# are quoted by psql itself (:'name'), so commit text can never terminate the
# literal early. psql does not interpolate variables in -c strings, which is
# why this is not passed with -c.
readonly INSERT_EVENT_SQL="INSERT INTO events (id, source, action, occurred_at, public, payload)
VALUES (:'event_id', 'hg', 'Commit', :'occurred_at', true, :'payload'::jsonb)
ON CONFLICT (id) DO NOTHING;"

# Set in main() once WORK_DIR has been made absolute.
CLONE_DIR=""
CACHE_DIR=""

# Number of rows processed by the most recent ingest_cache_file call.
INGEST_COUNT=0

# Progress message on stdout.
log() { printf '[hg-ingest] %s\n' "$*"; }

# Diagnostic message on stderr.
warn() { printf '[hg-ingest] %s\n' "$*" >&2; }

# Print a diagnostic and abort the script.
die() {
  warn "$*"
  exit 1
}

# Abort unless the command named by $1 is available.
require_cmd() {
  command -v "$1" > /dev/null || die "required command not found: $1"
}

# Print the revset "author('t1') or author('t2') ..." built from AUTHOR_TERMS.
build_revset() {
  local term revset=""
  for term in "${AUTHOR_TERMS[@]}"; do
    revset+="${revset:+ or }author('${term}')"
  done
  printf '%s' "$revset"
}

#######################################
# Clone one repo and write its matching changesets to a cache file.
# The output is written to "<cache_file>.partial" and renamed only when
# hg log succeeds, so a failed extraction is never mistaken for a cached
# result on the next run.
# Globals:   CLONE_DIR, HG_HOST (read)
# Arguments: $1 - repo path, $2 - cache file, $3 - revset
# Returns:   0 on success, non-zero if the clone or extraction failed
#######################################
fetch_changesets() {
  local repo=$1 cache_file=$2 revset=$3
  local partial="${cache_file}.partial"
  local rc=0

  # Remove any previous clone to keep only one on disk
  rm -rf -- "${CLONE_DIR:?}"

  log "cloning $repo"
  if ! hg clone --noupdate "https://$HG_HOST/$repo" "$CLONE_DIR"; then
    warn "clone failed: $repo (skipping)"
    rm -rf -- "${CLONE_DIR:?}"
    return 1
  fi

  # Extract matching changesets, one tab-separated record per line
  hg log -R "$CLONE_DIR" -r "$revset" \
    --template '{node}\t{author}\t{date|hgdate}\t{desc|firstline}\n' \
    > "$partial" || rc=$?

  # Free disk immediately
  rm -rf -- "${CLONE_DIR:?}"

  if ((rc != 0)); then
    warn "hg log failed: $repo (skipping)"
    rm -f -- "$partial"
    return 1
  fi

  mv -- "$partial" "$cache_file"
}

#######################################
# Insert every record of a cache file into the events table.
# Globals:   DATABASE_URL, HG_HOST, INSERT_EVENT_SQL (read)
#            INGEST_COUNT (written: number of records processed)
# Arguments: $1 - repo path, $2 - cache file
#######################################
ingest_cache_file() {
  local repo=$1 cache_file=$2
  local node author date_raw desc date_ts occurred_at event_id payload

  INGEST_COUNT=0

  # The file is read on fd 3 so nothing inside the loop can consume it
  # through stdin.
  while IFS=$'\t' read -r -u 3 node author date_raw desc; do
    [[ -n "$node" ]] || continue

    # {date|hgdate} outputs "timestamp offset" — take just the timestamp
    date_ts="${date_raw%% *}"

    # Build ISO timestamp from unix epoch
    occurred_at=$(date -u -d "@${date_ts}" '+%Y-%m-%dT%H:%M:%SZ')

    event_id="hg:${repo}:${node}"

    # Build payload JSON (jq handles all escaping)
    payload=$(jq -cn \
      --arg node "$node" \
      --arg user "$author" \
      --arg desc "$desc" \
      --arg repo "$repo" \
      --arg host "$HG_HOST" \
      '{node: $node, user: $user, desc: $desc, _repo: $repo, _host: $host}')

    # Upsert into events table. ON_ERROR_STOP makes psql exit non-zero on an
    # SQL error when the statement is read from stdin.
    psql "$DATABASE_URL" -q \
      -v ON_ERROR_STOP=1 \
      -v event_id="$event_id" \
      -v occurred_at="$occurred_at" \
      -v payload="$payload" \
      <<< "$INSERT_EVENT_SQL"

    INGEST_COUNT=$((INGEST_COUNT + 1))
  done 3< "$cache_file"
}

main() {
  local repo cache_file revset
  local total=0
  local failed=()

  require_cmd psql
  require_cmd jq

  # Make WORK_DIR absolute before deriving paths from it, so a relative
  # HG_WORK_DIR still points at the same place after the cd below.
  mkdir -p -- "$WORK_DIR"
  WORK_DIR=$(CDPATH='' cd -- "$WORK_DIR" && pwd)
  CLONE_DIR="$WORK_DIR/clone"
  CACHE_DIR="$WORK_DIR/cache"
  mkdir -p -- "$CACHE_DIR"
  cd -- "$WORK_DIR"

  # Build revset: author(term1) or author(term2) ...
  revset=$(build_revset)

  for repo in "${REPOS[@]}"; do
    cache_file="$CACHE_DIR/${repo//\//_}.tsv"

    # Skip repos already cached (re-run safe)
    if [[ -f "$cache_file" ]]; then
      log "$repo: using cached results"
    else
      # hg is only needed when something actually has to be cloned.
      require_cmd hg
      if ! fetch_changesets "$repo" "$cache_file" "$revset"; then
        failed+=("$repo")
        continue
      fi
    fi

    # Ingest cached results into the database
    ingest_cache_file "$repo" "$cache_file"
    if ((INGEST_COUNT > 0)); then
      log "$repo: $INGEST_COUNT changesets ingested"
    fi
    total=$((total + INGEST_COUNT))
  done

  if ((${#failed[@]} > 0)); then
    # These repos have no cache file, so a re-run retries them.
    warn "${#failed[@]} repo(s) not ingested: ${failed[*]}"
  fi

  # Mark poller state so the worker skips hg
  psql "$DATABASE_URL" -q -c "
    INSERT INTO poller_state (source, last_fetched)
    VALUES ('hg', now())
    ON CONFLICT (source) DO UPDATE SET last_fetched = now();
  "

  log "done. total: $total changesets"
}

main "$@"