feat(hg): revset-based author query, group discovery, one-shot ingest script

Rewrites the hg worker to use json-log?rev=author() which matches the
changeset author (not the pusher), capturing commits landed by sheriffs.
Repos are discovered within the configured groups, and individually
listed repos are scanned as well. The worker skips its run entirely
after the first successful backfill.

Adds script/hg-ingest.sh for offline ingestion via local hg clones —
clones one repo at a time, caches extracted changesets to .tsv, inserts
via psql, and sets poller_state when done.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-05 13:45:33 +03:00
parent 1bbe55dc84
commit 88fbbba60b
4 changed files with 284 additions and 112 deletions

View File

@@ -51,21 +51,31 @@ struct Args {
#[arg(long, env = "HG_HOST", default_value = "hg-edge.mozilla.org")]
hg_host: String,
/// Comma-separated mozilla hg repo paths to scan, e.g. "build/puppet,build/tools".
/// Comma-separated repo groups to scan. Repos within each group are
/// discovered via `/{group}/?style=json`.
#[arg(
long,
env = "HG_GROUPS",
value_delimiter = ',',
default_value = "build,integration"
)]
hg_groups: Vec<String>,
/// Comma-separated individual repos to scan (e.g. `mozilla-central`).
#[arg(
long,
env = "HG_REPOS",
value_delimiter = ',',
default_value = "build/puppet,build/tools,build/buildbot-configs"
default_value = "mozilla-central"
)]
hg_repos: Vec<String>,
/// Comma-separated case-insensitive substrings matched against changeset author fields.
/// Comma-separated author substrings for `author()` revset queries.
#[arg(
long,
env = "HG_AUTHOR_TERMS",
value_delimiter = ',',
default_value = "thijssen,grenade"
default_value = "rthijssen,grenade"
)]
hg_author_terms: Vec<String>,
@@ -141,6 +151,7 @@ async fn main() -> anyhow::Result<()> {
HgConfig {
host: args.hg_host.clone(),
author_terms: args.hg_author_terms.clone(),
groups: args.hg_groups.clone(),
repos: args.hg_repos.clone(),
},
)) as Arc<dyn EventSource>;
@@ -162,7 +173,9 @@ async fn main() -> anyhow::Result<()> {
gitea_host = args.gitea_host,
gitea_user = args.gitea_user,
hg_host = args.hg_host,
hg_groups = ?args.hg_groups,
hg_repos = ?args.hg_repos,
hg_author_terms = ?args.hg_author_terms,
bugzilla_host = args.bugzilla_host,
bugzilla_email = args.bugzilla_email,
events_interval_secs = args.interval_secs,