feat(github): per-repo commit enumeration for full history backfill

Adds a new github-repo EventSource that enumerates all repos via
/user/repos and walks each repo's /commits?author= endpoint, which,
unlike the Search API, has no 1000-result cap. Events use the same
github-commit:{sha} ID scheme as github_search, so they deduplicate
against it. Per-repo poller state enables a full backfill on the
first run and page-1-only fetches on subsequent polls. The poll
interval defaults to weekly.

Closes #1

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-05 14:59:26 +03:00
parent 2da9461b44
commit a71b4e6b84
5 changed files with 350 additions and 0 deletions

View File

@@ -7,6 +7,7 @@ use moments_data::{
bugzilla::{BugzillaConfig, BugzillaSource},
gitea::{GiteaConfig, GiteaSource},
github::{GithubConfig, GithubSource},
github_repo::{GithubRepoConfig, GithubRepoSource},
github_search::{GithubSearchConfig, GithubSearchSource},
hg::{HgConfig, HgSource},
};
@@ -35,6 +36,11 @@ struct Args {
#[arg(long, env = "SEARCH_POLL_INTERVAL_SECS", default_value = "86400")]
search_interval_secs: u64,
/// Seconds between per-repo commit enumeration polls (full history backfill).
/// Defaults to weekly — expensive initial scan, cheap afterwards.
#[arg(long, env = "REPO_POLL_INTERVAL_SECS", default_value = "604800")]
repo_interval_secs: u64,
#[arg(long, env = "GITEA_HOST", default_value = "git.lair.cafe")]
gitea_host: String,
@@ -132,6 +138,17 @@ async fn main() -> anyhow::Result<()> {
},
)) as Arc<dyn EventSource>;
let github_repo = Arc::new(GithubRepoSource::new(
http.clone(),
store.clone(),
store.clone(),
GithubRepoConfig {
user: args.github_user.clone(),
token: args.github_token.clone(),
..Default::default()
},
)) as Arc<dyn EventSource>;
let gitea = Arc::new(GiteaSource::new(
http.clone(),
store.clone(),
@@ -180,6 +197,7 @@ async fn main() -> anyhow::Result<()> {
bugzilla_email = args.bugzilla_email,
events_interval_secs = args.interval_secs,
search_interval_secs = args.search_interval_secs,
repo_interval_secs = args.repo_interval_secs,
gitea_interval_secs = args.gitea_interval_secs,
hg_interval_secs = args.hg_interval_secs,
bugzilla_interval_secs = args.bugzilla_interval_secs,
@@ -188,6 +206,7 @@ async fn main() -> anyhow::Result<()> {
let interval = Duration::from_secs(args.interval_secs);
let search_interval = Duration::from_secs(args.search_interval_secs);
let repo_interval = Duration::from_secs(args.repo_interval_secs);
let gitea_interval = Duration::from_secs(args.gitea_interval_secs);
let hg_interval = Duration::from_secs(args.hg_interval_secs);
let bugzilla_interval = Duration::from_secs(args.bugzilla_interval_secs);
@@ -195,6 +214,8 @@ async fn main() -> anyhow::Result<()> {
let github_task = tokio::spawn(async move { run_poller(github, interval).await });
let github_search_task =
tokio::spawn(async move { run_poller(github_search, search_interval).await });
let github_repo_task =
tokio::spawn(async move { run_poller(github_repo, repo_interval).await });
let gitea_task = tokio::spawn(async move { run_poller(gitea, gitea_interval).await });
let hg_task = tokio::spawn(async move { run_poller(hg, hg_interval).await });
let bugzilla_task =
@@ -204,6 +225,7 @@ async fn main() -> anyhow::Result<()> {
info!("shutdown signal received");
github_task.abort();
github_search_task.abort();
github_repo_task.abort();
gitea_task.abort();
hg_task.abort();
bugzilla_task.abort();