feat(github): per-repo commit enumeration for full history backfill

Adds a new github-repo EventSource that enumerates all repos via
/user/repos and walks each repo's /commits?author= endpoint, which
has no 1000-result cap unlike the Search API. Events use the same
github-commit:{sha} ID scheme as github_search for dedup. Per-repo
poller state enables full backfill on first run, page-1-only on
subsequent polls. Weekly poll interval by default.

Closes #1

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-05 14:59:26 +03:00
parent 2da9461b44
commit a71b4e6b84
5 changed files with 350 additions and 0 deletions

View File

@@ -0,0 +1,325 @@
//! Per-repo commit enumeration for full GitHub history.
//!
//! The Search API caps at 1000 results; this source enumerates all repos
//! the user can access via `/user/repos` and walks each repo's commit
//! history via `/repos/{owner}/{repo}/commits?author={user}` — no cap.
//!
//! Events use `github-commit:{sha}` as their ID, matching the scheme in
//! `github_search`, so duplicates are resolved via idempotent upsert.
//!
//! Per-repo poller state keys (`github-repo:{owner}/{repo}`) record which
//! repos have already been scanned. The first scan of a repo paginates its
//! history (up to `MAX_BACKFILL_PAGES` pages); subsequent scans fetch only
//! page 1.
use std::sync::Arc;
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use moments_core::{EventSource, EventWriter, PollerStateStore, SourceError};
use moments_entities::{Event, Source};
use reqwest::{Client, header};
use serde_json::Value;
use tracing::{debug, warn};
/// Name returned by `EventSource::name` for this source; also the
/// poller-state key touched at the end of every poll.
const SOURCE_NAME: &str = "github-repo";
/// Value of the `User-Agent` header sent with every request made by this
/// source. The crate version is embedded at compile time.
const USER_AGENT: &str = concat!(
"moments/",
env!("CARGO_PKG_VERSION"),
" (+https://rob.tn)"
);
/// Upper bound on the number of commit pages fetched for a repo that has no
/// stored poller state yet (its first scan).
const MAX_BACKFILL_PAGES: u32 = 100;
/// Settings for [`GithubRepoSource`].
#[derive(Clone, Debug)]
pub struct GithubRepoConfig {
/// Login passed as the `author=` filter when listing a repo's commits.
pub user: String,
/// Optional token sent as `Authorization: Bearer …`. Without a token,
/// repo discovery returns no repos, so a poll ingests nothing.
pub token: Option<String>,
/// Page size requested from both the repo listing and the commit listing.
pub per_page: u32,
}
impl Default for GithubRepoConfig {
    /// Token-less configuration for user `grenade` with 100 items per page.
    fn default() -> Self {
        let user = String::from("grenade");
        Self {
            user,
            token: None,
            per_page: 100,
        }
    }
}
/// Event source that lists the repos visible to the configured token and
/// ingests the configured user's commits from each of them.
pub struct GithubRepoSource {
/// HTTP client used for all GitHub API requests.
client: Client,
/// Sink that parsed commit events are upserted into.
writer: Arc<dyn EventWriter>,
/// Poller-state store; holds one key per scanned repo plus one for the
/// source itself.
state: Arc<dyn PollerStateStore>,
/// User, token and page-size settings.
config: GithubRepoConfig,
}
impl GithubRepoSource {
pub fn new(
client: Client,
writer: Arc<dyn EventWriter>,
state: Arc<dyn PollerStateStore>,
config: GithubRepoConfig,
) -> Self {
Self {
client,
writer,
state,
config,
}
}
fn apply_headers(&self, mut req: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
req = req
.header(header::ACCEPT, "application/vnd.github+json")
.header("X-GitHub-Api-Version", "2022-11-28")
.header(header::USER_AGENT, USER_AGENT);
if let Some(token) = &self.config.token {
req = req.header(header::AUTHORIZATION, format!("Bearer {token}"));
}
req
}
/// Discover all repos the authenticated user can access.
async fn discover_repos(&self) -> Result<Vec<Repo>, SourceError> {
if self.config.token.is_none() {
return Ok(vec![]);
}
let mut repos = Vec::new();
for page in 1..=50 {
let url = format!(
"https://api.github.com/user/repos?affiliation=owner,collaborator,organization_member&visibility=all&per_page={}&page={}",
self.config.per_page, page
);
let req = self.apply_headers(self.client.get(&url));
let resp = req
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
if !resp.status().is_success() {
return Err(SourceError::Http(format!("{} GET {}", resp.status(), url)));
}
let items: Vec<Value> = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
if items.is_empty() {
break;
}
for item in &items {
if let Some(r) = parse_repo(item) {
repos.push(r);
}
}
if items.len() < self.config.per_page as usize {
break;
}
}
Ok(repos)
}
/// Fetch commits for a single repo, paginating fully on first run.
async fn scan_repo(&self, repo: &Repo) -> Result<usize, SourceError> {
let state_key = format!("github-repo:{}", repo.full_name);
let prior = self.state.load(&state_key).await?;
let first_run = prior.is_none();
let max_pages = if first_run { MAX_BACKFILL_PAGES } else { 1 };
let mut total = 0usize;
for page in 1..=max_pages {
let url = format!(
"https://api.github.com/repos/{}/commits?author={}&per_page={}&page={}",
repo.full_name, self.config.user, self.config.per_page, page
);
let req = self.apply_headers(self.client.get(&url));
let resp = req
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
let status = resp.status();
// 409 = empty repo (no commits at all), not an error
if status.as_u16() == 409 {
break;
}
if status.as_u16() == 403 || status.as_u16() == 429 {
warn!(repo = %repo.full_name, status = %status, "rate limited; stopping early");
return Err(SourceError::Http(format!("{} GET {}", status, url)));
}
if status.as_u16() == 404 {
warn!(repo = %repo.full_name, "repo not found; skipping");
break;
}
if !status.is_success() {
return Err(SourceError::Http(format!("{} GET {}", status, url)));
}
let items: Vec<Value> = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
if items.is_empty() {
break;
}
let events: Vec<Event> = items
.iter()
.filter_map(|item| parse_commit(item, repo))
.collect();
total += self.writer.upsert_events(&events).await?;
if items.len() < self.config.per_page as usize {
break;
}
}
self.state.touch(&state_key).await?;
Ok(total)
}
}
#[async_trait]
impl EventSource for GithubRepoSource {
fn name(&self) -> &'static str {
SOURCE_NAME
}
async fn poll(&self) -> Result<usize, SourceError> {
let repos = self.discover_repos().await?;
debug!(repos = repos.len(), "discovered github repos");
let mut total = 0usize;
for repo in &repos {
match self.scan_repo(repo).await {
Ok(n) => {
if n > 0 {
debug!(repo = %repo.full_name, ingested = n, "repo commit scan complete");
}
total += n;
}
Err(SourceError::Http(ref msg)) if msg.starts_with("403") || msg.starts_with("429") => {
warn!("rate limited during repo scan; ending poll early");
break;
}
Err(e) => {
warn!(repo = %repo.full_name, error = %e, "repo scan failed; continuing");
}
}
}
self.state.touch(SOURCE_NAME).await?;
debug!(ingested = total, repos = repos.len(), "github-repo poll complete");
Ok(total)
}
}
/// Minimal view of a repository taken from a `/user/repos` listing item.
#[derive(Debug, Clone)]
struct Repo {
/// `owner/name` identifier; interpolated into the commits URL and into the
/// per-repo poller-state key.
full_name: String,
/// Whether the repo is private; commit events are emitted with
/// `public = !private`.
private: bool,
}
/// Extracts a [`Repo`] from one item of a repository listing.
///
/// Returns `None` when `full_name` is missing or not a string.
///
/// A missing or non-boolean `private` flag is treated as private. Commit
/// events derive their `public` flag from this value, so failing closed keeps
/// a malformed item from exposing a private repo's commits as public.
fn parse_repo(item: &Value) -> Option<Repo> {
    let full_name = item.get("full_name").and_then(Value::as_str)?.to_owned();
    let private = item
        .get("private")
        .and_then(Value::as_bool)
        .unwrap_or(true);
    Some(Repo { full_name, private })
}
/// Converts one item of a repo's commit listing into an [`Event`].
///
/// The event id is `github-commit:{sha}`. The timestamp is taken from
/// `commit.author.date`, falling back to `commit.committer.date`, and is
/// normalised to UTC. The whole item is stored as the payload, and the event
/// is public exactly when the repo is not private.
///
/// Returns `None` when the sha is missing, when neither date is a string, or
/// when the chosen date is not valid RFC 3339.
fn parse_commit(item: &Value, repo: &Repo) -> Option<Event> {
    /// String value at a JSON pointer path, if present.
    fn text_at<'v>(value: &'v Value, path: &str) -> Option<&'v str> {
        value.pointer(path).and_then(Value::as_str)
    }

    let sha = text_at(item, "/sha")?;
    let date_str = match text_at(item, "/commit/author/date") {
        Some(authored) => authored,
        None => text_at(item, "/commit/committer/date")?,
    };
    let occurred_at = match DateTime::parse_from_rfc3339(date_str) {
        Ok(parsed) => parsed.with_timezone(&Utc),
        Err(_) => return None,
    };

    let event = Event {
        id: format!("github-commit:{sha}"),
        source: Source::Github,
        action: "Commit".into(),
        occurred_at,
        public: !repo.private,
        payload: item.clone(),
    };
    Some(event)
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds a `Repo` fixture with the given name and visibility.
    fn repo_fixture(full_name: &str, private: bool) -> Repo {
        Repo {
            full_name: full_name.into(),
            private,
        }
    }

    /// The event id is derived from the commit sha, and a commit in a
    /// non-private repo is public.
    #[test]
    fn parse_commit_uses_sha_as_id() {
        let repo = repo_fixture("grenade/moments", false);
        let raw = json!({
            "sha": "abc123",
            "commit": {
                "author": { "date": "2024-01-15T10:30:00Z" },
                "message": "fix something"
            }
        });
        let ev = parse_commit(&raw, &repo).expect("parses");
        assert_eq!(ev.id, "github-commit:abc123");
        assert_eq!(ev.action, "Commit");
        assert!(ev.public);
    }

    /// A commit in a private repo is not public.
    #[test]
    fn parse_commit_private_repo() {
        let repo = repo_fixture("grenade/secret", true);
        let raw = json!({
            "sha": "def456",
            "commit": {
                "author": { "date": "2024-01-15T10:30:00Z" },
                "message": "secret change"
            }
        });
        let ev = parse_commit(&raw, &repo).expect("parses");
        assert!(!ev.public);
    }

    /// A commit without an author date still parses via the committer date.
    #[test]
    fn parse_commit_falls_back_to_committer_date() {
        let repo = repo_fixture("grenade/moments", false);
        let raw = json!({
            "sha": "ghi789",
            "commit": {
                "committer": { "date": "2024-02-01T12:00:00Z" },
                "message": "no author date"
            }
        });
        let ev = parse_commit(&raw, &repo).expect("parses");
        assert_eq!(ev.id, "github-commit:ghi789");
    }

    /// Both the name and the visibility flag are read from the listing item.
    #[test]
    fn parse_repo_extracts_fields() {
        let raw = json!({
            "full_name": "grenade/moments",
            "private": false
        });
        let repo = parse_repo(&raw).expect("parses");
        assert_eq!(repo.full_name, "grenade/moments");
        assert!(!repo.private);
    }
}

View File

@@ -1,6 +1,7 @@
pub mod bugzilla;
pub mod gitea;
pub mod github;
pub mod github_repo;
pub mod github_search;
pub mod hg;