From 7772393598e0e4232b0cb1f311ab1e3ece5c73f1 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Sun, 3 May 2026 18:54:32 +0300 Subject: [PATCH] feat(worker): add commits to github search backfill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Walk back the earlier decision to skip /search/commits. The fork inflation that worried me isn't misattribution — those commits really were authored by the user; they just persist in forks after the original repo went away. Skipping them dropped legitimate historical work from the timeline. The duplicate-SHA-across-forks issue is a pure dedup concern: * keyed `github-commit:` (SHA only — globally unique by Git's content addressing; same commit in two forks lands in one row); * within a single page, dedup by id before INSERT (postgres ON CONFLICT errors when the conflict target appears twice in one statement); * across pages and runs, last-write-wins via upsert. The repo association may flip between forks but the commit content is identical. Visibility is read inline from `repository.private` on the search item, no extra lookup needed. Also opportunistically populates the shared visibility cache so the issue loop in the same poll skips /repos/{full_name} GETs for any repo it already saw via commits. Reshape: presentation/github.rs gains a Commit path — short SHA linked, repo linked, first line of the commit message as subtitle. GitCommit icon. Tests: +3 in github_search (parse uses sha as id, marks private, rejects non-github URL), +1 in presentation (commit reshape uses short sha + first message line) — 18 total green. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../moments-core/src/presentation/github.rs | 87 ++++++++- crates/moments-data/src/github_search.rs | 182 ++++++++++++++++-- 2 files changed, 255 insertions(+), 14 deletions(-) diff --git a/crates/moments-core/src/presentation/github.rs b/crates/moments-core/src/presentation/github.rs index ec27e44..c09b9a8 100644 --- a/crates/moments-core/src/presentation/github.rs +++ b/crates/moments-core/src/presentation/github.rs @@ -6,8 +6,10 @@ use serde_json::Value; pub(crate) fn reshape(event: &Event) -> TimelineItem { // Search-API items have a different payload shape (the search item itself // rather than a wrapped event), so dispatch them through a separate path. - if matches!(event.action.as_str(), "Issue" | "PullRequest") { - return search_reshape(event); + match event.action.as_str() { + "Issue" | "PullRequest" => return search_reshape(event), + "Commit" => return commit_reshape(event), + _ => {} } let p = &event.payload; @@ -426,6 +428,58 @@ fn search_reshape(event: &Event) -> TimelineItem { } } +fn commit_reshape(event: &Event) -> TimelineItem { + let p = &event.payload; + let sha = p.get("sha").and_then(Value::as_str).unwrap_or(""); + let short_sha: String = sha.chars().take(7).collect(); + let html_url = p.get("html_url").and_then(Value::as_str).unwrap_or(""); + let message_first_line = p + .get("commit") + .and_then(|c| c.get("message")) + .and_then(Value::as_str) + .unwrap_or("") + .lines() + .next() + .unwrap_or("") + .to_string(); + let repo = p + .get("repository") + .and_then(|r| r.get("full_name")) + .and_then(Value::as_str) + .unwrap_or("(unknown repo)"); + let author_login = p + .get("author") + .and_then(|a| a.get("login")) + .and_then(Value::as_str); + + let mut title = Vec::new(); + if let Some(actor) = author_login { + title.push(TitleSegment::link( + actor.to_string(), + format!("https://github.com/{actor}"), + )); + title.push(TitleSegment::text(" ")); + } + title.push(TitleSegment::text("committed ")); + title.push(TitleSegment::link(short_sha, html_url.to_string())); + title.push(TitleSegment::text(" in ")); + title.push(repo_link(repo)); + + let subtitle = (!message_first_line.is_empty()) + .then(|| vec![TitleSegment::text(message_first_line)]); + + TimelineItem { + id: event.id.clone(), + source: Source::Github, + action: event.action.clone(), + occurred_at: event.occurred_at, + icon: TimelineIcon::GitCommit, + title, + subtitle, + body: None, + } +} + fn repo_from_url(url: &str) -> Option { let stripped = url.strip_prefix("https://github.com/")?; let mut parts = stripped.splitn(3, '/'); @@ -600,6 +654,35 @@ mod tests { ); } + #[test] + fn commit_reshape_uses_short_sha_and_first_message_line() { + let raw = json!({ + "sha": "a6fcefbe909a97ad5a049b9fa48bc74309af10d9", + "html_url": "https://github.com/faith1337z/Trade/commit/a6fcefbe909a97ad5a049b9fa48bc74309af10d9", + "commit": { + "message": "split multiline message into multiple irc messages\n\nbody body body" + }, + "repository": { "full_name": "faith1337z/Trade" }, + "author": { "login": "grenade" } + }); + let item = reshape(&ev("Commit", raw)); + assert_eq!(item.icon, TimelineIcon::GitCommit); + let rendered: String = item + .title + .iter() + .map(|s| match s { + TitleSegment::Text { text } => text.clone(), + TitleSegment::Link { text, .. } => text.clone(), + }) + .collect(); + assert!(rendered.contains("committed a6fcefb in faith1337z/Trade"), "got: {rendered}"); + // body of the commit message is dropped; only first line in subtitle + assert_eq!( + item.subtitle.unwrap(), + vec![TitleSegment::text("split multiline message into multiple irc messages")] + ); + } + #[test] fn unknown_event_falls_back() { let raw = json!({ diff --git a/crates/moments-data/src/github_search.rs b/crates/moments-data/src/github_search.rs index 4abb93c..dc6a43b 100644 --- a/crates/moments-data/src/github_search.rs +++ b/crates/moments-data/src/github_search.rs @@ -1,16 +1,20 @@ //! GitHub Search API ingestion for historical backfill. //! -//! The Events API caps at 90 days; this source uses `/search/issues` with -//! `author:` to recover issues and PRs going back as far as GitHub -//! retains them (1000-result ceiling per the Search API's hard cap). +//! The Events API caps at 90 days; this source uses `/search/issues` and +//! `/search/commits` with `author:` to recover issues, PRs, and +//! commits going back as far as GitHub retains them (1000-result ceiling +//! per query is the Search API's hard cap). //! -//! `/search/commits` is deliberately not used: GitHub matches the same commit -//! across every fork that contains it, inflating result counts and surfacing -//! commits in repos the user never authored to. If commit history becomes -//! desirable we should enumerate the user's repos and walk per-repo -//! `/repos/{o}/{r}/commits?author=...` instead. +//! Fork duplication on /search/commits — the same commit SHA appears in +//! every fork that still contains it — is handled by: +//! * deduplicating by `id = github-commit:` within each batch +//! before upsert (postgres ON CONFLICT errors if the same conflict +//! target appears twice in one INSERT); +//! * upserting with last-write-wins across batches and runs (the SHA +//! is the same; the repo association may flip between forks but the +//! commit itself is identical). -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use async_trait::async_trait; @@ -103,6 +107,68 @@ impl GithubSearchSource { Ok(v.get("private").and_then(Value::as_bool).unwrap_or(false)) } + async fn search_commits( + &self, + vis_cache: &mut HashMap, + ) -> Result { + let mut total = 0usize; + for page in 1..=self.config.max_pages { + let url = format!( + "https://api.github.com/search/commits?q=author:{}&sort=author-date&order=desc&per_page={}&page={}", + self.config.user, self.config.per_page, page + ); + let req = self.apply_headers(self.client.get(&url)); + let resp = req + .send() + .await + .map_err(|e| SourceError::Http(e.to_string()))?; + if !resp.status().is_success() { + return Err(SourceError::Http(format!("{} GET {}", resp.status(), url))); + } + let body: Value = resp + .json() + .await + .map_err(|e| SourceError::Parse(e.to_string()))?; + let items = body + .get("items") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + if items.is_empty() { + break; + } + + // Dedup within the page by id (same commit in multiple forks + // returns multiple search items with the same SHA — postgres + // refuses ON CONFLICT when the conflict target appears twice + // in one INSERT). + let mut seen: HashSet = HashSet::new(); + let mut events = Vec::with_capacity(items.len()); + for item in &items { + if let Some(ev) = parse_commit_event(item) { + if seen.insert(ev.id.clone()) { + // Opportunistically populate the visibility cache so + // search_issues can reuse it for the same repos. + if let Some(repo) = item + .get("repository") + .and_then(|r| r.get("full_name")) + .and_then(Value::as_str) + { + vis_cache.insert(repo.to_string(), !ev.public); + } + events.push(ev); + } + } + } + total += self.writer.upsert_events(&events).await?; + + if items.len() < self.config.per_page as usize { + break; + } + } + Ok(total) + } + async fn search_issues( &self, vis_cache: &mut HashMap, @@ -203,17 +269,65 @@ impl EventSource for GithubSearchSource { async fn poll(&self) -> Result { let mut vis_cache: HashMap = HashMap::new(); - let total = self.search_issues(&mut vis_cache).await?; + // Run commits first so vis_cache is partly seeded with inline-flag + // visibility before the issue loop hits its (more expensive) per-repo + // lookups. + let commits = self.search_commits(&mut vis_cache).await?; + let issues = self.search_issues(&mut vis_cache).await?; self.state.touch(SOURCE_NAME).await?; debug!( - ingested = total, + commits, + issues, unique_repos = vis_cache.len(), "github-search poll complete" ); - Ok(total) + Ok(commits + issues) } } +/// Convert a /search/commits item into our Event row. Returns None if the +/// item is missing required fields. +fn parse_commit_event(item: &Value) -> Option { + let sha = item.get("sha").and_then(Value::as_str)?; + let html_url = item.get("html_url").and_then(Value::as_str)?; + // Prefer author.date — it's when the work was written; committer.date + // can shift on rebase. Either is RFC3339. + let date_str = item + .get("commit") + .and_then(|c| c.get("author")) + .and_then(|a| a.get("date")) + .and_then(Value::as_str) + .or_else(|| { + item.get("commit") + .and_then(|c| c.get("committer")) + .and_then(|c| c.get("date")) + .and_then(Value::as_str) + })?; + let occurred_at = DateTime::parse_from_rfc3339(date_str) + .ok()? + .with_timezone(&Utc); + let private = item + .get("repository") + .and_then(|r| r.get("private")) + .and_then(Value::as_bool) + .unwrap_or(false); + + // Sanity-check the html_url points at github.com so we don't ingest + // garbage if GitHub ever changes its URL shape. + if !html_url.starts_with("https://github.com/") { + return None; + } + + Some(Event { + id: format!("github-commit:{sha}"), + source: Source::Github, + action: "Commit".into(), + occurred_at, + public: !private, + payload: item.clone(), + }) +} + /// Extract `owner/repo` from a github.com URL like /// `https://github.com/owner/repo/{issues,pull}/42`. fn repo_from_html_url(url: &str) -> Option { @@ -247,4 +361,48 @@ mod tests { fn rejects_non_github_host() { assert!(repo_from_html_url("https://gitlab.com/x/y/-/issues/1").is_none()); } + + #[test] + fn parse_commit_uses_sha_as_id() { + let raw = serde_json::json!({ + "sha": "a6fcefbe909a97ad5a049b9fa48bc74309af10d9", + "html_url": "https://github.com/faith1337z/Trade/commit/a6fcefbe909a97ad5a049b9fa48bc74309af10d9", + "commit": { + "author": { "name": "rob", "date": "2017-11-13T23:32:31+02:00" }, + "committer": { "name": "rob", "date": "2017-11-13T22:32:31+01:00" }, + "message": "split multiline message into multiple irc messages" + }, + "repository": { "full_name": "faith1337z/Trade", "private": false } + }); + let ev = parse_commit_event(&raw).expect("parses"); + assert_eq!(ev.id, "github-commit:a6fcefbe909a97ad5a049b9fa48bc74309af10d9"); + assert_eq!(ev.action, "Commit"); + assert!(ev.public); + } + + #[test] + fn parse_commit_marks_private_repo() { + let raw = serde_json::json!({ + "sha": "deadbeef", + "html_url": "https://github.com/grenade/private-repo/commit/deadbeef", + "commit": { + "author": { "date": "2024-01-01T00:00:00Z" }, + "message": "x" + }, + "repository": { "full_name": "grenade/private-repo", "private": true } + }); + let ev = parse_commit_event(&raw).expect("parses"); + assert!(!ev.public); + } + + #[test] + fn parse_commit_rejects_non_github_url() { + let raw = serde_json::json!({ + "sha": "abc", + "html_url": "https://example.com/x/commit/abc", + "commit": { "author": { "date": "2024-01-01T00:00:00Z" } }, + "repository": { "full_name": "x/y", "private": false } + }); + assert!(parse_commit_event(&raw).is_none()); + } }