feat(worker): add commits to github search backfill

Walk back the earlier decision to skip /search/commits. The fork
inflation that worried me isn't misattribution — those commits
really were authored by the user; they just persist in forks after
the original repo went away. Skipping them dropped legitimate
historical work from the timeline.

The duplicate-SHA-across-forks issue is a pure dedup concern:
  * keyed `github-commit:<sha>` (SHA only — globally unique by Git's
    content addressing; same commit in two forks lands in one row);
  * within a single page, dedup by id before INSERT (postgres ON
    CONFLICT errors when the conflict target appears twice in one
    statement);
  * across pages and runs, last-write-wins via upsert. The repo
    association may flip between forks but the commit content is
    identical.

Visibility is read inline from `repository.private` on the search
item, no extra lookup needed. Also opportunistically populates the
shared visibility cache so the issue loop in the same poll skips
/repos/{full_name} GETs for any repo it already saw via commits.

Reshape: presentation/github.rs gains a Commit path — short SHA
linked, repo linked, first line of the commit message as subtitle.
GitCommit icon.

Tests: +3 in github_search (parse uses sha as id, marks private,
rejects non-github URL), +1 in presentation (commit reshape uses
short sha + first message line) — 18 total green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-03 18:54:32 +03:00
parent e4052c4c9a
commit 7772393598
2 changed files with 255 additions and 14 deletions

View File

@@ -6,8 +6,10 @@ use serde_json::Value;
pub(crate) fn reshape(event: &Event) -> TimelineItem { pub(crate) fn reshape(event: &Event) -> TimelineItem {
// Search-API items have a different payload shape (the search item itself // Search-API items have a different payload shape (the search item itself
// rather than a wrapped event), so dispatch them through a separate path. // rather than a wrapped event), so dispatch them through a separate path.
if matches!(event.action.as_str(), "Issue" | "PullRequest") { match event.action.as_str() {
return search_reshape(event); "Issue" | "PullRequest" => return search_reshape(event),
"Commit" => return commit_reshape(event),
_ => {}
} }
let p = &event.payload; let p = &event.payload;
@@ -426,6 +428,58 @@ fn search_reshape(event: &Event) -> TimelineItem {
} }
} }
fn commit_reshape(event: &Event) -> TimelineItem {
let p = &event.payload;
let sha = p.get("sha").and_then(Value::as_str).unwrap_or("");
let short_sha: String = sha.chars().take(7).collect();
let html_url = p.get("html_url").and_then(Value::as_str).unwrap_or("");
let message_first_line = p
.get("commit")
.and_then(|c| c.get("message"))
.and_then(Value::as_str)
.unwrap_or("")
.lines()
.next()
.unwrap_or("")
.to_string();
let repo = p
.get("repository")
.and_then(|r| r.get("full_name"))
.and_then(Value::as_str)
.unwrap_or("(unknown repo)");
let author_login = p
.get("author")
.and_then(|a| a.get("login"))
.and_then(Value::as_str);
let mut title = Vec::new();
if let Some(actor) = author_login {
title.push(TitleSegment::link(
actor.to_string(),
format!("https://github.com/{actor}"),
));
title.push(TitleSegment::text(" "));
}
title.push(TitleSegment::text("committed "));
title.push(TitleSegment::link(short_sha, html_url.to_string()));
title.push(TitleSegment::text(" in "));
title.push(repo_link(repo));
let subtitle = (!message_first_line.is_empty())
.then(|| vec![TitleSegment::text(message_first_line)]);
TimelineItem {
id: event.id.clone(),
source: Source::Github,
action: event.action.clone(),
occurred_at: event.occurred_at,
icon: TimelineIcon::GitCommit,
title,
subtitle,
body: None,
}
}
fn repo_from_url(url: &str) -> Option<String> { fn repo_from_url(url: &str) -> Option<String> {
let stripped = url.strip_prefix("https://github.com/")?; let stripped = url.strip_prefix("https://github.com/")?;
let mut parts = stripped.splitn(3, '/'); let mut parts = stripped.splitn(3, '/');
@@ -600,6 +654,35 @@ mod tests {
); );
} }
#[test]
fn commit_reshape_uses_short_sha_and_first_message_line() {
let raw = json!({
"sha": "a6fcefbe909a97ad5a049b9fa48bc74309af10d9",
"html_url": "https://github.com/faith1337z/Trade/commit/a6fcefbe909a97ad5a049b9fa48bc74309af10d9",
"commit": {
"message": "split multiline message into multiple irc messages\n\nbody body body"
},
"repository": { "full_name": "faith1337z/Trade" },
"author": { "login": "grenade" }
});
let item = reshape(&ev("Commit", raw));
assert_eq!(item.icon, TimelineIcon::GitCommit);
let rendered: String = item
.title
.iter()
.map(|s| match s {
TitleSegment::Text { text } => text.clone(),
TitleSegment::Link { text, .. } => text.clone(),
})
.collect();
assert!(rendered.contains("committed a6fcefb in faith1337z/Trade"), "got: {rendered}");
// body of the commit message is dropped; only first line in subtitle
assert_eq!(
item.subtitle.unwrap(),
vec![TitleSegment::text("split multiline message into multiple irc messages")]
);
}
#[test] #[test]
fn unknown_event_falls_back() { fn unknown_event_falls_back() {
let raw = json!({ let raw = json!({

View File

@@ -1,16 +1,20 @@
//! GitHub Search API ingestion for historical backfill. //! GitHub Search API ingestion for historical backfill.
//! //!
//! The Events API caps at 90 days; this source uses `/search/issues` with //! The Events API caps at 90 days; this source uses `/search/issues` and
//! `author:<user>` to recover issues and PRs going back as far as GitHub //! `/search/commits` with `author:<user>` to recover issues, PRs, and
//! retains them (1000-result ceiling per the Search API's hard cap). //! commits going back as far as GitHub retains them (1000-result ceiling
//! per query is the Search API's hard cap).
//! //!
//! `/search/commits` is deliberately not used: GitHub matches the same commit //! Fork duplication on /search/commits — the same commit SHA appears in
//! across every fork that contains it, inflating result counts and surfacing //! every fork that still contains it — is handled by:
//! commits in repos the user never authored to. If commit history becomes //! * deduplicating by `id = github-commit:<sha>` within each batch
//! desirable we should enumerate the user's repos and walk per-repo //! before upsert (postgres ON CONFLICT errors if the same conflict
//! `/repos/{o}/{r}/commits?author=...` instead. //! target appears twice in one INSERT);
//! * upserting with last-write-wins across batches and runs (the SHA
//! is the same; the repo association may flip between forks but the
//! commit itself is identical).
use std::collections::HashMap; use std::collections::{HashMap, HashSet};
use std::sync::Arc; use std::sync::Arc;
use async_trait::async_trait; use async_trait::async_trait;
@@ -103,6 +107,68 @@ impl GithubSearchSource {
Ok(v.get("private").and_then(Value::as_bool).unwrap_or(false)) Ok(v.get("private").and_then(Value::as_bool).unwrap_or(false))
} }
async fn search_commits(
&self,
vis_cache: &mut HashMap<String, bool>,
) -> Result<usize, SourceError> {
let mut total = 0usize;
for page in 1..=self.config.max_pages {
let url = format!(
"https://api.github.com/search/commits?q=author:{}&sort=author-date&order=desc&per_page={}&page={}",
self.config.user, self.config.per_page, page
);
let req = self.apply_headers(self.client.get(&url));
let resp = req
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
if !resp.status().is_success() {
return Err(SourceError::Http(format!("{} GET {}", resp.status(), url)));
}
let body: Value = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
let items = body
.get("items")
.and_then(Value::as_array)
.cloned()
.unwrap_or_default();
if items.is_empty() {
break;
}
// Dedup within the page by id (same commit in multiple forks
// returns multiple search items with the same SHA — postgres
// refuses ON CONFLICT when the conflict target appears twice
// in one INSERT).
let mut seen: HashSet<String> = HashSet::new();
let mut events = Vec::with_capacity(items.len());
for item in &items {
if let Some(ev) = parse_commit_event(item) {
if seen.insert(ev.id.clone()) {
// Opportunistically populate the visibility cache so
// search_issues can reuse it for the same repos.
if let Some(repo) = item
.get("repository")
.and_then(|r| r.get("full_name"))
.and_then(Value::as_str)
{
vis_cache.insert(repo.to_string(), !ev.public);
}
events.push(ev);
}
}
}
total += self.writer.upsert_events(&events).await?;
if items.len() < self.config.per_page as usize {
break;
}
}
Ok(total)
}
async fn search_issues( async fn search_issues(
&self, &self,
vis_cache: &mut HashMap<String, bool>, vis_cache: &mut HashMap<String, bool>,
@@ -203,17 +269,65 @@ impl EventSource for GithubSearchSource {
async fn poll(&self) -> Result<usize, SourceError> { async fn poll(&self) -> Result<usize, SourceError> {
let mut vis_cache: HashMap<String, bool> = HashMap::new(); let mut vis_cache: HashMap<String, bool> = HashMap::new();
let total = self.search_issues(&mut vis_cache).await?; // Run commits first so vis_cache is partly seeded with inline-flag
// visibility before the issue loop hits its (more expensive) per-repo
// lookups.
let commits = self.search_commits(&mut vis_cache).await?;
let issues = self.search_issues(&mut vis_cache).await?;
self.state.touch(SOURCE_NAME).await?; self.state.touch(SOURCE_NAME).await?;
debug!( debug!(
ingested = total, commits,
issues,
unique_repos = vis_cache.len(), unique_repos = vis_cache.len(),
"github-search poll complete" "github-search poll complete"
); );
Ok(total) Ok(commits + issues)
} }
} }
/// Convert a /search/commits item into our Event row. Returns None if the
/// item is missing required fields.
fn parse_commit_event(item: &Value) -> Option<Event> {
let sha = item.get("sha").and_then(Value::as_str)?;
let html_url = item.get("html_url").and_then(Value::as_str)?;
// Prefer author.date — it's when the work was written; committer.date
// can shift on rebase. Either is RFC3339.
let date_str = item
.get("commit")
.and_then(|c| c.get("author"))
.and_then(|a| a.get("date"))
.and_then(Value::as_str)
.or_else(|| {
item.get("commit")
.and_then(|c| c.get("committer"))
.and_then(|c| c.get("date"))
.and_then(Value::as_str)
})?;
let occurred_at = DateTime::parse_from_rfc3339(date_str)
.ok()?
.with_timezone(&Utc);
let private = item
.get("repository")
.and_then(|r| r.get("private"))
.and_then(Value::as_bool)
.unwrap_or(false);
// Sanity-check the html_url points at github.com so we don't ingest
// garbage if GitHub ever changes its URL shape.
if !html_url.starts_with("https://github.com/") {
return None;
}
Some(Event {
id: format!("github-commit:{sha}"),
source: Source::Github,
action: "Commit".into(),
occurred_at,
public: !private,
payload: item.clone(),
})
}
/// Extract `owner/repo` from a github.com URL like /// Extract `owner/repo` from a github.com URL like
/// `https://github.com/owner/repo/{issues,pull}/42`. /// `https://github.com/owner/repo/{issues,pull}/42`.
fn repo_from_html_url(url: &str) -> Option<String> { fn repo_from_html_url(url: &str) -> Option<String> {
@@ -247,4 +361,48 @@ mod tests {
fn rejects_non_github_host() { fn rejects_non_github_host() {
assert!(repo_from_html_url("https://gitlab.com/x/y/-/issues/1").is_none()); assert!(repo_from_html_url("https://gitlab.com/x/y/-/issues/1").is_none());
} }
#[test]
fn parse_commit_uses_sha_as_id() {
let raw = serde_json::json!({
"sha": "a6fcefbe909a97ad5a049b9fa48bc74309af10d9",
"html_url": "https://github.com/faith1337z/Trade/commit/a6fcefbe909a97ad5a049b9fa48bc74309af10d9",
"commit": {
"author": { "name": "rob", "date": "2017-11-13T23:32:31+02:00" },
"committer": { "name": "rob", "date": "2017-11-13T22:32:31+01:00" },
"message": "split multiline message into multiple irc messages"
},
"repository": { "full_name": "faith1337z/Trade", "private": false }
});
let ev = parse_commit_event(&raw).expect("parses");
assert_eq!(ev.id, "github-commit:a6fcefbe909a97ad5a049b9fa48bc74309af10d9");
assert_eq!(ev.action, "Commit");
assert!(ev.public);
}
#[test]
fn parse_commit_marks_private_repo() {
let raw = serde_json::json!({
"sha": "deadbeef",
"html_url": "https://github.com/grenade/private-repo/commit/deadbeef",
"commit": {
"author": { "date": "2024-01-01T00:00:00Z" },
"message": "x"
},
"repository": { "full_name": "grenade/private-repo", "private": true }
});
let ev = parse_commit_event(&raw).expect("parses");
assert!(!ev.public);
}
#[test]
fn parse_commit_rejects_non_github_url() {
let raw = serde_json::json!({
"sha": "abc",
"html_url": "https://example.com/x/commit/abc",
"commit": { "author": { "date": "2024-01-01T00:00:00Z" } },
"repository": { "full_name": "x/y", "private": false }
});
assert!(parse_commit_event(&raw).is_none());
}
} }