Walk back the earlier decision to skip /search/commits. The fork
inflation that worried me isn't misattribution — those commits
really were authored by the user; they just persist in forks after
the original repo went away. Skipping them dropped legitimate
historical work from the timeline.
The duplicate-SHA-across-forks issue is a pure dedup concern:
* keyed `github-commit:<sha>` (SHA only — globally unique by Git's
content addressing; same commit in two forks lands in one row);
* within a single page, dedup by id before INSERT (postgres ON
CONFLICT errors when the conflict target appears twice in one
statement);
* across pages and runs, last-write-wins via upsert. The repo
association may flip between forks but the commit content is
identical.
Visibility is read inline from `repository.private` on the search
item, no extra lookup needed. Also opportunistically populates the
shared visibility cache so the issue loop in the same poll skips
/repos/{full_name} GETs for any repo it already saw via commits.
Reshape: presentation/github.rs gains a Commit path — short SHA
linked, repo linked, first line of the commit message as subtitle.
GitCommit icon.
Tests: +3 in github_search (parse uses sha as id, marks private,
rejects non-github URL), +1 in presentation (commit reshape uses
short sha + first message line) — 18 total green.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
409 lines
14 KiB
Rust
409 lines
14 KiB
Rust
//! GitHub Search API ingestion for historical backfill.
//!
//! The Events API caps at 90 days; this source uses `/search/issues` and
//! `/search/commits` with `author:<user>` to recover issues, PRs, and
//! commits going back as far as GitHub retains them (the 1000-result ceiling
//! per query is the Search API's hard cap).
//!
//! Fork duplication on `/search/commits` — the same commit SHA appears in
//! every fork that still contains it — is handled by:
//!
//! * deduplicating by `id = github-commit:<sha>` within each batch
//!   before upsert (postgres ON CONFLICT errors if the same conflict
//!   target appears twice in one INSERT);
//! * upserting with last-write-wins across batches and runs (the SHA
//!   is the same; the repo association may flip between forks but the
//!   commit itself is identical).
|
||
|
||
use std::collections::{HashMap, HashSet};
|
||
use std::sync::Arc;
|
||
|
||
use async_trait::async_trait;
|
||
use chrono::{DateTime, Utc};
|
||
use moments_core::{EventSource, EventWriter, PollerStateStore, SourceError};
|
||
use moments_entities::{Event, Source};
|
||
use reqwest::{Client, header};
|
||
use serde_json::Value;
|
||
use tracing::{debug, warn};
|
||
|
||
const SOURCE_NAME: &str = "github-search";
|
||
const USER_AGENT: &str = concat!(
|
||
"moments/",
|
||
env!("CARGO_PKG_VERSION"),
|
||
" (+https://rob.tn)"
|
||
);
|
||
|
||
/// Settings that drive the GitHub Search backfill.
#[derive(Clone, Debug)]
pub struct GithubSearchConfig {
    /// GitHub login substituted into the `author:<user>` search qualifier.
    pub user: String,
    /// Optional API token. When present it is sent as a `Bearer` credential
    /// in the `Authorization` header of every request.
    pub token: Option<String>,
    /// Value of the `per_page` query parameter (results requested per page).
    pub per_page: u32,
    /// Upper bound on the number of pages fetched for each query. GitHub's
    /// Search API only exposes the first 1000 results of a query, so
    /// 10 pages of 100 already reach everything it will return.
    pub max_pages: u32,
}
|
||
|
||
impl Default for GithubSearchConfig {
|
||
fn default() -> Self {
|
||
Self {
|
||
user: "grenade".into(),
|
||
token: None,
|
||
per_page: 100,
|
||
max_pages: 10,
|
||
}
|
||
}
|
||
}
|
||
|
||
pub struct GithubSearchSource {
|
||
client: Client,
|
||
writer: Arc<dyn EventWriter>,
|
||
state: Arc<dyn PollerStateStore>,
|
||
config: GithubSearchConfig,
|
||
}
|
||
|
||
impl GithubSearchSource {
|
||
pub fn new(
|
||
client: Client,
|
||
writer: Arc<dyn EventWriter>,
|
||
state: Arc<dyn PollerStateStore>,
|
||
config: GithubSearchConfig,
|
||
) -> Self {
|
||
Self {
|
||
client,
|
||
writer,
|
||
state,
|
||
config,
|
||
}
|
||
}
|
||
|
||
fn apply_headers(&self, mut req: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
|
||
req = req
|
||
.header(header::ACCEPT, "application/vnd.github+json")
|
||
.header("X-GitHub-Api-Version", "2022-11-28")
|
||
.header(header::USER_AGENT, USER_AGENT);
|
||
if let Some(token) = &self.config.token {
|
||
req = req.header(header::AUTHORIZATION, format!("Bearer {token}"));
|
||
}
|
||
req
|
||
}
|
||
|
||
/// Read repo visibility from `/repos/{full_name}`. Used for results from
|
||
/// /search/issues, which don't include the visibility flag inline.
|
||
async fn fetch_repo_private(&self, full_name: &str) -> Result<bool, SourceError> {
|
||
let url = format!("https://api.github.com/repos/{full_name}");
|
||
let req = self.apply_headers(self.client.get(&url));
|
||
let resp = req
|
||
.send()
|
||
.await
|
||
.map_err(|e| SourceError::Http(e.to_string()))?;
|
||
if !resp.status().is_success() {
|
||
// Repo may be deleted / inaccessible. Treat as private (safer:
|
||
// we'd rather under-expose than over-expose).
|
||
return Err(SourceError::Http(format!("{} GET {}", resp.status(), url)));
|
||
}
|
||
let v: Value = resp
|
||
.json()
|
||
.await
|
||
.map_err(|e| SourceError::Parse(e.to_string()))?;
|
||
Ok(v.get("private").and_then(Value::as_bool).unwrap_or(false))
|
||
}
|
||
|
||
async fn search_commits(
|
||
&self,
|
||
vis_cache: &mut HashMap<String, bool>,
|
||
) -> Result<usize, SourceError> {
|
||
let mut total = 0usize;
|
||
for page in 1..=self.config.max_pages {
|
||
let url = format!(
|
||
"https://api.github.com/search/commits?q=author:{}&sort=author-date&order=desc&per_page={}&page={}",
|
||
self.config.user, self.config.per_page, page
|
||
);
|
||
let req = self.apply_headers(self.client.get(&url));
|
||
let resp = req
|
||
.send()
|
||
.await
|
||
.map_err(|e| SourceError::Http(e.to_string()))?;
|
||
if !resp.status().is_success() {
|
||
return Err(SourceError::Http(format!("{} GET {}", resp.status(), url)));
|
||
}
|
||
let body: Value = resp
|
||
.json()
|
||
.await
|
||
.map_err(|e| SourceError::Parse(e.to_string()))?;
|
||
let items = body
|
||
.get("items")
|
||
.and_then(Value::as_array)
|
||
.cloned()
|
||
.unwrap_or_default();
|
||
if items.is_empty() {
|
||
break;
|
||
}
|
||
|
||
// Dedup within the page by id (same commit in multiple forks
|
||
// returns multiple search items with the same SHA — postgres
|
||
// refuses ON CONFLICT when the conflict target appears twice
|
||
// in one INSERT).
|
||
let mut seen: HashSet<String> = HashSet::new();
|
||
let mut events = Vec::with_capacity(items.len());
|
||
for item in &items {
|
||
if let Some(ev) = parse_commit_event(item) {
|
||
if seen.insert(ev.id.clone()) {
|
||
// Opportunistically populate the visibility cache so
|
||
// search_issues can reuse it for the same repos.
|
||
if let Some(repo) = item
|
||
.get("repository")
|
||
.and_then(|r| r.get("full_name"))
|
||
.and_then(Value::as_str)
|
||
{
|
||
vis_cache.insert(repo.to_string(), !ev.public);
|
||
}
|
||
events.push(ev);
|
||
}
|
||
}
|
||
}
|
||
total += self.writer.upsert_events(&events).await?;
|
||
|
||
if items.len() < self.config.per_page as usize {
|
||
break;
|
||
}
|
||
}
|
||
Ok(total)
|
||
}
|
||
|
||
async fn search_issues(
|
||
&self,
|
||
vis_cache: &mut HashMap<String, bool>,
|
||
) -> Result<usize, SourceError> {
|
||
let mut total = 0usize;
|
||
for page in 1..=self.config.max_pages {
|
||
let url = format!(
|
||
"https://api.github.com/search/issues?q=author:{}&sort=created&order=desc&per_page={}&page={}",
|
||
self.config.user, self.config.per_page, page
|
||
);
|
||
let req = self.apply_headers(self.client.get(&url));
|
||
let resp = req
|
||
.send()
|
||
.await
|
||
.map_err(|e| SourceError::Http(e.to_string()))?;
|
||
if !resp.status().is_success() {
|
||
return Err(SourceError::Http(format!("{} GET {}", resp.status(), url)));
|
||
}
|
||
let body: Value = resp
|
||
.json()
|
||
.await
|
||
.map_err(|e| SourceError::Parse(e.to_string()))?;
|
||
let items = body
|
||
.get("items")
|
||
.and_then(Value::as_array)
|
||
.cloned()
|
||
.unwrap_or_default();
|
||
if items.is_empty() {
|
||
break;
|
||
}
|
||
|
||
let mut events = Vec::with_capacity(items.len());
|
||
for item in &items {
|
||
if let Some(ev) = self.search_issue_to_event(item, vis_cache).await {
|
||
events.push(ev);
|
||
}
|
||
}
|
||
total += self.writer.upsert_events(&events).await?;
|
||
|
||
// Last page if we got fewer than per_page items.
|
||
if items.len() < self.config.per_page as usize {
|
||
break;
|
||
}
|
||
}
|
||
Ok(total)
|
||
}
|
||
|
||
async fn search_issue_to_event(
|
||
&self,
|
||
item: &Value,
|
||
vis_cache: &mut HashMap<String, bool>,
|
||
) -> Option<Event> {
|
||
let number = item.get("number").and_then(Value::as_i64)?;
|
||
let html_url = item.get("html_url").and_then(Value::as_str)?;
|
||
let created_at_str = item.get("created_at").and_then(Value::as_str)?;
|
||
let occurred_at = DateTime::parse_from_rfc3339(created_at_str)
|
||
.ok()?
|
||
.with_timezone(&Utc);
|
||
let repo = repo_from_html_url(html_url)?;
|
||
|
||
let private = match vis_cache.get(&repo).copied() {
|
||
Some(p) => p,
|
||
None => match self.fetch_repo_private(&repo).await {
|
||
Ok(p) => {
|
||
vis_cache.insert(repo.clone(), p);
|
||
p
|
||
}
|
||
Err(e) => {
|
||
warn!(repo = %repo, error = %e, "repo visibility lookup failed; treating as private");
|
||
vis_cache.insert(repo.clone(), true);
|
||
true
|
||
}
|
||
},
|
||
};
|
||
|
||
let action = if item.get("pull_request").is_some() {
|
||
"PullRequest"
|
||
} else {
|
||
"Issue"
|
||
};
|
||
|
||
Some(Event {
|
||
id: format!("github-issue:{repo}#{number}"),
|
||
source: Source::Github,
|
||
action: action.into(),
|
||
occurred_at,
|
||
public: !private,
|
||
payload: item.clone(),
|
||
})
|
||
}
|
||
}
|
||
|
||
#[async_trait]
|
||
impl EventSource for GithubSearchSource {
|
||
fn name(&self) -> &'static str {
|
||
SOURCE_NAME
|
||
}
|
||
|
||
async fn poll(&self) -> Result<usize, SourceError> {
|
||
let mut vis_cache: HashMap<String, bool> = HashMap::new();
|
||
// Run commits first so vis_cache is partly seeded with inline-flag
|
||
// visibility before the issue loop hits its (more expensive) per-repo
|
||
// lookups.
|
||
let commits = self.search_commits(&mut vis_cache).await?;
|
||
let issues = self.search_issues(&mut vis_cache).await?;
|
||
self.state.touch(SOURCE_NAME).await?;
|
||
debug!(
|
||
commits,
|
||
issues,
|
||
unique_repos = vis_cache.len(),
|
||
"github-search poll complete"
|
||
);
|
||
Ok(commits + issues)
|
||
}
|
||
}
|
||
|
||
/// Convert a /search/commits item into our Event row. Returns None if the
|
||
/// item is missing required fields.
|
||
fn parse_commit_event(item: &Value) -> Option<Event> {
|
||
let sha = item.get("sha").and_then(Value::as_str)?;
|
||
let html_url = item.get("html_url").and_then(Value::as_str)?;
|
||
// Prefer author.date — it's when the work was written; committer.date
|
||
// can shift on rebase. Either is RFC3339.
|
||
let date_str = item
|
||
.get("commit")
|
||
.and_then(|c| c.get("author"))
|
||
.and_then(|a| a.get("date"))
|
||
.and_then(Value::as_str)
|
||
.or_else(|| {
|
||
item.get("commit")
|
||
.and_then(|c| c.get("committer"))
|
||
.and_then(|c| c.get("date"))
|
||
.and_then(Value::as_str)
|
||
})?;
|
||
let occurred_at = DateTime::parse_from_rfc3339(date_str)
|
||
.ok()?
|
||
.with_timezone(&Utc);
|
||
let private = item
|
||
.get("repository")
|
||
.and_then(|r| r.get("private"))
|
||
.and_then(Value::as_bool)
|
||
.unwrap_or(false);
|
||
|
||
// Sanity-check the html_url points at github.com so we don't ingest
|
||
// garbage if GitHub ever changes its URL shape.
|
||
if !html_url.starts_with("https://github.com/") {
|
||
return None;
|
||
}
|
||
|
||
Some(Event {
|
||
id: format!("github-commit:{sha}"),
|
||
source: Source::Github,
|
||
action: "Commit".into(),
|
||
occurred_at,
|
||
public: !private,
|
||
payload: item.clone(),
|
||
})
|
||
}
|
||
|
||
/// Extract `owner/repo` from a github.com URL like
/// `https://github.com/owner/repo/{issues,pull}/42`.
///
/// Only the first two path segments are used; anything after them is
/// ignored. A query string or fragment is dropped before splitting, so a URL
/// such as `https://github.com/owner/repo?tab=readme` still yields
/// `owner/repo` instead of leaking `?tab=readme` into the repo name.
///
/// Returns `None` when the URL does not start with `https://github.com/` or
/// when either the owner or the repo segment is missing or empty.
fn repo_from_html_url(url: &str) -> Option<String> {
    let rest = url.strip_prefix("https://github.com/")?;
    // `split` always yields at least one piece, so the fallback is never hit;
    // it just avoids an unwrap.
    let path = rest
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or(rest);

    let mut segments = path.splitn(3, '/');
    let owner = segments.next().filter(|s| !s.is_empty())?;
    let repo = segments.next().filter(|s| !s.is_empty())?;
    Some(format!("{owner}/{repo}"))
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// `owner/repo` is recovered from both issue and pull-request URLs.
    #[test]
    fn extracts_repo_from_html_url() {
        let cases = [
            ("https://github.com/Nehliin/vortex/issues/125", "Nehliin/vortex"),
            ("https://github.com/grenade/moments/pull/3", "grenade/moments"),
        ];
        for (url, expected) in cases {
            assert_eq!(repo_from_html_url(url).as_deref(), Some(expected));
        }
    }

    /// URLs on any host other than github.com produce no repo.
    #[test]
    fn rejects_non_github_host() {
        let parsed = repo_from_html_url("https://gitlab.com/x/y/-/issues/1");
        assert!(parsed.is_none());
    }

    /// The event id is derived from the SHA alone, and a non-private repo
    /// yields a public `Commit` event.
    #[test]
    fn parse_commit_uses_sha_as_id() {
        let item = json!({
            "sha": "a6fcefbe909a97ad5a049b9fa48bc74309af10d9",
            "html_url": "https://github.com/faith1337z/Trade/commit/a6fcefbe909a97ad5a049b9fa48bc74309af10d9",
            "commit": {
                "author": { "name": "rob", "date": "2017-11-13T23:32:31+02:00" },
                "committer": { "name": "rob", "date": "2017-11-13T22:32:31+01:00" },
                "message": "split multiline message into multiple irc messages"
            },
            "repository": { "full_name": "faith1337z/Trade", "private": false }
        });

        let event = parse_commit_event(&item).expect("parses");

        assert_eq!(
            event.id,
            "github-commit:a6fcefbe909a97ad5a049b9fa48bc74309af10d9"
        );
        assert_eq!(event.action, "Commit");
        assert!(event.public);
    }

    /// `repository.private = true` yields a non-public event.
    #[test]
    fn parse_commit_marks_private_repo() {
        let item = json!({
            "sha": "deadbeef",
            "html_url": "https://github.com/grenade/private-repo/commit/deadbeef",
            "commit": {
                "author": { "date": "2024-01-01T00:00:00Z" },
                "message": "x"
            },
            "repository": { "full_name": "grenade/private-repo", "private": true }
        });

        let event = parse_commit_event(&item).expect("parses");

        assert!(!event.public);
    }

    /// An `html_url` outside github.com is rejected outright.
    #[test]
    fn parse_commit_rejects_non_github_url() {
        let item = json!({
            "sha": "abc",
            "html_url": "https://example.com/x/commit/abc",
            "commit": { "author": { "date": "2024-01-01T00:00:00Z" } },
            "repository": { "full_name": "x/y", "private": false }
        });

        let parsed = parse_commit_event(&item);

        assert!(parsed.is_none());
    }
}
|