//! GitHub Search API ingestion for historical backfill. //! //! The Events API caps at 90 days; this source uses `/search/issues` with //! `author:` to recover issues and PRs going back as far as GitHub //! retains them (1000-result ceiling per the Search API's hard cap). //! //! `/search/commits` is deliberately not used: GitHub matches the same commit //! across every fork that contains it, inflating result counts and surfacing //! commits in repos the user never authored to. If commit history becomes //! desirable we should enumerate the user's repos and walk per-repo //! `/repos/{o}/{r}/commits?author=...` instead. use std::collections::HashMap; use std::sync::Arc; use async_trait::async_trait; use chrono::{DateTime, Utc}; use moments_core::{EventSource, EventWriter, PollerStateStore, SourceError}; use moments_entities::{Event, Source}; use reqwest::{Client, header}; use serde_json::Value; use tracing::{debug, warn}; const SOURCE_NAME: &str = "github-search"; const USER_AGENT: &str = concat!( "moments/", env!("CARGO_PKG_VERSION"), " (+https://rob.tn)" ); #[derive(Clone, Debug)] pub struct GithubSearchConfig { pub user: String, pub token: Option, pub per_page: u32, /// Hard cap on pages walked per query. The Search API itself only returns /// the first 1000 results across pages, so 10 × 100 covers everything. pub max_pages: u32, } impl Default for GithubSearchConfig { fn default() -> Self { Self { user: "grenade".into(), token: None, per_page: 100, max_pages: 10, } } } pub struct GithubSearchSource { client: Client, writer: Arc, state: Arc, config: GithubSearchConfig, } impl GithubSearchSource { pub fn new( client: Client, writer: Arc, state: Arc, config: GithubSearchConfig, ) -> Self { Self { client, writer, state, config, } } fn apply_headers(&self, mut req: reqwest::RequestBuilder) -> reqwest::RequestBuilder { req = req .header(header::ACCEPT, "application/vnd.github+json") .header("X-GitHub-Api-Version", "2022-11-28") .header(header::USER_AGENT, USER_AGENT); if let Some(token) = &self.config.token { req = req.header(header::AUTHORIZATION, format!("Bearer {token}")); } req } /// Read repo visibility from `/repos/{full_name}`. Used for results from /// /search/issues, which don't include the visibility flag inline. async fn fetch_repo_private(&self, full_name: &str) -> Result { let url = format!("https://api.github.com/repos/{full_name}"); let req = self.apply_headers(self.client.get(&url)); let resp = req .send() .await .map_err(|e| SourceError::Http(e.to_string()))?; if !resp.status().is_success() { // Repo may be deleted / inaccessible. Treat as private (safer: // we'd rather under-expose than over-expose). return Err(SourceError::Http(format!("{} GET {}", resp.status(), url))); } let v: Value = resp .json() .await .map_err(|e| SourceError::Parse(e.to_string()))?; Ok(v.get("private").and_then(Value::as_bool).unwrap_or(false)) } async fn search_issues( &self, vis_cache: &mut HashMap, ) -> Result { let mut total = 0usize; for page in 1..=self.config.max_pages { let url = format!( "https://api.github.com/search/issues?q=author:{}&sort=created&order=desc&per_page={}&page={}", self.config.user, self.config.per_page, page ); let req = self.apply_headers(self.client.get(&url)); let resp = req .send() .await .map_err(|e| SourceError::Http(e.to_string()))?; if !resp.status().is_success() { return Err(SourceError::Http(format!("{} GET {}", resp.status(), url))); } let body: Value = resp .json() .await .map_err(|e| SourceError::Parse(e.to_string()))?; let items = body .get("items") .and_then(Value::as_array) .cloned() .unwrap_or_default(); if items.is_empty() { break; } let mut events = Vec::with_capacity(items.len()); for item in &items { if let Some(ev) = self.search_issue_to_event(item, vis_cache).await { events.push(ev); } } total += self.writer.upsert_events(&events).await?; // Last page if we got fewer than per_page items. if items.len() < self.config.per_page as usize { break; } } Ok(total) } async fn search_issue_to_event( &self, item: &Value, vis_cache: &mut HashMap, ) -> Option { let number = item.get("number").and_then(Value::as_i64)?; let html_url = item.get("html_url").and_then(Value::as_str)?; let created_at_str = item.get("created_at").and_then(Value::as_str)?; let occurred_at = DateTime::parse_from_rfc3339(created_at_str) .ok()? .with_timezone(&Utc); let repo = repo_from_html_url(html_url)?; let private = match vis_cache.get(&repo).copied() { Some(p) => p, None => match self.fetch_repo_private(&repo).await { Ok(p) => { vis_cache.insert(repo.clone(), p); p } Err(e) => { warn!(repo = %repo, error = %e, "repo visibility lookup failed; treating as private"); vis_cache.insert(repo.clone(), true); true } }, }; let action = if item.get("pull_request").is_some() { "PullRequest" } else { "Issue" }; Some(Event { id: format!("github-issue:{repo}#{number}"), source: Source::Github, action: action.into(), occurred_at, public: !private, payload: item.clone(), }) } } #[async_trait] impl EventSource for GithubSearchSource { fn name(&self) -> &'static str { SOURCE_NAME } async fn poll(&self) -> Result { let mut vis_cache: HashMap = HashMap::new(); let total = self.search_issues(&mut vis_cache).await?; self.state.touch(SOURCE_NAME).await?; debug!( ingested = total, unique_repos = vis_cache.len(), "github-search poll complete" ); Ok(total) } } /// Extract `owner/repo` from a github.com URL like /// `https://github.com/owner/repo/{issues,pull}/42`. fn repo_from_html_url(url: &str) -> Option { let stripped = url.strip_prefix("https://github.com/")?; let mut parts = stripped.splitn(3, '/'); let owner = parts.next()?; let repo = parts.next()?; if owner.is_empty() || repo.is_empty() { return None; } Some(format!("{owner}/{repo}")) } #[cfg(test)] mod tests { use super::*; #[test] fn extracts_repo_from_html_url() { assert_eq!( repo_from_html_url("https://github.com/Nehliin/vortex/issues/125").as_deref(), Some("Nehliin/vortex") ); assert_eq!( repo_from_html_url("https://github.com/grenade/moments/pull/3").as_deref(), Some("grenade/moments") ); } #[test] fn rejects_non_github_host() { assert!(repo_from_html_url("https://gitlab.com/x/y/-/issues/1").is_none()); } }