Files
moments/crates/moments-data/src/github_repo.rs
rob thijssen ee93429317 feat: language stream graph on dashboard
Full-stack feature showing programming languages by commit activity
as a stream graph on the dashboard.

Backend:
- migration: repo_languages table (source, repo, language, bytes, color)
- worker: fetch language breakdowns via GitHub GraphQL (batched,
  20 repos/request) and Gitea REST API during poll cycles
- API: GET /v1/languages/daily (daily commit counts per language),
  GET /v1/languages/repos (all stored repo language data)
- fix timezone bug in daily_counts and language_daily_counts: the
  PostgreSQL server timezone (Europe/Sofia, UTC+3) shifted day
  boundaries, miscounting events near midnight. Now uses explicit
  UTC boundaries in generate_series JOINs.
- use per-source CASE for repo name extraction in language query
  to match gitea payload structure (repo.full_name vs repo.name)
- Gitea languages use GitHub colors via COALESCE fallback

Frontend:
- LanguageStreamGraph component: pure SVG stream graph, weekly
  buckets, centered baseline, top 8 languages + Other, GitHub
  canonical language colors, legend with color dots
- DashPage/ProjectPage: fetch repo languages once via new endpoint
  instead of per-repo forge proxy calls (eliminates 200+ GitHub
  API calls and 403 rate limit errors)
- removed fetchLanguages forge proxy wrapper (dead code)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-06 06:27:59 +03:00

555 lines
19 KiB
Rust

//! Per-repo commit enumeration for full GitHub history.
//!
//! Discovers repos via two sources:
//! 1. REST `/user/repos` — repos where the user is owner, collaborator,
//! or org member.
//! 2. GraphQL `repositoriesContributedTo` — repos the user has committed
//! to, opened issues/PRs on, or reviewed, even without collaborator
//! status. No result cap (cursor-paginated).
//!
//! Then walks each repo's commit history via
//! `/repos/{owner}/{repo}/commits?author={user}` with a `since` cursor
//! to avoid re-fetching known commits.
//!
//! Events use `github-commit:{sha}` as their ID, matching the scheme in
//! `github_search`, so duplicates are resolved via idempotent upsert.
use std::collections::HashSet;
use std::sync::Arc;
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use moments_core::{EventSource, EventWriter, PollerStateStore, SourceError};
use moments_entities::{Event, RepoLanguage, Source};
use reqwest::{Client, header};
use serde_json::Value;
use tracing::{debug, warn};
const SOURCE_NAME: &str = "github-repo";
const USER_AGENT: &str = concat!(
"moments/",
env!("CARGO_PKG_VERSION"),
" (+https://rob.tn)"
);
const MAX_BACKFILL_PAGES: u32 = 100;
#[derive(Clone, Debug)]
pub struct GithubRepoConfig {
pub user: String,
pub token: Option<String>,
pub per_page: u32,
}
impl Default for GithubRepoConfig {
fn default() -> Self {
Self {
user: "grenade".into(),
token: None,
per_page: 100,
}
}
}
pub struct GithubRepoSource {
client: Client,
writer: Arc<dyn EventWriter>,
state: Arc<dyn PollerStateStore>,
config: GithubRepoConfig,
}
impl GithubRepoSource {
pub fn new(
client: Client,
writer: Arc<dyn EventWriter>,
state: Arc<dyn PollerStateStore>,
config: GithubRepoConfig,
) -> Self {
Self {
client,
writer,
state,
config,
}
}
fn apply_headers(&self, mut req: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
req = req
.header(header::ACCEPT, "application/vnd.github+json")
.header("X-GitHub-Api-Version", "2022-11-28")
.header(header::USER_AGENT, USER_AGENT);
if let Some(token) = &self.config.token {
req = req.header(header::AUTHORIZATION, format!("Bearer {token}"));
}
req
}
/// Discover all repos the authenticated user can access.
async fn discover_repos(&self) -> Result<Vec<Repo>, SourceError> {
if self.config.token.is_none() {
return Ok(vec![]);
}
let mut repos = Vec::new();
for page in 1..=50 {
let url = format!(
"https://api.github.com/user/repos?affiliation=owner,collaborator,organization_member&visibility=all&per_page={}&page={}",
self.config.per_page, page
);
let req = self.apply_headers(self.client.get(&url));
let resp = req
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
if !resp.status().is_success() {
return Err(SourceError::Http(format!("{} GET {}", resp.status(), url)));
}
let items: Vec<Value> = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
if items.is_empty() {
break;
}
for item in &items {
if let Some(r) = parse_repo(item) {
repos.push(r);
}
}
if items.len() < self.config.per_page as usize {
break;
}
}
// Supplement with repos from GraphQL repositoriesContributedTo.
// This catches repos where the user contributed via PRs but isn't
// an owner, collaborator, or org member — no result cap.
let mut known: HashSet<String> = repos.iter().map(|r| r.full_name.clone()).collect();
let contributed = self.discover_contributed_repos().await;
match contributed {
Ok(extra) => {
for r in extra {
if known.insert(r.full_name.clone()) {
repos.push(r);
}
}
}
Err(e) => {
warn!(error = %e, "GraphQL contributed-repos discovery failed; continuing with known repos");
}
}
Ok(repos)
}
/// Discover repos the user has contributed to via GraphQL.
/// Uses cursor-based pagination with no result cap.
async fn discover_contributed_repos(&self) -> Result<Vec<Repo>, SourceError> {
let token = match &self.config.token {
Some(t) => t,
None => return Ok(vec![]),
};
let mut repos = Vec::new();
let mut cursor: Option<String> = None;
loop {
let after = match &cursor {
Some(c) => format!(", after: \"{}\"", c),
None => String::new(),
};
let query = format!(
r#"{{ user(login: "{}") {{ repositoriesContributedTo(first: 100, contributionTypes: [COMMIT, PULL_REQUEST, ISSUE]{}) {{ pageInfo {{ hasNextPage endCursor }} nodes {{ nameWithOwner isPrivate }} }} }} }}"#,
self.config.user, after
);
let body = serde_json::json!({ "query": query });
let resp = self
.client
.post("https://api.github.com/graphql")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header(header::USER_AGENT, USER_AGENT)
.header(header::CONTENT_TYPE, "application/json")
.json(&body)
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
if !resp.status().is_success() {
return Err(SourceError::Http(format!(
"{} POST graphql",
resp.status()
)));
}
let data: Value = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
// Check for GraphQL-level errors
if let Some(errors) = data.get("errors").and_then(Value::as_array) {
if let Some(msg) = errors.first().and_then(|e| e.get("message")).and_then(Value::as_str) {
return Err(SourceError::Http(format!("GraphQL error: {msg}")));
}
}
let contributed = &data["data"]["user"]["repositoriesContributedTo"];
let nodes = contributed["nodes"].as_array();
if let Some(nodes) = nodes {
for node in nodes {
let full_name = node
.get("nameWithOwner")
.and_then(Value::as_str);
let private = node
.get("isPrivate")
.and_then(Value::as_bool)
.unwrap_or(false);
if let Some(name) = full_name {
repos.push(Repo {
full_name: name.to_string(),
private,
});
}
}
}
let has_next = contributed["pageInfo"]["hasNextPage"]
.as_bool()
.unwrap_or(false);
if !has_next {
break;
}
cursor = contributed["pageInfo"]["endCursor"]
.as_str()
.map(String::from);
}
debug!(repos = repos.len(), "discovered contributed repos via GraphQL");
Ok(repos)
}
/// Fetch commits for a single repo, paginating fully on first run
/// and using `since` on subsequent runs to catch everything new.
async fn scan_repo(&self, repo: &Repo) -> Result<usize, SourceError> {
let state_key = format!("github-repo:{}", repo.full_name);
let prior = self.state.load(&state_key).await?;
let since = prior.as_ref().and_then(|s| s.last_modified);
let mut total = 0usize;
let mut newest: Option<DateTime<Utc>> = since;
for page in 1..=MAX_BACKFILL_PAGES {
let mut url = format!(
"https://api.github.com/repos/{}/commits?author={}&per_page={}&page={}",
repo.full_name, self.config.user, self.config.per_page, page
);
if let Some(since_dt) = since {
url.push_str(&format!("&since={}", since_dt.to_rfc3339()));
}
let req = self.apply_headers(self.client.get(&url));
let resp = req
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
let status = resp.status();
// 409 = empty repo (no commits at all), not an error
if status.as_u16() == 409 {
break;
}
if status.as_u16() == 403 || status.as_u16() == 429 {
warn!(repo = %repo.full_name, status = %status, "rate limited; stopping early");
return Err(SourceError::Http(format!("{} GET {}", status, url)));
}
if status.as_u16() == 404 {
warn!(repo = %repo.full_name, "repo not found; skipping");
break;
}
if !status.is_success() {
return Err(SourceError::Http(format!("{} GET {}", status, url)));
}
let items: Vec<Value> = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
if items.is_empty() {
break;
}
let events: Vec<Event> = items
.iter()
.filter_map(|item| parse_commit(item, repo))
.collect();
for ev in &events {
newest = Some(match newest {
Some(n) if ev.occurred_at > n => ev.occurred_at,
Some(n) => n,
None => ev.occurred_at,
});
}
total += self.writer.upsert_events(&events).await?;
if items.len() < self.config.per_page as usize {
break;
}
}
self.state.save(&state_key, None, newest).await?;
Ok(total)
}
/// Batch-fetch language breakdowns for repos via GraphQL, upserting
/// into repo_languages. Repos are batched using GraphQL aliases to
/// minimise round trips.
async fn fetch_languages(&self, repos: &[Repo]) -> Result<usize, SourceError> {
let token = match &self.config.token {
Some(t) => t,
None => return Ok(0),
};
let mut total = 0usize;
for chunk in repos.chunks(20) {
let mut fragments = Vec::with_capacity(chunk.len());
for (i, repo) in chunk.iter().enumerate() {
let parts: Vec<&str> = repo.full_name.splitn(2, '/').collect();
if parts.len() != 2 {
continue;
}
fragments.push(format!(
r#"r{i}: repository(owner: "{}", name: "{}") {{ languages(first: 20, orderBy: {{field: SIZE, direction: DESC}}) {{ edges {{ size node {{ name color }} }} }} }}"#,
parts[0], parts[1]
));
}
if fragments.is_empty() {
continue;
}
let query = format!("{{ {} }}", fragments.join(" "));
let body = serde_json::json!({ "query": query });
let resp = self
.client
.post("https://api.github.com/graphql")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header(header::USER_AGENT, USER_AGENT)
.header(header::CONTENT_TYPE, "application/json")
.json(&body)
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
if !resp.status().is_success() {
warn!(status = %resp.status(), "GraphQL language fetch failed");
break;
}
let data: Value = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
if let Some(errors) = data.get("errors").and_then(Value::as_array) {
if let Some(msg) = errors.first().and_then(|e| e.get("message")).and_then(Value::as_str) {
warn!(error = %msg, "GraphQL language fetch had errors");
}
}
let data_obj = match data.get("data") {
Some(d) => d,
None => continue,
};
let mut languages = Vec::new();
for (i, repo) in chunk.iter().enumerate() {
let alias = format!("r{i}");
let edges = data_obj
.get(&alias)
.and_then(|r| r.get("languages"))
.and_then(|l| l.get("edges"))
.and_then(Value::as_array);
if let Some(edges) = edges {
for edge in edges {
let size = edge.get("size").and_then(Value::as_i64).unwrap_or(0);
let name = edge
.get("node")
.and_then(|n| n.get("name"))
.and_then(Value::as_str);
let color = edge
.get("node")
.and_then(|n| n.get("color"))
.and_then(Value::as_str);
if let Some(name) = name {
languages.push(RepoLanguage {
source: Source::Github,
repo: repo.full_name.clone(),
language: name.to_string(),
bytes: size,
color: color.map(String::from),
});
}
}
}
}
total += self.writer.upsert_repo_languages(&languages).await?;
}
debug!(total, "repo languages updated");
Ok(total)
}
}
#[async_trait]
impl EventSource for GithubRepoSource {
fn name(&self) -> &'static str {
SOURCE_NAME
}
async fn poll(&self) -> Result<usize, SourceError> {
let repos = self.discover_repos().await?;
debug!(repos = repos.len(), "discovered github repos");
let mut total = 0usize;
for repo in &repos {
match self.scan_repo(repo).await {
Ok(n) => {
if n > 0 {
debug!(repo = %repo.full_name, ingested = n, "repo commit scan complete");
}
total += n;
}
Err(SourceError::Http(ref msg)) if msg.starts_with("403") || msg.starts_with("429") => {
warn!("rate limited during repo scan; ending poll early");
break;
}
Err(e) => {
warn!(repo = %repo.full_name, error = %e, "repo scan failed; continuing");
}
}
}
if let Err(e) = self.fetch_languages(&repos).await {
warn!(error = %e, "language fetch failed; continuing");
}
self.state.touch(SOURCE_NAME).await?;
debug!(ingested = total, repos = repos.len(), "github-repo poll complete");
Ok(total)
}
}
#[derive(Debug, Clone)]
struct Repo {
full_name: String,
private: bool,
}
fn parse_repo(item: &Value) -> Option<Repo> {
let full_name = item.get("full_name").and_then(Value::as_str)?;
let private = item.get("private").and_then(Value::as_bool).unwrap_or(false);
Some(Repo {
full_name: full_name.to_string(),
private,
})
}
fn parse_commit(item: &Value, repo: &Repo) -> Option<Event> {
let sha = item.get("sha").and_then(Value::as_str)?;
let date_str = item
.get("commit")
.and_then(|c| c.get("author"))
.and_then(|a| a.get("date"))
.and_then(Value::as_str)
.or_else(|| {
item.get("commit")
.and_then(|c| c.get("committer"))
.and_then(|c| c.get("date"))
.and_then(Value::as_str)
})?;
let occurred_at = DateTime::parse_from_rfc3339(date_str)
.ok()?
.with_timezone(&Utc);
let mut payload = item.clone();
if let Some(obj) = payload.as_object_mut() {
obj.insert("_repo".into(), Value::String(repo.full_name.clone()));
}
Some(Event {
id: format!("github-commit:{sha}"),
source: Source::Github,
action: "Commit".into(),
occurred_at,
public: !repo.private,
payload,
})
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[test]
fn parse_commit_uses_sha_as_id() {
let repo = Repo {
full_name: "grenade/moments".into(),
private: false,
};
let raw = json!({
"sha": "abc123",
"commit": {
"author": { "date": "2024-01-15T10:30:00Z" },
"message": "fix something"
}
});
let ev = parse_commit(&raw, &repo).expect("parses");
assert_eq!(ev.id, "github-commit:abc123");
assert_eq!(ev.action, "Commit");
assert!(ev.public);
}
#[test]
fn parse_commit_private_repo() {
let repo = Repo {
full_name: "grenade/secret".into(),
private: true,
};
let raw = json!({
"sha": "def456",
"commit": {
"author": { "date": "2024-01-15T10:30:00Z" },
"message": "secret change"
}
});
let ev = parse_commit(&raw, &repo).expect("parses");
assert!(!ev.public);
}
#[test]
fn parse_commit_falls_back_to_committer_date() {
let repo = Repo {
full_name: "grenade/moments".into(),
private: false,
};
let raw = json!({
"sha": "ghi789",
"commit": {
"committer": { "date": "2024-02-01T12:00:00Z" },
"message": "no author date"
}
});
let ev = parse_commit(&raw, &repo).expect("parses");
assert_eq!(ev.id, "github-commit:ghi789");
}
#[test]
fn parse_repo_extracts_fields() {
let raw = json!({
"full_name": "grenade/moments",
"private": false
});
let repo = parse_repo(&raw).expect("parses");
assert_eq!(repo.full_name, "grenade/moments");
assert!(!repo.private);
}
}