Files
moments/crates/moments-data/src/gitea.rs
rob thijssen ee93429317 feat: language stream graph on dashboard
Full-stack feature showing programming languages by commit activity
as a stream graph on the dashboard.

Backend:
- migration: repo_languages table (source, repo, language, bytes, color)
- worker: fetch language breakdowns via GitHub GraphQL (batched,
  20 repos/request) and Gitea REST API during poll cycles
- API: GET /v1/languages/daily (daily commit counts per language),
  GET /v1/languages/repos (all stored repo language data)
- fix timezone bug in daily_counts and language_daily_counts: the
  PostgreSQL server timezone (Europe/Sofia, UTC+3) shifted day
  boundaries, miscounting events near midnight. Now uses explicit
  UTC boundaries in generate_series JOINs.
- use per-source CASE for repo name extraction in language query
  to match gitea payload structure (repo.full_name vs repo.name)
- Gitea languages use GitHub colors via COALESCE fallback

Frontend:
- LanguageStreamGraph component: pure SVG stream graph, weekly
  buckets, centered baseline, top 8 languages + Other, GitHub
  canonical language colors, legend with color dots
- DashPage/ProjectPage: fetch repo languages once via new endpoint
  instead of per-repo forge proxy calls (eliminates 200+ GitHub
  API calls and 403 rate limit errors)
- removed fetchLanguages forge proxy wrapper (dead code)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-06 06:27:59 +03:00

375 lines
13 KiB
Rust

//! Gitea activity feed ingestion.
//!
//! Hits `/api/v1/users/{user}/activities/feeds?only-performed-by=true`
//! which returns events the user themselves caused (not received events
//! from others they follow). No ETag support upstream, so each tick fetches
//! page 1 and relies on idempotent upsert. First run paginates further to
//! seed history.
//!
//! Each item carries a self-contained payload — including the event-emitting
//! host — so the reshape layer can construct URLs without needing config.
use std::collections::HashSet;
use std::sync::Arc;
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use moments_core::{EventSource, EventWriter, PollerStateStore, SourceError};
use moments_entities::{Event, RepoLanguage, Source};
use reqwest::{Client, header};
use serde_json::Value;
use tracing::debug;
/// Name reported by `EventSource::name`; also used as the poller state key
/// for the user's own activity feed.
const SOURCE_NAME: &str = "gitea";
/// Value of the `User-Agent` header attached by `apply_headers`: the crate
/// version (resolved at compile time) followed by a URL.
const USER_AGENT: &str = concat!(
"moments/",
env!("CARGO_PKG_VERSION"),
" (+https://rob.tn)"
);
/// Maximum number of pages fetched from a feed when no poller state exists
/// for it yet (first-run backfill). Later polls fetch a single page.
const MAX_BACKFILL_PAGES: u32 = 20;
/// Settings for one Gitea instance and the user whose activity is ingested.
#[derive(Clone, Debug)]
pub struct GiteaConfig {
/// e.g. `git.lair.cafe`. Used to construct URLs the API doesn't return
/// directly (issue / PR / commit web links) and stamped into each event
/// payload for the reshape layer. Every API request in this module is
/// also sent to this host over HTTPS.
pub host: String,
/// Login of the user whose feed is polled. Org feed items are kept only
/// when their `act_user.login` matches this value (ASCII case-insensitive).
pub user: String,
/// Optional API token, sent as `Authorization: token <value>`. When `None`,
/// organization discovery is skipped.
pub token: Option<String>,
/// Page size sent as the `limit` query parameter on feed requests. A page
/// with fewer items than this ends pagination.
pub per_page: u32,
}
impl Default for GiteaConfig {
fn default() -> Self {
Self {
host: "git.lair.cafe".into(),
user: "grenade".into(),
token: None,
per_page: 50,
}
}
}
/// Event source that polls Gitea activity feeds (the user's own feed plus
/// the feeds of discovered organizations) and refreshes repo language data.
pub struct GiteaSource {
/// HTTP client used for every Gitea API request.
client: Client,
/// Destination for parsed events and repo language rows.
writer: Arc<dyn EventWriter>,
/// Per-feed poller state; a feed with no stored state gets the paginated
/// first-run backfill.
state: Arc<dyn PollerStateStore>,
/// Host, user, token and page size.
config: GiteaConfig,
}
impl GiteaSource {
pub fn new(
client: Client,
writer: Arc<dyn EventWriter>,
state: Arc<dyn PollerStateStore>,
config: GiteaConfig,
) -> Self {
Self {
client,
writer,
state,
config,
}
}
/// URL of the configured user's activity feed, restricted to events the
/// user performed and sized by `per_page`. The caller appends `&page=N`.
fn user_feed_base_url(&self) -> String {
    let cfg = &self.config;
    format!(
        "https://{}/api/v1/users/{}/activities/feeds?only-performed-by=true&limit={}",
        cfg.host, cfg.user, cfg.per_page
    )
}
/// URL of the activity feed for organization `org`, sized by `per_page`.
/// The caller appends `&page=N`.
fn org_feed_base_url(&self, org: &str) -> String {
    let cfg = &self.config;
    format!(
        "https://{}/api/v1/orgs/{}/activities/feeds?limit={}",
        cfg.host, org, cfg.per_page
    )
}
/// Attach the headers every Gitea request carries: `Accept: application/json`,
/// the crate's `User-Agent`, and — when a token is configured —
/// `Authorization: token <value>`.
fn apply_headers(&self, mut req: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
    req = req.header(header::ACCEPT, "application/json");
    req = req.header(header::USER_AGENT, USER_AGENT);
    match &self.config.token {
        Some(token) => req.header(header::AUTHORIZATION, format!("token {token}")),
        None => req,
    }
}
/// List the organizations of the authenticated user via
/// `GET /api/v1/user/orgs`, returning each entry's `username`.
///
/// Returns an empty vec without making a request when no token is
/// configured, and also when the server answers with a non-success status
/// (logged as a warning). Entries without a string `username` are dropped.
///
/// # Errors
///
/// A transport failure is returned as `SourceError::Http`; a body that does
/// not deserialize as a JSON array is returned as `SourceError::Parse`.
async fn discover_orgs(&self) -> Result<Vec<String>, SourceError> {
    if self.config.token.is_none() {
        return Ok(Vec::new());
    }

    let url = format!("https://{}/api/v1/user/orgs", self.config.host);
    let resp = self
        .apply_headers(self.client.get(&url))
        .send()
        .await
        .map_err(|e| SourceError::Http(e.to_string()))?;

    let status = resp.status();
    if !status.is_success() {
        tracing::warn!(status = %status, "failed to discover gitea orgs");
        return Ok(Vec::new());
    }

    let orgs: Vec<Value> = resp
        .json()
        .await
        .map_err(|e| SourceError::Parse(e.to_string()))?;

    let mut names = Vec::new();
    for org in &orgs {
        if let Some(name) = org.get("username").and_then(Value::as_str) {
            names.push(name.to_owned());
        }
    }
    Ok(names)
}
/// Poll a single activity feed and upsert its events.
///
/// When no state is stored under `state_key` the feed is treated as new and
/// up to `MAX_BACKFILL_PAGES` pages are fetched; otherwise only page 1 is
/// fetched. Pagination also stops at the first empty page or the first page
/// shorter than `per_page`.
///
/// When `filter_user` is true, only items whose `act_user.login` matches
/// `self.config.user` (ASCII case-insensitive) are considered (used for org
/// feeds, which contain all members' activity). Items that fail the filter
/// contribute neither an event nor a repo name, so repositories touched only
/// by other members are not reported to the caller.
///
/// `base_url` should contain everything except the `&page=N` suffix.
///
/// Returns `(ingested_count, repo_full_names)`, where the count is the sum of
/// the writer's upsert results and the set holds `repo.full_name` of every
/// considered item.
///
/// # Errors
///
/// Transport failures and non-success statuses are returned as
/// `SourceError::Http`, undecodable bodies as `SourceError::Parse`; state
/// store and writer errors are propagated. The state key is only touched
/// after every page succeeded, so a failed backfill is retried in full.
async fn poll_feed(
    &self,
    state_key: &str,
    base_url: &str,
    filter_user: bool,
) -> Result<(usize, HashSet<String>), SourceError> {
    let first_run = self.state.load(state_key).await?.is_none();
    let max_pages = if first_run { MAX_BACKFILL_PAGES } else { 1 };

    let mut total = 0usize;
    let mut repos = HashSet::new();

    for page in 1..=max_pages {
        let url = format!("{base_url}&page={page}");
        let resp = self
            .apply_headers(self.client.get(&url))
            .send()
            .await
            .map_err(|e| SourceError::Http(e.to_string()))?;
        if !resp.status().is_success() {
            return Err(SourceError::Http(format!("{} GET {}", resp.status(), url)));
        }

        let items: Vec<Value> = resp
            .json()
            .await
            .map_err(|e| SourceError::Parse(e.to_string()))?;
        if items.is_empty() {
            break;
        }

        let mut events: Vec<Event> = Vec::with_capacity(items.len());
        for item in &items {
            // Apply the user filter first so that other members' items in an
            // org feed add neither events nor repo names.
            if filter_user {
                let by_user = item
                    .get("act_user")
                    .and_then(|u| u.get("login"))
                    .and_then(Value::as_str)
                    .map(|login| login.eq_ignore_ascii_case(&self.config.user))
                    .unwrap_or(false);
                if !by_user {
                    continue;
                }
            }

            if let Some(name) = item
                .get("repo")
                .and_then(|r| r.get("full_name"))
                .and_then(Value::as_str)
            {
                repos.insert(name.to_string());
            }

            if let Some(event) = parse_gitea_event(item, &self.config.host) {
                events.push(event);
            }
        }

        total += self.writer.upsert_events(&events).await?;

        // A short page means there is nothing further to fetch.
        if items.len() < self.config.per_page as usize {
            break;
        }
    }

    self.state.touch(state_key).await?;
    Ok((total, repos))
}
/// Fetch language breakdowns for the given repos via the Gitea REST API.
async fn fetch_languages(&self, repos: &HashSet<String>) -> Result<usize, SourceError> {
let mut total = 0usize;
for repo in repos {
let url = format!(
"https://{}/api/v1/repos/{}/languages",
self.config.host, repo
);
let req = self.apply_headers(self.client.get(&url));
let resp = req
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
if !resp.status().is_success() {
tracing::warn!(repo = %repo, status = %resp.status(), "gitea language fetch failed; skipping");
continue;
}
let lang_map: std::collections::HashMap<String, i64> = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
let languages: Vec<RepoLanguage> = lang_map
.into_iter()
.map(|(language, bytes)| RepoLanguage {
source: Source::Gitea,
repo: repo.clone(),
language,
bytes,
color: None, // Gitea doesn't return colors
})
.collect();
total += self.writer.upsert_repo_languages(&languages).await?;
}
debug!(total, repos = repos.len(), "gitea repo languages updated");
Ok(total)
}
}
#[async_trait]
impl EventSource for GiteaSource {
fn name(&self) -> &'static str {
SOURCE_NAME
}
async fn poll(&self) -> Result<usize, SourceError> {
let mut all_repos = HashSet::new();
// Poll user's own activity feed (existing behavior).
let user_url = self.user_feed_base_url();
let (mut total, repos) = self.poll_feed(SOURCE_NAME, &user_url, false).await?;
all_repos.extend(repos);
// Discover orgs and poll each org's activity feed, filtering for
// events performed by this user.
let orgs = self.discover_orgs().await?;
for org in &orgs {
let state_key = format!("gitea:org:{org}");
let org_url = self.org_feed_base_url(org);
match self.poll_feed(&state_key, &org_url, true).await {
Ok((n, repos)) => {
total += n;
all_repos.extend(repos);
}
Err(e) => {
tracing::warn!(org = %org, error = %e, "failed to poll org feed");
}
}
}
if let Err(e) = self.fetch_languages(&all_repos).await {
tracing::warn!(error = %e, "gitea language fetch failed; continuing");
}
debug!(ingested = total, orgs = orgs.len(), "gitea poll complete");
Ok(total)
}
}
/// Build an `Event` row from one Gitea activity feed item.
///
/// Returns `None` unless the item has an integer `id`, a string `op_type`
/// and a string `created` that parses as RFC 3339. The timestamp is
/// converted to UTC. An item is public unless `is_private` is `true`
/// (a missing or non-boolean flag counts as not private).
///
/// The payload is a copy of the item with `host` added under `_host`, so
/// the reshape layer can build web URLs without needing global config.
fn parse_gitea_event(item: &Value, host: &str) -> Option<Event> {
    let id = item.get("id")?.as_i64()?;
    let action = item.get("op_type")?.as_str()?.to_owned();
    let created = item.get("created")?.as_str()?;
    let occurred_at = match DateTime::parse_from_rfc3339(created) {
        Ok(stamp) => stamp.with_timezone(&Utc),
        Err(_) => return None,
    };

    let is_private = match item.get("is_private").and_then(Value::as_bool) {
        Some(flag) => flag,
        None => false,
    };

    let mut payload = item.clone();
    if let Value::Object(fields) = &mut payload {
        fields.insert(String::from("_host"), Value::String(host.to_owned()));
    }

    Some(Event {
        id: format!("gitea:{id}"),
        source: Source::Gitea,
        action,
        occurred_at,
        public: !is_private,
        payload,
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A public commit item parses, and the host is stamped into the payload.
    #[test]
    fn parse_commit_repo() {
        let item = json!({
            "id": 973,
            "op_type": "commit_repo",
            "ref_name": "refs/heads/main",
            "is_private": false,
            "content": "{\"Commits\":[{\"Sha1\":\"abc123\"}],\"Len\":1}",
            "created": "2026-05-03T16:37:45Z",
            "repo": { "full_name": "grenade/moments" }
        });

        let event = parse_gitea_event(&item, "git.lair.cafe").expect("parses");

        assert_eq!(event.id, "gitea:973");
        assert_eq!(event.source, Source::Gitea);
        assert_eq!(event.action, "commit_repo");
        assert!(event.public);
        // host stamped into payload
        let stamped_host = event.payload.get("_host").and_then(Value::as_str);
        assert_eq!(stamped_host, Some("git.lair.cafe"));
    }

    /// Exercises a local copy of the `act_user.login` predicate that
    /// `poll_feed` applies to org feeds.
    #[test]
    fn org_event_user_filter_predicate() {
        let own_item = json!({
            "id": 500, "op_type": "commit_repo", "is_private": false,
            "created": "2026-05-03T10:00:00Z",
            "act_user": { "login": "grenade" },
            "repo": { "full_name": "myorg/somerepo" }
        });
        let foreign_item = json!({
            "id": 501, "op_type": "commit_repo", "is_private": false,
            "created": "2026-05-03T10:01:00Z",
            "act_user": { "login": "otherperson" },
            "repo": { "full_name": "myorg/somerepo" }
        });

        // Both parse as valid events
        for item in [&own_item, &foreign_item] {
            assert!(parse_gitea_event(item, "git.lair.cafe").is_some());
        }

        // The user-filter predicate used by poll_feed
        fn performed_by(item: &Value, user: &str) -> bool {
            match item
                .get("act_user")
                .and_then(|actor| actor.get("login"))
                .and_then(Value::as_str)
            {
                Some(login) => login.eq_ignore_ascii_case(user),
                None => false,
            }
        }

        assert!(performed_by(&own_item, "grenade"));
        assert!(!performed_by(&foreign_item, "grenade"));
        // Case-insensitive match
        assert!(performed_by(&own_item, "Grenade"));
    }

    /// `is_private: true` yields a non-public event.
    #[test]
    fn private_event_marked_private() {
        let item = json!({
            "id": 100,
            "op_type": "commit_repo",
            "is_private": true,
            "created": "2026-05-03T00:00:00Z",
            "repo": { "full_name": "grenade/private" }
        });

        let event = parse_gitea_event(&item, "git.lair.cafe").expect("parses");

        assert!(!event.public);
    }
}