Files
moments/crates/moments-data/src/hg.rs
rob thijssen ee93429317 feat: language stream graph on dashboard
Full-stack feature showing programming languages by commit activity
as a stream graph on the dashboard.

Backend:
- migration: repo_languages table (source, repo, language, bytes, color)
- worker: fetch language breakdowns via GitHub GraphQL (batched,
  20 repos/request) and Gitea REST API during poll cycles
- API: GET /v1/languages/daily (daily commit counts per language),
  GET /v1/languages/repos (all stored repo language data)
- fix timezone bug in daily_counts and language_daily_counts: the
  PostgreSQL server timezone (Europe/Sofia, UTC+3) shifted day
  boundaries, miscounting events near midnight. Now uses explicit
  UTC boundaries in generate_series JOINs.
- use per-source CASE for repo name extraction in language query
  to match gitea payload structure (repo.full_name vs repo.name)
- Gitea languages use GitHub colors via COALESCE fallback

Frontend:
- LanguageStreamGraph component: pure SVG stream graph, weekly
  buckets, centered baseline, top 8 languages + Other, GitHub
  canonical language colors, legend with color dots
- DashPage/ProjectPage: fetch repo languages once via new endpoint
  instead of per-repo forge proxy calls (eliminates 200+ GitHub
  API calls and 403 rate limit errors)
- removed fetchLanguages forge proxy wrapper (dead code)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-06 06:27:59 +03:00

280 lines
9.0 KiB
Rust

//! hg-edge.mozilla.org changeset ingestion via `json-log` revset queries.
//!
//! Uses the `json-log?rev=author(term)` endpoint which returns changesets
//! by the *author* (not the pusher), so it captures commits landed by
//! sheriffs on behalf of the contributor.
//!
//! Repos are discovered within configured groups (e.g. `build`) via the
//! `/{group}/?style=json` index, plus any individually listed repos
//! (e.g. `mozilla-central`). Once the first scan pass completes, the
//! poller state is touched — even if individual repo scans failed along
//! the way — and all subsequent polls are skipped, because the data is
//! historical and will not change.
use std::sync::Arc;
use async_trait::async_trait;
use chrono::{DateTime, TimeZone, Utc};
use moments_core::{EventSource, EventWriter, PollerStateStore, SourceError};
use moments_entities::{Event, Source};
use reqwest::{Client, header};
use serde_json::Value;
use tracing::{debug, warn};
/// Name of this source: returned by `EventSource::name` and used as the key
/// when loading and touching poller state.
const SOURCE_NAME: &str = "hg";
/// `User-Agent` header value sent with every hgweb request: `moments/`
/// followed by the crate version (captured at compile time) and a
/// parenthesised URL.
const USER_AGENT: &str = concat!(
"moments/",
env!("CARGO_PKG_VERSION"),
" (+https://rob.tn)"
);
/// Maximum changesets returned per json-log request; sent as the `revcount`
/// query parameter.
const REV_COUNT: u32 = 500;
/// Settings that select which hgweb host, repositories and author terms are
/// scanned.
#[derive(Clone, Debug)]
pub struct HgConfig {
/// Hostname of the hgweb server; request URLs are built as
/// `https://{host}/...`.
pub host: String,
/// Substrings matched via `author(term)` revset queries.
pub author_terms: Vec<String>,
/// Repo groups to scan — each is enumerated via `/{group}/?style=json`.
pub groups: Vec<String>,
/// Individual repos to scan (e.g. `mozilla-central`).
pub repos: Vec<String>,
}
impl Default for HgConfig {
fn default() -> Self {
Self {
host: "hg-edge.mozilla.org".into(),
author_terms: vec!["rthijssen".into(), "grenade".into()],
groups: vec!["build".into(), "integration".into()],
repos: vec!["mozilla-central".into()],
}
}
}
/// Event source that ingests Mercurial changesets from an hgweb server.
pub struct HgSource {
/// HTTP client used for all hgweb requests.
client: Client,
/// Sink that receives the events built from changesets.
writer: Arc<dyn EventWriter>,
/// Poller-state store consulted to decide whether the scan already ran.
state: Arc<dyn PollerStateStore>,
/// Host, author terms, groups and repos to scan.
config: HgConfig,
}
impl HgSource {
pub fn new(
client: Client,
writer: Arc<dyn EventWriter>,
state: Arc<dyn PollerStateStore>,
config: HgConfig,
) -> Self {
Self {
client,
writer,
state,
config,
}
}
/// Discover repos in a group via `/{group}/?style=json`.
async fn discover_repos(&self, group: &str) -> Result<Vec<String>, SourceError> {
let url = format!("https://{}/{}/?style=json", self.config.host, group);
let resp = self
.client
.get(&url)
.header(header::USER_AGENT, USER_AGENT)
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
if !resp.status().is_success() {
warn!(group, status = %resp.status(), "failed to discover repos in group");
return Ok(vec![]);
}
let body: Value = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
Ok(body
.get("entries")
.and_then(Value::as_array)
.map(|entries| {
entries
.iter()
.filter_map(|e| {
e.get("name")
.and_then(Value::as_str)
.map(|name| format!("{group}/{name}"))
})
.collect()
})
.unwrap_or_default())
}
fn log_url(&self, repo: &str, author_term: &str) -> String {
format!(
"https://{}/{}/json-log?rev=author({})&style=json&revcount={}",
self.config.host, repo, author_term, REV_COUNT
)
}
async fn scan_repo(&self, repo: &str) -> Result<usize, SourceError> {
let mut all_events = Vec::new();
for term in &self.config.author_terms {
let url = self.log_url(repo, term);
let resp = self
.client
.get(&url)
.header(header::USER_AGENT, USER_AGENT)
.send()
.await
.map_err(|e| SourceError::Http(e.to_string()))?;
if !resp.status().is_success() {
return Err(SourceError::Http(format!("{} GET {}", resp.status(), url)));
}
let body: Value = resp
.json()
.await
.map_err(|e| SourceError::Parse(e.to_string()))?;
if let Some(entries) = body.get("entries").and_then(Value::as_array) {
for entry in entries {
let node = entry.get("node").and_then(Value::as_str).unwrap_or("");
if node.is_empty() {
continue;
}
let occurred_at = entry
.get("date")
.and_then(Value::as_array)
.and_then(|a| parse_hg_date(a))
.unwrap_or_else(Utc::now);
let mut payload = entry.clone();
if let Some(obj) = payload.as_object_mut() {
obj.insert("_repo".into(), Value::String(repo.into()));
obj.insert(
"_host".into(),
Value::String(self.config.host.clone()),
);
}
all_events.push(Event {
id: format!("hg:{repo}:{node}"),
source: Source::Hg,
action: "Commit".into(),
occurred_at,
public: true,
payload,
});
}
}
}
Ok(self.writer.upsert_events(&all_events).await?)
}
}
#[async_trait]
impl EventSource for HgSource {
    /// Identifier of this source (`SOURCE_NAME`).
    fn name(&self) -> &'static str {
        SOURCE_NAME
    }

    /// Run the one-off scan unless poller state already exists for this
    /// source, returning the total number of events the writer reported.
    ///
    /// Repos come from the configured list plus every repo discovered in the
    /// configured groups. A failing repo scan is logged and skipped; a
    /// failing group discovery aborts the poll. The poller state is touched
    /// after the loop regardless of individual scan failures, so repos that
    /// failed are presumably not retried on later polls.
    async fn poll(&self) -> Result<usize, SourceError> {
        // hg repos are archived — one complete scan is sufficient.
        let already_scanned = self.state.load(SOURCE_NAME).await?.is_some();
        if already_scanned {
            debug!("hg already backfilled, skipping");
            return Ok(0);
        }

        // Explicitly listed repos first, then whatever each group exposes.
        let mut targets: Vec<String> = self.config.repos.clone();
        for group in &self.config.groups {
            let found = self.discover_repos(group).await?;
            debug!(group, repos = found.len(), "discovered hg repos");
            targets.extend(found);
        }

        let mut grand_total = 0usize;
        for repo in &targets {
            let count = match self.scan_repo(repo).await {
                Ok(count) => count,
                Err(e) => {
                    warn!(repo, error = %e, "hg repo scan failed; continuing");
                    continue;
                }
            };
            if count > 0 {
                debug!(repo, ingested = count, "hg repo scan complete");
            }
            grand_total += count;
        }

        self.state.touch(SOURCE_NAME).await?;
        debug!(ingested = grand_total, "hg backfill complete");
        Ok(grand_total)
    }
}
/// Parse a hgweb date array `[seconds, tz_offset_secs]` into UTC.
fn parse_hg_date(arr: &[Value]) -> Option<DateTime<Utc>> {
let secs = arr.first()?.as_f64()? as i64;
Utc.timestamp_opt(secs, 0).single()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Event sink stub: accepts any input and reports zero rows written.
    struct NoopWriter;

    #[async_trait]
    impl EventWriter for NoopWriter {
        async fn upsert_events(
            &self,
            _events: &[Event],
        ) -> Result<usize, moments_core::StoreError> {
            Ok(0)
        }

        async fn upsert_repo_languages(
            &self,
            _languages: &[moments_entities::RepoLanguage],
        ) -> Result<usize, moments_core::StoreError> {
            Ok(0)
        }
    }

    /// Poller-state stub: never has stored state and ignores all writes.
    struct NoopState;

    #[async_trait]
    impl PollerStateStore for NoopState {
        async fn load(
            &self,
            _source: &str,
        ) -> Result<Option<moments_core::PollerState>, moments_core::StoreError> {
            Ok(None)
        }

        async fn save(
            &self,
            _source: &str,
            _etag: Option<&str>,
            _last_modified: Option<DateTime<Utc>>,
        ) -> Result<(), moments_core::StoreError> {
            Ok(())
        }

        async fn touch(&self, _source: &str) -> Result<(), moments_core::StoreError> {
            Ok(())
        }
    }

    /// Build an `HgSource` wired to the stubs above with the default config.
    fn stub_source() -> HgSource {
        HgSource::new(
            Client::new(),
            Arc::new(NoopWriter),
            Arc::new(NoopState),
            HgConfig::default(),
        )
    }

    #[test]
    fn parse_hg_date_handles_seconds() {
        let raw = [Value::from(1_700_000_000_f64), Value::from(0_f64)];
        let parsed = parse_hg_date(&raw).expect("parses");
        assert_eq!(parsed.timestamp(), 1_700_000_000);
    }

    #[test]
    fn log_url_uses_revset_author_query() {
        let url = stub_source().log_url("mozilla-central", "thijssen");
        assert!(url.contains("json-log?rev=author(thijssen)"));
        assert!(url.contains("revcount=500"));
    }
}