fix: weight language graph by repo language proportions

Each commit was counted once per language in the repo regardless of
that language's share, so Shell (present in many repos as small
deploy scripts) appeared larger than Rust. Each commit is now weighted
by the language's byte proportion in the repo (e.g. a commit to a
95% Rust / 5% Shell repo contributes 0.95 to Rust and 0.05 to Shell).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-06 06:59:47 +03:00
parent ee93429317
commit 14643273c0

View File

@@ -229,36 +229,46 @@ impl EventReader for PgStore {
async fn language_daily_counts(&self, from: NaiveDate, to: NaiveDate) -> Result<Vec<LanguageDailyCount>, StoreError> { async fn language_daily_counts(&self, from: NaiveDate, to: NaiveDate) -> Result<Vec<LanguageDailyCount>, StoreError> {
let rows = sqlx::query( let rows = sqlx::query(
r#" r#"
SELECT d::date AS date, SELECT date, language, color,
rl.language, ROUND(SUM(weight))::bigint AS commits
COALESCE(MAX(rl.color), FROM (
(SELECT color FROM repo_languages SELECT d::date AS date,
WHERE language = rl.language AND color IS NOT NULL rl.language,
LIMIT 1) COALESCE(rl.color,
) AS color, (SELECT color FROM repo_languages
COUNT(e.id)::bigint AS commits WHERE language = rl.language AND color IS NOT NULL
FROM generate_series($1::date, $2::date, '1 day') d LIMIT 1)
JOIN events e ) AS color,
ON e.occurred_at >= (d::date || 'T00:00:00Z')::timestamptz rl.bytes::float / NULLIF(rt.total, 0) AS weight
AND e.occurred_at < ((d::date + 1) || 'T00:00:00Z')::timestamptz FROM generate_series($1::date, $2::date, '1 day') d
AND e.public = true JOIN events e
AND e.action IN ('Commit', 'PushEvent', 'commit_repo') ON e.occurred_at >= (d::date || 'T00:00:00Z')::timestamptz
JOIN repo_languages rl AND e.occurred_at < ((d::date + 1) || 'T00:00:00Z')::timestamptz
ON rl.source = e.source AND e.public = true
AND rl.repo = CASE e.source AND e.action IN ('Commit', 'PushEvent', 'commit_repo')
WHEN 'github' THEN COALESCE( JOIN repo_languages rl
e.payload->'repo'->>'name', ON rl.source = e.source
e.payload->'repository'->>'full_name', AND rl.repo = CASE e.source
e.payload->>'_repo' WHEN 'github' THEN COALESCE(
) e.payload->'repo'->>'name',
WHEN 'gitea' THEN COALESCE( e.payload->'repository'->>'full_name',
e.payload->'repo'->>'full_name', e.payload->>'_repo'
e.payload->'repo'->>'name' )
) WHEN 'gitea' THEN COALESCE(
ELSE NULL e.payload->'repo'->>'full_name',
END e.payload->'repo'->>'name'
GROUP BY d::date, rl.language )
ORDER BY d::date, commits DESC ELSE NULL
END
JOIN LATERAL (
SELECT SUM(bytes)::float AS total
FROM repo_languages r2
WHERE r2.source = rl.source AND r2.repo = rl.repo
) rt ON true
) weighted
GROUP BY date, language, color
HAVING ROUND(SUM(weight)) > 0
ORDER BY date, commits DESC
"#, "#,
) )
.bind(from) .bind(from)