fix: weight language graph by repo language proportions

Each commit was counted once per language in the repo regardless of
that language's share, so Shell (present in many repos as small
deploy scripts) appeared larger than Rust. Now weights each commit
by the language's byte proportion in the repo (e.g. a commit to a
95% Rust / 5% Shell repo contributes 0.95 to Rust, 0.05 to Shell).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-06 06:59:47 +03:00
parent ee93429317
commit 14643273c0

View File

@@ -229,36 +229,46 @@ impl EventReader for PgStore {
async fn language_daily_counts(&self, from: NaiveDate, to: NaiveDate) -> Result<Vec<LanguageDailyCount>, StoreError> {
let rows = sqlx::query(
r#"
SELECT d::date AS date,
rl.language,
COALESCE(MAX(rl.color),
(SELECT color FROM repo_languages
WHERE language = rl.language AND color IS NOT NULL
LIMIT 1)
) AS color,
COUNT(e.id)::bigint AS commits
FROM generate_series($1::date, $2::date, '1 day') d
JOIN events e
ON e.occurred_at >= (d::date || 'T00:00:00Z')::timestamptz
AND e.occurred_at < ((d::date + 1) || 'T00:00:00Z')::timestamptz
AND e.public = true
AND e.action IN ('Commit', 'PushEvent', 'commit_repo')
JOIN repo_languages rl
ON rl.source = e.source
AND rl.repo = CASE e.source
WHEN 'github' THEN COALESCE(
e.payload->'repo'->>'name',
e.payload->'repository'->>'full_name',
e.payload->>'_repo'
)
WHEN 'gitea' THEN COALESCE(
e.payload->'repo'->>'full_name',
e.payload->'repo'->>'name'
)
ELSE NULL
END
GROUP BY d::date, rl.language
ORDER BY d::date, commits DESC
SELECT date, language, color,
ROUND(SUM(weight))::bigint AS commits
FROM (
SELECT d::date AS date,
rl.language,
COALESCE(rl.color,
(SELECT color FROM repo_languages
WHERE language = rl.language AND color IS NOT NULL
LIMIT 1)
) AS color,
rl.bytes::float / NULLIF(rt.total, 0) AS weight
FROM generate_series($1::date, $2::date, '1 day') d
JOIN events e
ON e.occurred_at >= (d::date || 'T00:00:00Z')::timestamptz
AND e.occurred_at < ((d::date + 1) || 'T00:00:00Z')::timestamptz
AND e.public = true
AND e.action IN ('Commit', 'PushEvent', 'commit_repo')
JOIN repo_languages rl
ON rl.source = e.source
AND rl.repo = CASE e.source
WHEN 'github' THEN COALESCE(
e.payload->'repo'->>'name',
e.payload->'repository'->>'full_name',
e.payload->>'_repo'
)
WHEN 'gitea' THEN COALESCE(
e.payload->'repo'->>'full_name',
e.payload->'repo'->>'name'
)
ELSE NULL
END
JOIN LATERAL (
SELECT SUM(bytes)::float AS total
FROM repo_languages r2
WHERE r2.source = rl.source AND r2.repo = rl.repo
) rt ON true
) weighted
GROUP BY date, language, color
HAVING ROUND(SUM(weight)) > 0
ORDER BY date, commits DESC
"#,
)
.bind(from)