feat: add analytics subcommand for mbox sender analysis

Adds a new `analytics` subcommand that analyzes Google Takeout mbox files
to identify top senders by message count. Designed for efficient processing
of large files (60GB+) with minimal memory usage.

Features:
- Streams files line-by-line with 1MB buffer (never loads entire file)
- Extracts sender email addresses from From: headers
- Counts messages per sender and displays top N (default 10)
- Shows progress output every 10,000 messages
- No Gmail API access needed

Usage:
  cull-gmail analytics <MBOX_FILE> [-n TOP]

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-03-16 15:12:33 +02:00
parent aee4bc2eaa
commit 285a42a7a3
3 changed files with 176 additions and 1 deletions

View File

@@ -0,0 +1,89 @@
//! # Analytics CLI Module
//!
//! Analyze mbox files to extract sender statistics.
//! Efficiently processes large Google Takeout exports without loading files into memory.
use clap::Parser;
use std::{collections::HashMap, fs::File, io::{BufRead, BufReader}, path::PathBuf};
use cull_gmail::Result;
// Command-line arguments for the `analytics` subcommand.
// NOTE(review): this type derives `clap::Parser`; the `///` text below is presumably
// surfaced by clap in generated help output, so treat it as user-facing wording —
// confirm before rewording it.
/// Analyze an mbox file for sender statistics.
///
/// Parses Google Takeout mbox files to count messages by sender.
/// Efficient memory usage: uses streaming line-by-line parsing even for 60GB+ files.
///
/// # Examples
///
/// ```bash
/// # Show top 10 senders (default)
/// cull-gmail analytics ~/takeout/All\ mail\ Including\ Spam\ and\ Trash.mbox
///
/// # Show top 20 senders
/// cull-gmail analytics -n 20 ~/takeout/All\ mail.mbox
/// ```
#[derive(Debug, Parser)]
pub struct AnalyticsCli {
// Positional argument (no `short`/`long` is declared); `run` opens this path with
// `File::open` and reads it sequentially.
/// Path to mbox file to analyze.
#[arg(value_name = "MBOX_FILE", help = "Path to mbox file to analyze")]
pub mbox_file: PathBuf,
// Exposed as `-n` / `--top` with a default of 10; `run` uses it to cap how many
// sender rows are printed (fewer are shown when the file has fewer distinct senders).
/// Number of top senders to display.
#[arg(
short = 'n',
long = "top",
default_value = "10",
help = "Number of top senders to display"
)]
pub top: usize,
}
impl AnalyticsCli {
    /// Scans the mbox file and prints the most frequent senders to stdout.
    ///
    /// The file is read sequentially, one line at a time, through a single reusable
    /// byte buffer. A line starting with `From ` begins a new message; the first empty
    /// line after it ends that message's header section. Within the headers, the value
    /// of the `From:` field (including any folded continuation lines) is passed to
    /// `extract_email` and tallied once per message.
    ///
    /// Lines are handled as raw bytes and only the `From:` value is decoded (lossily),
    /// so message bodies or headers containing non-UTF-8 bytes do not abort the scan.
    ///
    /// Progress is written to stderr every 10,000 messages; the final table lists at
    /// most `self.top` senders, ordered by descending count and then by address so the
    /// output is stable between runs.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened or an I/O error occurs while
    /// reading it.
    pub fn run(&self) -> Result<()> {
        let file = File::open(&self.mbox_file)?;
        // Use 1MB buffer for efficient sequential reads on large files (e.g. 60GB takeout)
        let mut reader = BufReader::with_capacity(1024 * 1024, file);

        let mut counts: HashMap<String, usize> = HashMap::new();
        // Raw (unfolded, not yet parsed) `From:` value of the message being scanned.
        let mut current_from: Option<String> = None;
        let mut in_headers = false;
        // True while the most recent header field seen was `From:`, so that folded
        // continuation lines are appended to it rather than to some other field.
        let mut in_from_header = false;
        let mut message_count: usize = 0;
        // Reused for every line to avoid one allocation per line.
        let mut buf: Vec<u8> = Vec::new();

        loop {
            buf.clear();
            if reader.read_until(b'\n', &mut buf)? == 0 {
                break;
            }
            // Drop the line terminator: `\n`, or `\r\n`.
            let line = match buf.strip_suffix(b"\n") {
                Some(rest) => rest.strip_suffix(b"\r").unwrap_or(rest),
                None => buf.as_slice(),
            };

            if line.starts_with(b"From ") {
                // Separator line: close out the previous message before starting a new one.
                if let Some(raw) = current_from.take() {
                    *counts.entry(extract_email(raw.trim())).or_insert(0) += 1;
                }
                in_headers = true;
                in_from_header = false;
                message_count += 1;
                if message_count % 10_000 == 0 {
                    eprint!("\r[INFO] Scanned {} messages...", message_count);
                }
            } else if line.is_empty() {
                in_headers = false;
                in_from_header = false;
            } else if in_headers {
                if matches!(line.first(), Some(b' ' | b'\t')) {
                    // Folded header: a line starting with whitespace continues the
                    // previous field. Only the `From:` field is of interest here.
                    if in_from_header {
                        if let Some(raw) = current_from.as_mut() {
                            raw.push_str(&String::from_utf8_lossy(line));
                        }
                    }
                } else if line
                    .get(..5)
                    .is_some_and(|prefix| prefix.eq_ignore_ascii_case(b"from:"))
                {
                    current_from = Some(String::from_utf8_lossy(&line[5..]).into_owned());
                    in_from_header = true;
                } else {
                    in_from_header = false;
                }
            }
        }

        // The last message has no trailing separator line to trigger its tally.
        if let Some(raw) = current_from {
            *counts.entry(extract_email(raw.trim())).or_insert(0) += 1;
        }
        eprintln!("\r[INFO] Scanned {} messages total.", message_count);

        let mut sorted: Vec<_> = counts.iter().collect();
        // Highest count first; ties broken by address so HashMap iteration order
        // never leaks into the output.
        sorted.sort_unstable_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));

        let shown = self.top.min(sorted.len());
        println!("Top {} senders:", shown);
        for (sender, count) in sorted.iter().take(shown) {
            println!(" {:6} {}", count, sender);
        }
        Ok(())
    }
}
/// Extracts a normalized sender address from a `From:` header value.
///
/// If the value contains an angle-bracketed part (`Display Name <user@host>`), the
/// text between the last `<` and the first `>` after it is used; searching from the
/// right keeps a `<` inside a quoted display name from being mistaken for the
/// address. If there is no such pair, the whole value is used.
///
/// The result is trimmed of surrounding whitespace and lower-cased so that
/// `< User@Host >` and `<user@host>` are counted as the same sender.
fn extract_email(from: &str) -> String {
    let address = from
        .rfind('<')
        .and_then(|open| {
            let after_open = &from[open + 1..];
            after_open.find('>').map(|close| &after_open[..close])
        })
        .unwrap_or(from);
    address.trim().to_lowercase()
}

View File

@@ -111,6 +111,7 @@
use clap::{Parser, Subcommand};
mod analytics_cli;
mod init_cli;
mod labels_cli;
mod messages_cli;
@@ -121,6 +122,7 @@ use config::Config;
use cull_gmail::{ClientConfig, EolAction, GmailClient, MessageList, Result, RuleProcessor, Rules};
use std::{env, error::Error as stdError};
use analytics_cli::AnalyticsCli;
use init_cli::InitCli;
use labels_cli::LabelsCli;
use messages_cli::MessagesCli;
@@ -213,6 +215,13 @@ enum SubCmds {
/// environment variables for container deployments and CI/CD pipelines.
#[clap(name = "token", display_order = 4)]
Token(TokenCli),
/// Analyze mbox files for sender statistics.
///
/// Parse Google Takeout mbox exports to identify top senders by message count.
/// Efficient streaming for large files (60GB+) with minimal memory usage.
#[clap(name = "analytics", display_order = 5)]
Analytics(AnalyticsCli),
}
/// CLI application entry point with comprehensive error handling and logging setup.
@@ -298,6 +307,11 @@ async fn run(args: Cli) -> Result<()> {
return init_cli.run().await;
}
// Handle analytics command before loading config: it only reads local mbox files
if let Some(SubCmds::Analytics(analytics_cli)) = args.sub_command {
return analytics_cli.run();
}
// Handle `rules validate` before loading config: it needs no Gmail credentials.
if let Some(SubCmds::Rules(ref rules_cli)) = args.sub_command
&& let Some(result) = rules_cli.run_if_validate()
@@ -327,6 +341,10 @@ async fn run(args: Cli) -> Result<()> {
// This should never be reached due to early return above
unreachable!("Init command should have been handled earlier");
}
SubCmds::Analytics(_) => {
// This should never be reached due to early return above
unreachable!("Analytics command should have been handled earlier");
}
SubCmds::Message(messages_cli) => messages_cli.run(&mut client).await,
SubCmds::Labels(labels_cli) => labels_cli.run(client).await,
SubCmds::Rules(rules_cli) => {