From 285a42a7a388643de8fdb760f1679206d8dbf5fc Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Mon, 16 Mar 2026 15:12:33 +0200 Subject: [PATCH] feat: add analytics subcommand for mbox sender analysis Adds a new `analytics` subcommand that analyzes Google Takeout mbox files to identify top senders by message count. Designed for efficient processing of large files (60GB+) with minimal memory usage. Features: - Streams files line-by-line with 1MB buffer (never loads entire file) - Extracts sender email addresses from From: headers - Counts messages per sender and displays top N (default 10) - Shows progress output every 10,000 messages - No Gmail API access needed Usage: cull-gmail analytics [-n TOP] Co-Authored-By: Claude Haiku 4.5 --- README.md | 70 ++++++++++++++++- crates/cull-gmail/src/cli/analytics_cli.rs | 89 ++++++++++++++++++++++ crates/cull-gmail/src/cli/main.rs | 18 +++++ 3 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 crates/cull-gmail/src/cli/analytics_cli.rs diff --git a/README.md b/README.md index b6ef4b0..55c3187 100644 --- a/README.md +++ b/README.md @@ -53,8 +53,9 @@ Get started with cull-gmail in minutes using the built-in setup command: - **Flexible configuration**: Support for file-based config, environment variables, and ephemeral tokens - **Safety first**: Dry-run mode by default, interactive confirmations, and timestamped backups - **Label management**: List and inspect Gmail labels for rule planning -- **Message operations**: Query, filter, and perform batch operations on Gmail messages +- **Message operations**: Query, filter, and perform batch operations on Gmail messages - **Rule-based automation**: Configure retention rules with time-based filtering and automated actions +- **Mbox analysis**: Analyze Google Takeout exports to identify top senders (efficient streaming, no API needed) - **Token portability**: Export/import OAuth2 tokens for containerized and CI/CD environments ### Running the optional Gmail integration 
test @@ -201,9 +202,12 @@ cull-gmail [OPTIONS] [COMMAND] ### Commands +- `init`: Initialize configuration and OAuth2 credentials - `labels`: List available Gmail labels - `messages`: Query and operate on messages - `rules`: Configure and run retention rules +- `analytics`: Analyze mbox files for sender statistics +- `token`: Export and import OAuth2 tokens ## Command Reference @@ -370,6 +374,70 @@ cull-gmail rules run --execute --skip-trash cull-gmail rules run --execute --skip-delete ``` +### Analytics Command + +Analyze Google Takeout mbox files to identify top senders by message count. + +**Note**: This command does NOT require Gmail API access. It efficiently streams local mbox files with minimal memory usage, making it suitable for analyzing large exports (60GB+). + +#### Syntax + +```bash +cull-gmail analytics [OPTIONS] <MBOX_FILE> +``` + +#### Arguments + +- `<MBOX_FILE>`: Path to mbox file to analyze (typically from Google Takeout) + +#### Options + +- `-n, --top <TOP>`: Number of top senders to display [default: 10] + +#### Examples + +**Show top 10 senders from a Google Takeout mbox**: +```bash +cull-gmail analytics ~/takeout/All\ mail\ Including\ Spam\ and\ Trash.mbox +``` + +**Show top 20 senders**: +```bash +cull-gmail analytics -n 20 ~/takeout/All\ mail.mbox +``` + +**Example Output**: +``` +[INFO] Scanned 1234567 messages total. +Top 10 senders: + 45678 newsletter@example.com + 23456 promotions@example.com + 18901 notifications@example.com + 12345 support@example.com + 9876 marketing@example.com + 8765 updates@example.com + 7654 alerts@example.com + 6543 digests@example.com + 5432 reports@example.com + 4321 announcements@example.com +``` + +#### Use Cases + +- Identify top email senders in your mailbox before configuring rules +- Analyze historical email patterns from a full account export +- Find unexpected high-volume senders for further investigation +- Plan email retention policies based on actual sender frequency + +#### Getting a Google Takeout mbox File + +1. 
Visit [Google Takeout](https://takeout.google.com) +2. Select "Gmail" and choose the desired email account +3. Select export format "Standard" (generates .mbox files) +4. Download the export (can be very large - multiple parts possible) +5. Extract/combine the mbox files if needed +6. Use `cull-gmail analytics` on the mbox file + ## Gmail Query Syntax The `-Q, --query` option supports Gmail's powerful search syntax: diff --git a/crates/cull-gmail/src/cli/analytics_cli.rs b/crates/cull-gmail/src/cli/analytics_cli.rs new file mode 100644 index 0000000..0afd32c --- /dev/null +++ b/crates/cull-gmail/src/cli/analytics_cli.rs @@ -0,0 +1,89 @@ +//! # Analytics CLI Module +//! +//! Analyze mbox files to extract sender statistics. +//! Efficiently processes large Google Takeout exports without loading files into memory. + +use clap::Parser; +use std::{collections::HashMap, fs::File, io::{BufRead, BufReader}, path::PathBuf}; +use cull_gmail::Result; + +/// Analyze an mbox file for sender statistics. +/// +/// Parses Google Takeout mbox files to count messages by sender. +/// Efficient memory usage: uses streaming line-by-line parsing even for 60GB+ files. +/// +/// # Examples +/// +/// ```bash +/// # Show top 10 senders (default) +/// cull-gmail analytics ~/takeout/All\ mail\ Including\ Spam\ and\ Trash.mbox +/// +/// # Show top 20 senders +/// cull-gmail analytics -n 20 ~/takeout/All\ mail.mbox +/// ``` +#[derive(Debug, Parser)] +pub struct AnalyticsCli { + /// Path to mbox file to analyze. + #[arg(value_name = "MBOX_FILE", help = "Path to mbox file to analyze")] + pub mbox_file: PathBuf, + + /// Number of top senders to display. + #[arg( + short = 'n', + long = "top", + default_value = "10", + help = "Number of top senders to display" + )] + pub top: usize, +} + +impl AnalyticsCli { + pub fn run(&self) -> Result<()> { + let file = File::open(&self.mbox_file)?; + // Use 1MB buffer for efficient sequential reads on large files (e.g. 
60GB takeout) + let reader = BufReader::with_capacity(1024 * 1024, file); + let mut counts: HashMap<String, usize> = HashMap::new(); + let mut current_from: Option<String> = None; + let mut in_headers = false; + let mut message_count: usize = 0; + + for line in reader.lines() { + let line = line?; + if line.starts_with("From ") { + if let Some(from) = current_from.take() { + *counts.entry(from).or_insert(0) += 1; + } + in_headers = true; + message_count += 1; + if message_count % 10_000 == 0 { + eprint!("\r[INFO] Scanned {} messages...", message_count); + } + } else if line.is_empty() { + in_headers = false; + } else if in_headers && line.to_lowercase().starts_with("from:") { + current_from = Some(extract_email(line[5..].trim())); + } + } + if let Some(from) = current_from { + *counts.entry(from).or_insert(0) += 1; + } + eprintln!("\r[INFO] Scanned {} messages total.", message_count); + + let mut sorted: Vec<_> = counts.iter().collect(); + sorted.sort_by(|a, b| b.1.cmp(a.1)); + println!("Top {} senders:", self.top.min(sorted.len())); + for (sender, count) in sorted.iter().take(self.top) { + println!(" {:6} {}", count, sender); + } + Ok(()) + } +} + +fn extract_email(from: &str) -> String { + if let Some(start) = from.rfind('<') { + if let Some(end) = from[start..].find('>') { + return from[start + 1..start + end].to_lowercase(); + } + } + from.to_lowercase() +} diff --git a/crates/cull-gmail/src/cli/main.rs b/crates/cull-gmail/src/cli/main.rs index 5d5f04e..e41d7a6 100644 --- a/crates/cull-gmail/src/cli/main.rs +++ b/crates/cull-gmail/src/cli/main.rs @@ -111,6 +111,7 @@ use clap::{Parser, Subcommand}; +mod analytics_cli; mod init_cli; mod labels_cli; mod messages_cli; @@ -121,6 +122,7 @@ use config::Config; use cull_gmail::{ClientConfig, EolAction, GmailClient, MessageList, Result, RuleProcessor, Rules}; use std::{env, error::Error as stdError}; +use analytics_cli::AnalyticsCli; use init_cli::InitCli; use labels_cli::LabelsCli; use messages_cli::MessagesCli; @@ -213,6 +215,13 @@ enum 
SubCmds { /// environment variables for container deployments and CI/CD pipelines. #[clap(name = "token", display_order = 4)] Token(TokenCli), + + /// Analyze mbox files for sender statistics. + /// + /// Parse Google Takeout mbox exports to identify top senders by message count. + /// Efficient streaming for large files (60GB+) with minimal memory usage. + #[clap(name = "analytics", display_order = 5)] + Analytics(AnalyticsCli), } /// CLI application entry point with comprehensive error handling and logging setup. @@ -298,6 +307,11 @@ async fn run(args: Cli) -> Result<()> { return init_cli.run().await; } + // Handle analytics command before loading config: it only reads local mbox files + if let Some(SubCmds::Analytics(analytics_cli)) = args.sub_command { + return analytics_cli.run(); + } + // Handle `rules validate` before loading config: it needs no Gmail credentials. if let Some(SubCmds::Rules(ref rules_cli)) = args.sub_command && let Some(result) = rules_cli.run_if_validate() @@ -327,6 +341,10 @@ async fn run(args: Cli) -> Result<()> { // This should never be reached due to early return above unreachable!("Init command should have been handled earlier"); } + SubCmds::Analytics(_) => { + // This should never be reached due to early return above + unreachable!("Analytics command should have been handled earlier"); + } SubCmds::Message(messages_cli) => messages_cli.run(&mut client).await, SubCmds::Labels(labels_cli) => labels_cli.run(client).await, SubCmds::Rules(rules_cli) => {