#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
"""
xlsx_reader.py — Structure discovery and data analysis tool for Excel/CSV files.

Usage:
    python3 xlsx_reader.py <file>                 # full structure report
    python3 xlsx_reader.py <file> --sheet Sales   # analyze one sheet
    python3 xlsx_reader.py <file> --json          # machine-readable output
    python3 xlsx_reader.py <file> --quality       # data quality audit only

Supports: .xlsx, .xlsm, .csv, .tsv
Does NOT modify the source file in any way.

Exit codes:
    0 — success
    1 — file not found / unsupported format / encoding failure / missing dependency
"""

import sys
import json
import argparse
from pathlib import Path

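# Note: pandas is imported lazily inside the functions below, so argument parsing
# and --help keep working even when pandas/openpyxl are not installed.
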
# ---------------------------------------------------------------------------
# Format detection and loading
# ---------------------------------------------------------------------------

def detect_and_load(file_path: str, sheet_name_filter: str | None = None) -> dict:
    """
    Load file into {sheet_name: DataFrame} dict.
    CSV/TSV files are mapped to a single-key dict using the file stem as key.

    Raises ValueError for unsupported formats or encoding failures.
    """
    try:
        import pandas as pd
    except ImportError:
        raise RuntimeError(
            "pandas is not installed. Run: pip install pandas openpyxl"
        )

    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    suffix = path.suffix.lower()

    if suffix in (".xlsx", ".xlsm"):
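        # pandas reads .xlsx/.xlsm through the openpyxl engine and returns the
        # cached cell values, so formula cells show their last-saved results
        # (or NaN if the formula cache was never populated).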
        target = sheet_name_filter if sheet_name_filter else None
        result = pd.read_excel(file_path, sheet_name=target)
        # pd.read_excel with sheet_name=None returns dict; with a name, returns DataFrame
        if isinstance(result, dict):
            return result
        else:
            return {sheet_name_filter: result}

    elif suffix in (".csv", ".tsv"):
        sep = "\t" if suffix == ".tsv" else ","
        # Try BOM-aware UTF-8 first, then GBK (common in Chinese-locale Excel
        # exports), plain UTF-8, and latin-1 as a never-failing last resort.
        encodings = ["utf-8-sig", "gbk", "utf-8", "latin-1"]
        last_error = None
        for enc in encodings:
            try:
                df = pd.read_csv(file_path, sep=sep, encoding=enc)
                df.attrs["reader_encoding"] = enc  # record which encoding succeeded (for reporting)
                return {path.stem: df}
            except Exception as e:  # any read failure: remember it and try the next encoding
                last_error = e
                continue
        raise ValueError(
            f"Cannot decode {file_path}. Tried encodings: {encodings}. "
            f"Last error: {last_error}"
        )

    elif suffix == ".xls":
        raise ValueError(
            ".xls is a legacy binary format not supported by this tool. "
            "Please open the file in Excel and save as .xlsx, then retry."
        )

    else:
        raise ValueError(
            f"Unsupported file format: {suffix}. "
            "Supported formats: .xlsx, .xlsm, .csv, .tsv"
        )

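# A sketch of the expected return shapes (hypothetical files): an .xlsx workbook
# with sheets "Sales" and "Summary" yields {"Sales": <DataFrame>, "Summary": <DataFrame>},
# while "orders.csv" yields {"orders": <DataFrame>}, keyed by the file stem.
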
# ---------------------------------------------------------------------------
# Structure discovery
# ---------------------------------------------------------------------------

def explore_structure(sheets: dict) -> dict:
    """
    Return a structured dict describing each sheet.
    Keys: sheet_name -> {shape, columns, dtypes, null_columns, preview}
    """
    result = {}
    for sheet_name, df in sheets.items():
        null_counts = df.isnull().sum()
        null_info = {
            col: {"count": int(cnt), "pct": round(cnt / max(len(df), 1) * 100, 1)}
            for col, cnt in null_counts.items()
            if cnt > 0
        }
        result[sheet_name] = {
            "shape": {"rows": df.shape[0], "cols": df.shape[1]},
            "columns": list(df.columns),
            "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
            "null_columns": null_info,
            "preview": df.head(5).to_dict(orient="records"),
        }
    return result

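# Example of one entry (hypothetical sheet): explore_structure(sheets)["Sales"] might be
# {"shape": {"rows": 120, "cols": 5}, "columns": ["Region", "Amount", ...],
#  "dtypes": {"Amount": "float64", ...},
#  "null_columns": {"Amount": {"count": 3, "pct": 2.5}}, "preview": [{...}, ...]}.
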
# ---------------------------------------------------------------------------
# Data quality audit
# ---------------------------------------------------------------------------

def audit_quality(sheets: dict) -> dict:
    """
    Return data quality findings per sheet.
    Checks: nulls, duplicates, mixed-type columns, potential year formatting
    issues, and IQR-based outliers.
    """
    import pandas as pd

    findings = {}
    for sheet_name, df in sheets.items():
        sheet_findings = []

        # Null values
        null_counts = df.isnull().sum()
        for col, cnt in null_counts.items():
            if cnt > 0:
                pct = round(cnt / max(len(df), 1) * 100, 1)
                sheet_findings.append({
                    "type": "null_values",
                    "column": col,
                    "count": int(cnt),
                    "pct": pct,
                    "note": f"Column '{col}' has {cnt} null values ({pct}%). "
                            "If this column contains Excel formulas, null values may "
                            "indicate that the formula cache has not been populated "
                            "(file was never opened in Excel after the formulas were written)."
                })

        # Duplicate rows
        dup_count = int(df.duplicated().sum())
        if dup_count > 0:
            sheet_findings.append({
                "type": "duplicate_rows",
                "count": dup_count,
                "note": f"{dup_count} fully duplicate rows found."
            })

        # Mixed-type object columns (numeric data stored as text)
        for col in df.select_dtypes(include="object").columns:
            numeric_converted = pd.to_numeric(df[col], errors="coerce")
            convertible = int(numeric_converted.notna().sum())
            non_null_total = int(df[col].notna().sum())
            if 0 < convertible < non_null_total:
                sheet_findings.append({
                    "type": "mixed_type",
                    "column": col,
                    "convertible_to_numeric": convertible,
                    "non_convertible": non_null_total - convertible,
                    "note": f"Column '{col}' appears to contain mixed types: "
                            f"{convertible} values can be parsed as numbers, "
                            f"{non_null_total - convertible} cannot. "
                            "Use pd.to_numeric(df[col], errors='coerce') to unify."
                })

        # Year column formatting (e.g., 2024.0 stored as float)
        for col in df.select_dtypes(include="number").columns:
            col_lower = str(col).lower()
            # "年" is the Chinese character for "year" — detect year columns in CJK spreadsheets
            if "year" in col_lower or "yr" in col_lower or "年" in col_lower:
                values = df[col].dropna()
                if not values.empty and values.between(1900, 2200).all():
                    if df[col].dtype == float:
                        sheet_findings.append({
                            "type": "year_as_float",
                            "column": col,
                            "note": f"Column '{col}' appears to be a year column stored as float "
                                    "(e.g., 2024.0). Convert with df[col].astype(int).astype(str) "
                                    "to get clean year strings like '2024'."
                        })

        # Outliers via IQR on numeric columns
        for col in df.select_dtypes(include="number").columns:
            series = df[col].dropna()
            if len(series) < 4:
                continue
            Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
            IQR = Q3 - Q1
            if IQR == 0:
                continue
            outlier_mask = (df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)
            outlier_count = int(outlier_mask.sum())
            if outlier_count > 0:
                sheet_findings.append({
                    "type": "outliers_iqr",
                    "column": col,
                    "count": outlier_count,
                    "note": f"Column '{col}' has {outlier_count} potential outlier(s) "
                            f"(outside 1.5×IQR bounds: [{Q1 - 1.5*IQR:.2f}, {Q3 + 1.5*IQR:.2f}])."
                })

        findings[sheet_name] = sheet_findings

    return findings

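# Worked example of the IQR fence used above (illustrative numbers): with
# Q1 = 10 and Q3 = 30, IQR = 20, so values below 10 - 1.5*20 = -20 or above
# 30 + 1.5*20 = 60 are flagged as potential outliers.
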
# ---------------------------------------------------------------------------
# Summary statistics
# ---------------------------------------------------------------------------

def compute_stats(sheets: dict) -> dict:
    """Compute descriptive statistics for numeric columns per sheet."""
    stats = {}
    for sheet_name, df in sheets.items():
        numeric_df = df.select_dtypes(include="number")
        if numeric_df.empty:
            stats[sheet_name] = {}
            continue
        desc = numeric_df.describe().round(4)
        stats[sheet_name] = desc.to_dict()
    return stats

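# Shape of the result (hypothetical column "Amount"): stats["Sales"]["Amount"] is a
# dict of describe() rows, e.g. {"count": 120.0, "mean": 52.3, "std": ..., "min": ...,
# "25%": ..., "50%": ..., "75%": ..., "max": ...}.
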
# ---------------------------------------------------------------------------
# Human-readable report rendering
# ---------------------------------------------------------------------------

def render_report(
    file_path: str,
    structure: dict,
    quality: dict,
    stats: dict,
) -> str:
    lines = []
    p = lines.append

    p("=" * 60)
    p(f"ANALYSIS REPORT: {Path(file_path).name}")
    p("=" * 60)

    # File overview
    sheet_list = list(structure.keys())
    total_rows = sum(s["shape"]["rows"] for s in structure.values())
    p(f"\nSheets ({len(sheet_list)}): {', '.join(sheet_list)}")
    p(f"Total rows across all sheets: {total_rows:,}")

    for sheet_name, info in structure.items():
        p(f"\n{'─' * 50}")
        p(f"Sheet: {sheet_name}")
        p(f"{'─' * 50}")
        p(f"  Size: {info['shape']['rows']:,} rows × {info['shape']['cols']} cols")
        p(f"  Columns: {info['columns']}")

        # Data types
        p("\n  Column types:")
        for col, dtype in info["dtypes"].items():
            p(f"    {col}: {dtype}")

        # Nulls
        if info["null_columns"]:
            p("\n  Null values (columns with nulls only):")
            for col, null_info in info["null_columns"].items():
                p(f"    {col}: {null_info['count']} nulls ({null_info['pct']}%)")
        else:
            p("\n  Null values: none")

        # Stats
        sheet_stats = stats.get(sheet_name, {})
        if sheet_stats:
            p("\n  Numeric column statistics:")
            numeric_cols = list(sheet_stats.keys())
            # Show only first 6 to keep report readable
            for col in numeric_cols[:6]:
                col_stats = sheet_stats[col]
                p(f"    {col}:")
                p(f"      count={col_stats.get('count', 'N/A')} "
                  f"mean={col_stats.get('mean', 'N/A')} "
                  f"min={col_stats.get('min', 'N/A')} "
                  f"max={col_stats.get('max', 'N/A')}")
            if len(numeric_cols) > 6:
                p(f"    ... and {len(numeric_cols) - 6} more numeric columns")

        # Quality findings for this sheet
        sheet_quality = quality.get(sheet_name, [])
        if sheet_quality:
            p(f"\n  Data quality issues ({len(sheet_quality)} found):")
            for finding in sheet_quality:
                p(f"    [{finding['type'].upper()}] {finding['note']}")
        else:
            p("\n  Data quality: no issues found")

        # Preview
        if info["preview"]:
            p("\n  Preview (first 3 rows):")
            import pandas as pd
            preview_df = pd.DataFrame(info["preview"][:3])
            for line in preview_df.to_string(index=False).splitlines():
                p(f"    {line}")

    p("\n" + "=" * 60)
    quality_issue_count = sum(len(v) for v in quality.values())
    if quality_issue_count == 0:
        p("RESULT: No data quality issues detected.")
    else:
        p(f"RESULT: {quality_issue_count} data quality issue(s) found. See details above.")
    p("=" * 60)

    return "\n".join(lines)

# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------

def main() -> None:
    parser = argparse.ArgumentParser(
        description="Read and analyze Excel/CSV files without modifying them."
    )
    parser.add_argument("file", help="Path to .xlsx, .xlsm, .csv, or .tsv file")
    parser.add_argument("--sheet", help="Analyze a specific sheet only", default=None)
    parser.add_argument(
        "--json", action="store_true", help="Output machine-readable JSON"
    )
    parser.add_argument(
        "--quality", action="store_true",
        help="Run data quality audit only (skip stats)"
    )
    args = parser.parse_args()

    try:
        sheets = detect_and_load(args.file, sheet_name_filter=args.sheet)
    except (FileNotFoundError, ValueError, RuntimeError) as e:
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)

    structure = explore_structure(sheets)
    quality = audit_quality(sheets)
    stats = {} if args.quality else compute_stats(sheets)

    if args.json:
        output = {
            "file": args.file,
            "structure": structure,
            "quality": quality,
            "stats": stats,
        }
        # Convert preview records to serializable form (handle non-JSON types)
        print(json.dumps(output, indent=2, ensure_ascii=False, default=str))
    else:
        report = render_report(args.file, structure, quality, stats)
        print(report)


if __name__ == "__main__":
    main()