Initial commit: add all skills files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 16:52:49 +08:00
commit 6487becf60
396 changed files with 108871 additions and 0 deletions

@@ -0,0 +1,362 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
"""
xlsx_reader.py — Structure discovery and data analysis tool for Excel/CSV files.
Usage:
python3 xlsx_reader.py <file> # full structure report
python3 xlsx_reader.py <file> --sheet Sales # analyze one sheet
python3 xlsx_reader.py <file> --json # machine-readable output
python3 xlsx_reader.py <file> --quality # data quality audit only
Supports: .xlsx, .xlsm, .csv, .tsv
Does NOT modify the source file in any way.
Exit codes:
0 — success
1 — file not found / unsupported format / encoding failure
"""
import sys
import json
import argparse
from pathlib import Path
# ---------------------------------------------------------------------------
# Format detection and loading
# ---------------------------------------------------------------------------
def detect_and_load(file_path: str, sheet_name_filter: str | None = None) -> dict:
"""
Load file into {sheet_name: DataFrame} dict.
CSV/TSV files are mapped to a single-key dict using the file stem as key.
Raises ValueError for unsupported formats or encoding failures.
"""
try:
import pandas as pd
except ImportError:
raise RuntimeError(
"pandas is not installed. Run: pip install pandas openpyxl"
)
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
suffix = path.suffix.lower()
if suffix in (".xlsx", ".xlsm"):
target = sheet_name_filter if sheet_name_filter else None
result = pd.read_excel(file_path, sheet_name=target)
# pd.read_excel with sheet_name=None returns dict; with a name, returns DataFrame
if isinstance(result, dict):
return result
else:
return {sheet_name_filter: result}
elif suffix in (".csv", ".tsv"):
sep = "\t" if suffix == ".tsv" else ","
encodings = ["utf-8-sig", "gbk", "utf-8", "latin-1"]
last_error = None
        for enc in encodings:
            try:
                df = pd.read_csv(file_path, sep=sep, encoding=enc)
                df.attrs["reader_encoding"] = enc  # record the working encoding for reporting
                return {path.stem: df}
            except (UnicodeDecodeError, ValueError) as e:
                # UnicodeDecodeError covers bad encodings; pandas ParserError
                # (a ValueError subclass) covers malformed rows. Anything else
                # should propagate rather than be silently swallowed.
                last_error = e
                continue
raise ValueError(
f"Cannot decode {file_path}. Tried encodings: {encodings}. "
f"Last error: {last_error}"
)
elif suffix == ".xls":
raise ValueError(
".xls is a legacy binary format not supported by this tool. "
"Please open the file in Excel and save as .xlsx, then retry."
)
else:
raise ValueError(
f"Unsupported file format: {suffix}. "
"Supported formats: .xlsx, .xlsm, .csv, .tsv"
)
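# Example (illustrative, commented out so the module stays import-safe): calling
# detect_and_load from other code. The "sales.xlsx" path and "Sales" sheet name
# below are hypothetical, not part of this repo.
#
#     sheets = detect_and_load("sales.xlsx")          # {"Sheet1": df, ...}
#     one = detect_and_load("sales.xlsx", "Sales")    # {"Sales": df}
#     for name, df in one.items():
#         print(name, df.shape)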
# ---------------------------------------------------------------------------
# Structure discovery
# ---------------------------------------------------------------------------
def explore_structure(sheets: dict) -> dict:
"""
Return a structured dict describing each sheet.
Keys: sheet_name -> {shape, columns, dtypes, null_counts, preview}
"""
result = {}
for sheet_name, df in sheets.items():
null_counts = df.isnull().sum()
null_info = {
col: {"count": int(cnt), "pct": round(cnt / max(len(df), 1) * 100, 1)}
for col, cnt in null_counts.items()
if cnt > 0
}
result[sheet_name] = {
"shape": {"rows": df.shape[0], "cols": df.shape[1]},
"columns": list(df.columns),
"dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
"null_columns": null_info,
"preview": df.head(5).to_dict(orient="records"),
}
return result
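# Example (illustrative): the dict returned by explore_structure for a
# hypothetical 100-row "Sales" sheet with one partially null column.
#
#     {"Sales": {
#         "shape": {"rows": 100, "cols": 3},
#         "columns": ["Region", "Year", "Revenue"],
#         "dtypes": {"Region": "object", "Year": "int64", "Revenue": "float64"},
#         "null_columns": {"Revenue": {"count": 4, "pct": 4.0}},
#         "preview": [{"Region": "North", "Year": 2024, "Revenue": 1250.0}, ...],
#     }}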
# ---------------------------------------------------------------------------
# Data quality audit
# ---------------------------------------------------------------------------
def audit_quality(sheets: dict) -> dict:
"""
Return data quality findings per sheet.
Checks: nulls, duplicates, mixed-type columns, potential year formatting issues.
"""
import pandas as pd
findings = {}
for sheet_name, df in sheets.items():
sheet_findings = []
# Null values
null_counts = df.isnull().sum()
for col, cnt in null_counts.items():
if cnt > 0:
pct = round(cnt / max(len(df), 1) * 100, 1)
sheet_findings.append({
"type": "null_values",
"column": col,
"count": int(cnt),
"pct": pct,
"note": f"Column '{col}' has {cnt} null values ({pct}%). "
"If this column contains Excel formulas, null values may "
"indicate that the formula cache has not been populated "
"(file was never opened in Excel after the formulas were written)."
})
# Duplicate rows
dup_count = int(df.duplicated().sum())
if dup_count > 0:
sheet_findings.append({
"type": "duplicate_rows",
"count": dup_count,
"note": f"{dup_count} fully duplicate rows found."
})
# Mixed-type object columns (numeric data stored as text)
for col in df.select_dtypes(include="object").columns:
numeric_converted = pd.to_numeric(df[col], errors="coerce")
convertible = int(numeric_converted.notna().sum())
non_null_total = int(df[col].notna().sum())
if 0 < convertible < non_null_total:
sheet_findings.append({
"type": "mixed_type",
"column": col,
"convertible_to_numeric": convertible,
"non_convertible": non_null_total - convertible,
"note": f"Column '{col}' appears to contain mixed types: "
f"{convertible} values can be parsed as numbers, "
f"{non_null_total - convertible} cannot. "
"Use pd.to_numeric(df[col], errors='coerce') to unify."
})
# Year column formatting (e.g., 2024.0 stored as float)
for col in df.select_dtypes(include="number").columns:
col_lower = str(col).lower()
# "年" is the Chinese character for "year" — detect year columns in CJK spreadsheets
if "year" in col_lower or "yr" in col_lower or "" in col_lower:
                # an all-null column would pass .all() vacuously, so require data first
                non_null = df[col].dropna()
                if len(non_null) > 0 and non_null.between(1900, 2200).all():
                    if df[col].dtype == float:
                        sheet_findings.append({
                            "type": "year_as_float",
                            "column": col,
                            "note": f"Column '{col}' appears to be a year column stored as float "
                                    "(e.g., 2024.0). Convert with df[col].astype('Int64').astype(str) "
                                    "for clean year strings like '2024' (nullable 'Int64' tolerates NaN)."
                        })
# Outliers via IQR on numeric columns
for col in df.select_dtypes(include="number").columns:
series = df[col].dropna()
if len(series) < 4:
continue
Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
IQR = Q3 - Q1
if IQR == 0:
continue
outlier_mask = (df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)
outlier_count = int(outlier_mask.sum())
if outlier_count > 0:
sheet_findings.append({
"type": "outliers_iqr",
"column": col,
"count": outlier_count,
"note": f"Column '{col}' has {outlier_count} potential outlier(s) "
f"(outside 1.5×IQR bounds: [{Q1 - 1.5*IQR:.2f}, {Q3 + 1.5*IQR:.2f}])."
})
findings[sheet_name] = sheet_findings
return findings
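# Worked IQR example (illustrative numbers): for a column holding
# [1, 2, 3, 4, 100], pandas' default linear interpolation gives Q1 = 2 and
# Q3 = 4, so IQR = 2 and the bounds are [2 - 3, 4 + 3] = [-1, 7]; only 100
# falls outside and would be reported as a single potential outlier.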
# ---------------------------------------------------------------------------
# Summary statistics
# ---------------------------------------------------------------------------
def compute_stats(sheets: dict) -> dict:
"""Compute descriptive statistics for numeric columns per sheet."""
stats = {}
for sheet_name, df in sheets.items():
numeric_df = df.select_dtypes(include="number")
if numeric_df.empty:
stats[sheet_name] = {}
continue
desc = numeric_df.describe().round(4)
stats[sheet_name] = desc.to_dict()
return stats
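# Example (illustrative): for a sheet with one numeric column, stats would look
# like {"Sales": {"Revenue": {"count": 100.0, "mean": 1250.5, ..., "max": 9800.0}}},
# i.e. the column-keyed dict form of DataFrame.describe().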
# ---------------------------------------------------------------------------
# Human-readable report rendering
# ---------------------------------------------------------------------------
def render_report(
file_path: str,
structure: dict,
quality: dict,
stats: dict,
) -> str:
lines = []
p = lines.append
p("=" * 60)
p(f"ANALYSIS REPORT: {Path(file_path).name}")
p("=" * 60)
# File overview
sheet_list = list(structure.keys())
total_rows = sum(s["shape"]["rows"] for s in structure.values())
p(f"\nSheets ({len(sheet_list)}): {', '.join(sheet_list)}")
p(f"Total rows across all sheets: {total_rows:,}")
for sheet_name, info in structure.items():
p(f"\n{'' * 50}")
p(f"Sheet: {sheet_name}")
p(f"{'' * 50}")
p(f" Size: {info['shape']['rows']:,} rows × {info['shape']['cols']} cols")
p(f" Columns: {info['columns']}")
# Data types
p("\n Column types:")
for col, dtype in info["dtypes"].items():
p(f" {col}: {dtype}")
# Nulls
if info["null_columns"]:
p("\n Null values (columns with nulls only):")
for col, null_info in info["null_columns"].items():
p(f" {col}: {null_info['count']} nulls ({null_info['pct']}%)")
else:
p("\n Null values: none")
# Stats
sheet_stats = stats.get(sheet_name, {})
if sheet_stats:
p("\n Numeric column statistics:")
numeric_cols = list(sheet_stats.keys())
# Show only first 6 to keep report readable
for col in numeric_cols[:6]:
col_stats = sheet_stats[col]
p(f" {col}:")
p(f" count={col_stats.get('count', 'N/A')} "
f"mean={col_stats.get('mean', 'N/A')} "
f"min={col_stats.get('min', 'N/A')} "
f"max={col_stats.get('max', 'N/A')}")
if len(numeric_cols) > 6:
p(f" ... and {len(numeric_cols) - 6} more numeric columns")
# Quality findings for this sheet
sheet_quality = quality.get(sheet_name, [])
if sheet_quality:
p(f"\n Data quality issues ({len(sheet_quality)} found):")
for finding in sheet_quality:
p(f" [{finding['type'].upper()}] {finding['note']}")
else:
p("\n Data quality: no issues found")
# Preview
if info["preview"]:
p("\n Preview (first 3 rows):")
import pandas as pd
preview_df = pd.DataFrame(info["preview"][:3])
for line in preview_df.to_string(index=False).splitlines():
p(f" {line}")
p("\n" + "=" * 60)
quality_issue_count = sum(len(v) for v in quality.values())
if quality_issue_count == 0:
p("RESULT: No data quality issues detected.")
else:
p(f"RESULT: {quality_issue_count} data quality issue(s) found. See details above.")
p("=" * 60)
return "\n".join(lines)
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main() -> None:
parser = argparse.ArgumentParser(
description="Read and analyze Excel/CSV files without modifying them."
)
parser.add_argument("file", help="Path to .xlsx, .xlsm, .csv, or .tsv file")
parser.add_argument("--sheet", help="Analyze a specific sheet only", default=None)
parser.add_argument(
"--json", action="store_true", help="Output machine-readable JSON"
)
parser.add_argument(
"--quality", action="store_true",
help="Run data quality audit only (skip stats)"
)
args = parser.parse_args()
try:
sheets = detect_and_load(args.file, sheet_name_filter=args.sheet)
except (FileNotFoundError, ValueError, RuntimeError) as e:
print(f"ERROR: {e}", file=sys.stderr)
sys.exit(1)
structure = explore_structure(sheets)
quality = audit_quality(sheets)
stats = {} if args.quality else compute_stats(sheets)
if args.json:
output = {
"file": args.file,
"structure": structure,
"quality": quality,
"stats": stats,
}
# Convert preview records to serializable form (handle non-JSON types)
print(json.dumps(output, indent=2, ensure_ascii=False, default=str))
else:
report = render_report(args.file, structure, quality, stats)
print(report)
if __name__ == "__main__":
main()