#!/usr/bin/env python3
"""Index the contents of dropbox/ into a parquet+csv summary for research portal."""

from __future__ import annotations

import argparse
import re
from pathlib import Path
from typing import Iterable

import pandas as pd

# File suffixes (lower-case, with the leading dot) that are kept in the index.
# build_index compares each file's lower-cased suffix against this set.
DEFAULT_EXTENSIONS = {
    # Documents and plain text
    ".pdf", ".doc", ".docx", ".rtf", ".txt", ".md",
    # Slide decks
    ".ppt", ".pptx",
    # Spreadsheets and delimited tables
    ".xls", ".xlsx", ".ods", ".csv", ".tsv",
    # Structured data
    ".json",
}

# Four-digit year starting with 19 or 20; applied with fullmatch on whole tokens.
YEAR_PATTERN = re.compile(r"(19|20)\d{2}")


def detect_year(parts: Iterable[str]) -> str | None:
    """Return the first year-like token found in *parts*, or ``None``.

    Each string in *parts* is split on runs of whitespace, underscores,
    hyphens and dots. A token counts as a year only when the whole token
    matches ``YEAR_PATTERN``, so ``"FY2020"`` or ``"20201"`` do not qualify.
    Strings are scanned in order and the first matching token wins.
    """
    year_tokens = (
        piece
        for segment in parts
        for piece in re.split(r"[\s_\-\.]+", segment)
        if YEAR_PATTERN.fullmatch(piece)
    )
    # The generator is lazy, so scanning stops at the first match.
    return next(year_tokens, None)


def build_index(root: Path, exts: set[str]) -> pd.DataFrame:
    """Walk *root* recursively and describe each entry as one DataFrame row.

    Entries whose first path component (relative to *root*) starts with a
    dot are skipped. Files are kept only when their lower-cased suffix is in
    *exts*; non-file entries such as directories are always kept.

    Args:
        root: Directory to scan.
        exts: Lower-case suffixes (with leading dot) of the files to keep.

    Returns:
        A DataFrame with the columns ``relative_path``, ``depth``,
        ``is_file``, ``size``, ``top_level``, ``extension`` and ``year``.
        The columns are present even when no entry was indexed, so callers
        can sort or select by column name on an empty result.
    """
    # Fixed schema: without it an empty scan yields a frame with no columns
    # and any later lookup by column name raises KeyError.
    columns = [
        "relative_path",
        "depth",
        "is_file",
        "size",
        "top_level",
        "extension",
        "year",
    ]
    rows: list[dict[str, object]] = []
    for path in root.rglob("*"):
        rel = path.relative_to(root)
        # NOTE(review): only the top-level component is checked, so hidden
        # entries nested deeper in the tree are still indexed.
        if rel.parts and rel.parts[0].startswith("."):
            continue
        is_file = path.is_file()
        extension = path.suffix.lower() if is_file else ""
        if is_file and extension not in exts:
            continue
        year = detect_year(rel.parts)
        if year is None and is_file:
            # Fallback: retry on the file name without its final suffix.
            year = detect_year([path.stem])
        rows.append(
            {
                "relative_path": str(rel),
                "depth": len(rel.parts),
                "is_file": is_file,
                # stat() is only reached for files that passed the filter.
                "size": path.stat().st_size if is_file else None,
                "top_level": rel.parts[0] if rel.parts else "",
                "extension": extension,
                "year": year,
            }
        )
    return pd.DataFrame(rows, columns=columns)


def main() -> None:
    """Parse command-line options, index the root directory and write the result.

    Options:
        --root: directory to scan (default ``dropbox``).
        --output: Parquet destination (default ``docs/ep_projects_index.parquet``).
        --csv: CSV destination (default ``docs/ep_projects_index.csv``); an
            empty string skips the CSV copy.

    Parent directories of both output files are created when missing.

    Raises:
        SystemExit: if the root path does not exist or is not a directory.
    """
    parser = argparse.ArgumentParser(description="Index the dropbox directory")
    parser.add_argument("--root", default="dropbox", help="Path to dropbox root")
    parser.add_argument(
        "--output",
        default="docs/ep_projects_index.parquet",
        help="Output parquet file",
    )
    parser.add_argument(
        "--csv",
        default="docs/ep_projects_index.csv",
        help="Optional CSV mirror",
    )
    args = parser.parse_args()

    root = Path(args.root).resolve()
    if not root.exists():
        raise SystemExit(f"Root directory {root} does not exist")
    if not root.is_dir():
        # A regular file would otherwise be scanned as an empty tree.
        raise SystemExit(f"Root path {root} is not a directory")

    df = build_index(root, DEFAULT_EXTENSIONS)
    # An index with no rows may lack the sort columns entirely; sorting it
    # by name would raise KeyError, and there is nothing to order anyway.
    if not df.empty:
        df = df.sort_values(
            ["year", "top_level", "relative_path"], na_position="last"
        )

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_path, index=False)
    if args.csv:
        csv_path = Path(args.csv)
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(csv_path, index=False)

    print(f"Indexed {len(df)} items from {root}")


# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
