#!/usr/bin/env python3
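"""Index files under data/raw into the meta.file_index Postgres table.

Walks each dataset directory under the root, collects per-file metadata
(relative path, size, guessed year/month, institution, kind, mtime) and
upserts the batch keyed on rel_path. A sketch invocation, assuming the script
lives under scripts/ (the filename here is illustrative):

    export DATABASE_URL=postgresql://user:pass@localhost:5432/app
    python scripts/index_downloads.py --skip leychile/xml --limit 100000
"""
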
import argparse
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

import psycopg

ROOT = Path(__file__).resolve().parent.parent
FILES_ROOT = (ROOT / "data" / "raw").resolve()


def norm_dsn(dsn: str) -> str:
    """Convert a SQLAlchemy-style DSN into a plain libpq DSN psycopg accepts.

    Strips any "+driver" suffix, not just asyncpg, e.g.
    norm_dsn("postgresql+asyncpg://u:p@h/db") -> "postgresql://u:p@h/db".
    """
    return re.sub(r"^postgresql\+\w+://", "postgresql://", dsn)


def guess_year_month(name: str) -> tuple[int | None, int | None]:
    # try YYYYMM or YYYY_MM etc.
    m = re.search(r"(20\d{2})[_-]?(0[1-9]|1[0-2])", name)
    if m:
        return int(m.group(1)), int(m.group(2))
    m = re.search(r"(20\d{2})", name)
    return (int(m.group(1)), None) if m else (None, None)


def infer_institution(path: Path, dataset: str) -> str | None:
    # The caller lowercases dataset names, so match path components
    # case-insensitively; the component right after the dataset directory is
    # taken as the institution, unless the file sits in the dataset dir itself.
    parts = path.parts
    try:
        i = [part.lower() for part in parts].index(dataset.lower())
    except ValueError:
        return None
    if i + 1 < len(parts) and parts[i + 1] != path.name:
        return parts[i + 1]
    return None


def main():
    ap = argparse.ArgumentParser(description="Index files under data/raw into meta.file_index")
    ap.add_argument("--root", default=str(FILES_ROOT), help="Root directory (default: data/raw)")
    ap.add_argument(
        "--skip",
        default=os.getenv("DOWNLOADS_INDEX_SKIP_DIRS", "leychile/xml"),
        help="Comma-separated subpaths to skip (relative to root)",
    )
    ap.add_argument("--limit", type=int, default=int(os.getenv("INDEX_MAX_FILES", "500000")), help="Max files to index")
    ap.add_argument("--timeout", type=float, default=float(os.getenv("INDEX_TIMEOUT", "600")), help="Time budget seconds")
    ap.add_argument("--dsn", default=norm_dsn(os.getenv("DATABASE_URL", "")), help="Postgres DSN (sync psycopg)")
    args = ap.parse_args()

    root = Path(args.root).resolve()
    if not root.exists():
        print(f"Root not found: {root}", file=sys.stderr)
        return 1
    # Normalize DSN even if provided via --dsn (accept SQLAlchemy-style "+driver" DSNs)
    dsn = norm_dsn(args.dsn)
    if not dsn:
        print("Missing DSN (DATABASE_URL)", file=sys.stderr)
        return 2

    skip_abs: set[Path] = set()
    for p in args.skip.split(",") if args.skip else []:
        if p.strip():
            skip_abs.add((root / p.strip()).resolve())

    t0 = time.time()
    rows: list[tuple] = []
    scanned = 0

    def over_budget() -> bool:
        # Stop scanning once the file-count limit or the time budget is exceeded.
        return scanned >= args.limit or (time.time() - t0) > args.timeout

    for ds_dir in root.iterdir():
        if not ds_dir.is_dir():
            continue
        dataset = ds_dir.name.lower()
        for r, dirs, files in os.walk(ds_dir, topdown=True):
            rp = Path(r).resolve()
            # Compare path components, not string prefixes: startswith() would
            # also skip siblings such as "leychile/xml_backup" when the skip
            # list names "leychile/xml".
            if any(rp.is_relative_to(s) for s in skip_abs):
                dirs[:] = []
                continue
            # Prune hidden directories in place so os.walk never descends into them.
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            for name in files:
                if name.startswith("."):
                    continue
                p = Path(r) / name
                if not p.is_file():
                    continue
                try:
                    st = p.stat()  # single stat() supplies both size and mtime
                except OSError:
                    continue  # file vanished between listing and stat
                resolved = p.resolve()
                try:
                    rel = str(resolved.relative_to(root))
                except ValueError:
                    continue  # symlink target escaped the root
                ext = (p.suffix.lstrip(".") or "file").lower()
                y, m = guess_year_month(p.name)
                inst = infer_institution(resolved, dataset)
                kind = "manual" if re.search(r"manual", p.name, re.I) else ext
                try:
                    mt = datetime.fromtimestamp(st.st_mtime, tz=timezone.utc)
                except (OverflowError, OSError, ValueError):
                    mt = None  # pathological mtime (e.g. out of range)
                rows.append((rel, dataset, name, ext, st.st_size, y, m, inst, kind, mt))
                scanned += 1
                if over_budget():
                    break
            if over_budget():
                break
        if over_budget():
            break

    if not rows:
        print("No files indexed (check root/skip)")
        return 0

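    # The DDL file is expected to create the target table. An assumed minimal
    # shape, inferred from the INSERT below (rel_path needs a UNIQUE/PK
    # constraint for ON CONFLICT, and added_at must exist):
    #   CREATE SCHEMA IF NOT EXISTS meta;
    #   CREATE TABLE IF NOT EXISTS meta.file_index (
    #       rel_path text PRIMARY KEY, dataset text, name text, ext text,
    #       size bigint, year int, month int, institution text, kind text,
    #       mtime timestamptz, added_at timestamptz NOT NULL DEFAULT now());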
    ddl = (ROOT / "sql" / "meta_file_index.sql").read_text()
    with psycopg.connect(dsn) as con:
        with con.cursor() as cur:
            cur.execute(ddl)
            # Upsert batch without PREPARE to avoid param type inference issues
            sql = (
                "INSERT INTO meta.file_index(rel_path,dataset,name,ext,size,year,month,institution,kind,mtime) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) "
                "ON CONFLICT (rel_path) DO UPDATE SET "
                "dataset=EXCLUDED.dataset, name=EXCLUDED.name, ext=EXCLUDED.ext, size=EXCLUDED.size, "
                "year=EXCLUDED.year, month=EXCLUDED.month, institution=EXCLUDED.institution, "
                "kind=EXCLUDED.kind, mtime=EXCLUDED.mtime, added_at=now()"
            )
            cur.executemany(sql, rows)
        con.commit()

    print(f"Indexed {len(rows)} files (scanned={scanned}) in {time.time()-t0:.1f}s")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
