#!/usr/bin/env python3
"""Validate the anti-amyloid source ingest catalog."""

from __future__ import annotations

import argparse
import json
import re
import sys
import tomllib
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


# Identifier patterns. They are anchored with ``\Z`` rather than ``$`` because
# ``$`` also matches just before a trailing newline, which would let a value
# such as "NCT01234567\n" pass validation.
NCT_RE = re.compile(r"^NCT[0-9]{8}\Z")  # "NCT" followed by exactly eight digits
HTTPS_RE = re.compile(r"^https://")  # prefix check only; the rest is not inspected
ARTIFACT_RE = re.compile(r"^va_[a-f0-9]{16}\Z")  # "va_" + 16 lowercase hex chars
FINDING_RE = re.compile(r"^vf_[a-f0-9]{16}\Z")  # "vf_" + 16 lowercase hex chars

# Minimum number of entries the catalog must contain overall.
# NOTE(review): the per-category minimums below sum to 43, so for catalogs
# where every entry has a category they are the binding constraint.
MIN_TOTAL_ENTRIES = 36
# Minimum number of entries required per ``category`` value.
MIN_COUNTS = {
    "clinical_trial_registry": 20,
    "regulatory": 8,
    "dataset_or_registry": 8,
    "code_or_tool": 3,
    "literature_or_table": 4,
}
# Minimum number of entries required per ``priority`` value (P2 has no floor).
MIN_PRIORITIES = {
    "P0": 12,
    "P1": 12,
}


def parse_instant(value: str) -> datetime:
    """Parse an ISO-8601 timestamp and return it as an aware UTC datetime.

    Every "Z" in *value* is rewritten to "+00:00" before parsing. A timestamp
    that carries no offset is interpreted as UTC; one with an offset is
    converted to UTC.

    Raises:
        ValueError: if ``datetime.fromisoformat`` rejects the text.
    """
    moment = datetime.fromisoformat(value.replace("Z", "+00:00"))
    # Naive values are pinned to UTC rather than the local zone.
    aware = moment if moment.tzinfo is not None else moment.replace(tzinfo=timezone.utc)
    return aware.astimezone(timezone.utc)


def load_frontier_id(frontier: Path, issues: list[dict[str, str]]) -> str:
    config_path = frontier / ".vela" / "config.toml"
    if not config_path.is_file():
        issues.append({"id": "frontier", "field": "config", "message": "missing .vela/config.toml"})
        return ""
    with config_path.open("rb") as handle:
        config = tomllib.load(handle)
    frontier_id = config.get("project", {}).get("frontier_id", "")
    if not isinstance(frontier_id, str):
        return ""
    return frontier_id


def validate_locator(entry_id: str, locator: Any, issues: list[dict[str, str]]) -> None:
    """Check that *locator* is a non-blank string in an accepted form.

    Accepted forms are text matching ``HTTPS_RE``, text starting with
    ``doi:``, or text matching ``NCT_RE``. At most one issue is appended to
    *issues* (mutated in place), tagged with *entry_id*.
    """

    def report(message: str) -> None:
        issues.append({"id": entry_id, "field": "locator", "message": message})

    if not (isinstance(locator, str) and locator.strip()):
        report("locator is required")
        return

    recognised = (
        HTTPS_RE.match(locator) is not None
        or locator.startswith("doi:")
        or NCT_RE.match(locator) is not None
    )
    if not recognised:
        report("locator must be https, doi:, or NCT id")


_COMMAND_NAME = "check-anti-amyloid-ingest-catalog"
# Minimum number of entries whose ``ingest_status`` is "ingested".
_MIN_INGESTED_ENTRIES = 24


def _build_report(
    frontier: Path,
    frontier_id: str,
    issues: list[dict[str, str]],
    *,
    entry_count: int = 0,
    category_counts: Counter[str] | None = None,
    priority_counts: Counter[str] | None = None,
    status_counts: Counter[str] | None = None,
    representation_counts: Counter[str] | None = None,
    verified_at: Any = None,
    verified_age_days: float | None = None,
) -> dict[str, Any]:
    """Assemble the result dict so every exit path reports the same keys."""

    def ordered(counter: Counter[str] | None) -> dict[str, int]:
        return dict(sorted((counter or {}).items()))

    return {
        "ok": not issues,
        "command": _COMMAND_NAME,
        "frontier": str(frontier),
        "frontier_id": frontier_id,
        "entry_count": entry_count,
        "issue_count": len(issues),
        "counts": ordered(category_counts),
        "priority_counts": ordered(priority_counts),
        "status_counts": ordered(status_counts),
        "representation_counts": ordered(representation_counts),
        "verified_at": verified_at,
        "verified_age_days": verified_age_days,
        "issues": issues,
    }


def validate_catalog(frontier: Path, max_age_days: int) -> dict[str, Any]:
    """Validate ``ingest/source-ingest-plan.v1.json`` under *frontier*.

    The catalog is checked for its schema tag, its frontier id (against
    ``.vela/config.toml``), the freshness of ``verified_at``, per-entry
    required fields, locator form, priority and ingest status values,
    artifact and finding references (against the JSON file stems in
    ``.vela/artifacts`` and ``.vela/findings``), and the minimum entry counts
    per category, per priority, and for ingested entries.

    Args:
        frontier: Root directory of the frontier.
        max_age_days: Maximum allowed age of ``verified_at`` in days.

    Returns:
        A report dict with the keys ``ok``, ``command``, ``frontier``,
        ``frontier_id``, ``entry_count``, ``issue_count``, ``counts``,
        ``priority_counts``, ``status_counts``, ``representation_counts``,
        ``verified_at``, ``verified_age_days`` and ``issues``. ``ok`` is true
        only when no issue was recorded. Problems with the catalog file
        itself (missing, unreadable, invalid JSON, not an object) end
        validation early but still carry any issue recorded before them.
    """
    issues: list[dict[str, str]] = []
    frontier_id = load_frontier_id(frontier, issues)
    catalog_path = frontier / "ingest" / "source-ingest-plan.v1.json"

    if not catalog_path.is_file():
        issues.append({"id": "ingest-catalog", "field": "file", "message": "missing ingest/source-ingest-plan.v1.json"})
        return _build_report(frontier, frontier_id, issues)

    try:
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        raw_text = catalog_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        issues.append({"id": "ingest-catalog", "field": "file", "message": f"cannot read catalog: {exc}"})
        return _build_report(frontier, frontier_id, issues)

    try:
        catalog = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        issues.append({"id": "ingest-catalog", "field": "json", "message": str(exc)})
        return _build_report(frontier, frontier_id, issues)

    if not isinstance(catalog, dict):
        # Valid JSON that is a list/number/string has no fields to validate.
        issues.append({"id": "ingest-catalog", "field": "json", "message": "catalog must be a JSON object"})
        return _build_report(frontier, frontier_id, issues)

    # Known ids are the stems of the JSON files stored in the frontier.
    artifact_ids = {path.stem for path in (frontier / ".vela" / "artifacts").glob("*.json")}
    finding_ids = {path.stem for path in (frontier / ".vela" / "findings").glob("*.json")}

    if catalog.get("schema") != "vela.source-ingest-plan.v1":
        issues.append({"id": "ingest-catalog", "field": "schema", "message": "expected vela.source-ingest-plan.v1"})
    # The id comparison is skipped when the config yielded no frontier id.
    if frontier_id and catalog.get("frontier_id") != frontier_id:
        issues.append({"id": "ingest-catalog", "field": "frontier_id", "message": "frontier id does not match .vela/config.toml"})

    verified_at = catalog.get("verified_at")
    verified_age_days = None
    if not isinstance(verified_at, str) or not verified_at:
        issues.append({"id": "ingest-catalog", "field": "verified_at", "message": "verified_at is required"})
    else:
        try:
            verified = parse_instant(verified_at)
        except ValueError as exc:
            issues.append({"id": "ingest-catalog", "field": "verified_at", "message": f"invalid timestamp: {exc}"})
        else:
            verified_age_days = (datetime.now(timezone.utc) - verified).total_seconds() / 86400
            # Up to one day in the future is tolerated.
            if verified_age_days < -1:
                issues.append({"id": "ingest-catalog", "field": "verified_at", "message": "verified_at is in the future"})
            if verified_age_days > max_age_days:
                issues.append({"id": "ingest-catalog", "field": "verified_at", "message": f"catalog verification is older than {max_age_days} days"})

    entries = catalog.get("entries", [])
    if not isinstance(entries, list):
        issues.append({"id": "ingest-catalog", "field": "entries", "message": "entries must be a list"})
        entries = []
    if len(entries) < MIN_TOTAL_ENTRIES:
        issues.append({"id": "ingest-catalog", "field": "entries", "message": f"expected at least {MIN_TOTAL_ENTRIES} entries"})

    seen_ids: set[str] = set()
    category_counts: Counter[str] = Counter()
    priority_counts: Counter[str] = Counter()
    status_counts: Counter[str] = Counter()
    representation_counts: Counter[str] = Counter()

    # Fields that must be non-blank strings. "id" and "locator" are also
    # required but have dedicated checks below; listing them here as well
    # would report the same missing value twice.
    required_text_fields = (
        "name",
        "category",
        "priority",
        "representation",
        "source_type",
        "ingest_status",
        "access_terms",
        "license_note",
        "target_use",
    )
    allowed_status = {"ingested", "pointer_only", "candidate", "excluded"}
    allowed_priority = {"P0", "P1", "P2"}

    for entry in entries:
        if not isinstance(entry, dict):
            issues.append({"id": "entry", "field": "entry", "message": "entry must be an object"})
            continue

        entry_id = entry.get("id")
        if not isinstance(entry_id, str) or not entry_id.strip():
            # Fall back to a placeholder so later issues still carry an id.
            entry_id = "entry"
            issues.append({"id": entry_id, "field": "id", "message": "id is required"})
        elif entry_id in seen_ids:
            issues.append({"id": entry_id, "field": "id", "message": "duplicate id"})
        else:
            seen_ids.add(entry_id)

        for field in required_text_fields:
            value = entry.get(field)
            if not isinstance(value, str) or not value.strip():
                issues.append({"id": entry_id, "field": field, "message": f"{field} is required"})

        validate_locator(entry_id, entry.get("locator"), issues)

        category = entry.get("category")
        if isinstance(category, str):
            category_counts[category] += 1
        priority = entry.get("priority")
        if isinstance(priority, str):
            priority_counts[priority] += 1
            if priority not in allowed_priority:
                issues.append({"id": entry_id, "field": "priority", "message": "priority must be P0, P1, or P2"})
        status = entry.get("ingest_status")
        if isinstance(status, str):
            status_counts[status] += 1
            if status not in allowed_status:
                issues.append({"id": entry_id, "field": "ingest_status", "message": "unknown ingest status"})
        representation = entry.get("representation")
        if isinstance(representation, str):
            representation_counts[representation] += 1

        # Only ingested entries may (and must) point at a stored artifact.
        artifact_id = entry.get("current_frontier_artifact_id")
        if status == "ingested":
            if not isinstance(artifact_id, str) or not ARTIFACT_RE.match(artifact_id):
                issues.append({"id": entry_id, "field": "current_frontier_artifact_id", "message": "ingested entries must name a va_* artifact"})
            elif artifact_id not in artifact_ids:
                issues.append({"id": entry_id, "field": "current_frontier_artifact_id", "message": f"unknown artifact id: {artifact_id}"})
        elif artifact_id not in (None, ""):
            issues.append({"id": entry_id, "field": "current_frontier_artifact_id", "message": "only ingested entries may name a frontier artifact"})

        target_findings = entry.get("target_findings")
        if not isinstance(target_findings, list) or not target_findings:
            issues.append({"id": entry_id, "field": "target_findings", "message": "at least one target finding is required"})
        else:
            for finding_id in target_findings:
                if not isinstance(finding_id, str) or not FINDING_RE.match(finding_id):
                    issues.append({"id": entry_id, "field": "target_findings", "message": f"invalid finding id: {finding_id}"})
                elif finding_id not in finding_ids:
                    issues.append({"id": entry_id, "field": "target_findings", "message": f"unknown finding id: {finding_id}"})

    for category, minimum in MIN_COUNTS.items():
        if category_counts[category] < minimum:
            issues.append({"id": "ingest-catalog", "field": "category", "message": f"{category} needs at least {minimum} entries"})
    for priority, minimum in MIN_PRIORITIES.items():
        if priority_counts[priority] < minimum:
            issues.append({"id": "ingest-catalog", "field": "priority", "message": f"{priority} needs at least {minimum} entries"})

    if status_counts["ingested"] < _MIN_INGESTED_ENTRIES:
        issues.append({"id": "ingest-catalog", "field": "ingest_status", "message": f"expected at least {_MIN_INGESTED_ENTRIES} ingested entries"})

    return _build_report(
        frontier,
        frontier_id,
        issues,
        entry_count=len(entries),
        category_counts=category_counts,
        priority_counts=priority_counts,
        status_counts=status_counts,
        representation_counts=representation_counts,
        verified_at=verified_at,
        verified_age_days=verified_age_days,
    )


def main() -> int:
    """Run the catalog check from the command line.

    Takes the frontier directory as a positional argument, plus
    ``--max-age-days`` (default 45) and ``--json``. With ``--json`` the full
    report is printed as sorted, indented JSON; otherwise a text summary is
    printed with the verdict, entry count, per-category counts and one line
    per issue.

    Returns:
        0 when the report's ``ok`` flag is true, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Validate the anti-amyloid source ingest catalog.")
    parser.add_argument("frontier", type=Path)
    parser.add_argument("--max-age-days", type=int, default=45)
    parser.add_argument("--json", action="store_true", dest="json_output")
    options = parser.parse_args()

    report = validate_catalog(options.frontier.resolve(), options.max_age_days)
    exit_code = 0 if report["ok"] else 1

    if options.json_output:
        print(json.dumps(report, indent=2, sort_keys=True))
        return exit_code

    verdict = "ok" if report["ok"] else "failed"
    lines = [
        f"anti-amyloid ingest catalog: {verdict}",
        f"entries: {report['entry_count']}",
    ]
    lines.extend(f"{category}: {count}" for category, count in report["counts"].items())
    lines.extend(f"{issue['id']} {issue['field']}: {issue['message']}" for issue in report["issues"])
    print("\n".join(lines))
    return exit_code


# Script entry point: the process exit status is main()'s return value
# (0 when the catalog is valid, 1 otherwise).
if __name__ == "__main__":
    sys.exit(main())
