#!/usr/bin/env python3
"""Download all ToxicDocs 'Frawley' PDFs from the manifest + extract text sidecars."""
import json, os, re, subprocess, time

# Directory that holds the manifest and receives the downloaded files.
BASE = "/mnt/data/Frawley/toxicdocs"
# Load the manifest inside a context manager so the file handle is closed
# right away, and decode it as UTF-8 explicitly instead of relying on the
# locale's default encoding.
with open(f"{BASE}/_manifest.json", encoding="utf-8") as _manifest_file:
    manifest = json.load(_manifest_file)

def safe(s, n=18):
    """Return a filename-safe token built from *s*, at most *n* characters long.

    Every character outside ``[A-Za-z0-9]`` is removed. Falsy input (``None``,
    ``""``, ``0``) and input that is empty after stripping both yield ``"na"``.
    Truthy non-string values (for example a numeric JSON field) are converted
    with ``str()`` first, so they no longer raise ``TypeError`` in ``re.sub``.
    """
    text = str(s) if s else ""
    cleaned = re.sub(r"[^A-Za-z0-9]+", "", text) or "na"
    return cleaned[:n]

# Triage label from the manifest -> filename prefix, so that a plain filename
# sort orders the documents by relevance.
order = {"yes": "1yes", "likely": "2likely", "unclear": "3unclear", "no": "4no"}
# A PDF of this many bytes or fewer is treated as a failed download.
MIN_PDF_BYTES = 200


def _bigger_than(path, min_bytes):
    """Return True if *path* exists and holds more than *min_bytes* bytes."""
    try:
        return os.path.getsize(path) > min_bytes
    except OSError:
        return False


def _download(url, path):
    """Fetch *url* into *path* with curl; return True only on a complete transfer.

    curl writes to ``path + ".part"`` and the file is moved into place only
    when curl exits with status 0. A timed-out or interrupted transfer
    therefore never leaves a truncated file at *path* that a later run would
    accept as a finished PDF. ``--fail`` makes curl exit non-zero on HTTP
    errors instead of saving the server's error page as the document.
    """
    tmp = path + ".part"
    result = subprocess.run(
        ["curl", "-sSL", "--fail", "--max-time", "40", "--retry", "2",
         "-o", tmp, url],
        capture_output=True,
    )
    if result.returncode == 0 and os.path.exists(tmp):
        os.replace(tmp, path)
        return True
    # Discard whatever partial data curl managed to write.
    try:
        os.remove(tmp)
    except OSError:
        pass
    detail = result.stderr.decode("utf-8", "replace").strip()
    print(f"  download failed (curl exit {result.returncode}): {url} {detail}",
          flush=True)
    return False


def _extract_text(pdf_path):
    """Run pdftotext on *pdf_path*; return True if a .txt sidecar over 5 bytes exists.

    Extraction stays best-effort: a missing pdftotext binary or a timeout is
    reported on stdout and counted as "no text" rather than aborting the run.
    """
    txt_path = pdf_path[:-4] + ".txt"
    try:
        subprocess.run(["pdftotext", "-q", pdf_path, txt_path],
                       capture_output=True, timeout=60)
    except (OSError, subprocess.SubprocessError) as exc:
        print(f"  pdftotext failed for {os.path.basename(pdf_path)}: {exc}",
              flush=True)
        return False
    return _bigger_than(txt_path, 5)


ok = miss = txt_ok = 0
rows = []  # (filename, manifest entry) for every PDF that is present locally
for i, d in enumerate(manifest):
    url = d.get("cdnUrl")
    h = d.get("hash") or d.get("id")
    if not url or not h:
        # Nothing to fetch, or no identifier to name the file after.
        miss += 1
    else:
        cat = order.get(d.get("ourFrawley", "unclear"), "3unclear")
        # str() tolerates a numeric year; "/" is replaced because it would be
        # interpreted as a directory separator inside the output path.
        yr = (str(d.get("year") or "xxxx").strip() or "xxxx").replace("/", "-")
        ident = str(h).replace("/", "-")
        firm = safe(d.get("firm"), 14)
        sub = safe(d.get("substance"), 10)
        fn = f"{cat}__{yr}__{firm}__{sub}__{ident}.pdf"
        path = f"{BASE}/{fn}"
        if not _bigger_than(path, MIN_PDF_BYTES):
            _download(url, path)
            time.sleep(0.15)  # short pause after each request
        if _bigger_than(path, MIN_PDF_BYTES):
            ok += 1
            if _extract_text(path):
                txt_ok += 1
            rows.append((fn, d))
        else:
            miss += 1
    # Progress is reported every 25 entries, including skipped ones.
    if (i + 1) % 25 == 0:
        print(f"  {i+1}/{len(manifest)}  ok={ok} miss={miss} txt={txt_ok}", flush=True)

print(f"DONE: downloaded {ok}/{len(manifest)} PDFs, {txt_ok} with extractable text, {miss} missing.")

# Write a readable Markdown index, grouped by triage category.


def _cell(entry, key):
    """Return ``entry[key]`` as single-line text; missing or null values give ''.

    Runs of whitespace (including newlines) are collapsed to one space so a
    value can never break out of its Markdown bullet.
    """
    value = entry.get(key)
    return "" if value is None else " ".join(str(value).split())


rows.sort(key=lambda x: x[0])
# The index contains non-ASCII punctuation, so the encoding is fixed to UTF-8
# instead of depending on the locale.
with open(f"{BASE}/_INDEX.md", "w", encoding="utf-8") as index_file:
    # Report the number of documents actually present, not a fixed figure.
    index_file.write(f"# ToxicDocs 'Frawley' set — local copy ({len(rows)} documents)\n\n")
    index_file.write("Downloaded from toxicdocs.org `/api/search?q=Frawley`. Filenames sort by relevance: "
                     "`1yes` = confirmed John P. Frawley (Hercules); `2likely`; `3unclear`; `4no`. "
                     "Each `.pdf` has a `.txt` sidecar (pdftotext). `ourFrawley`/`context` from the OCR-snippet triage.\n\n")
    for cat in ("1yes", "2likely", "3unclear", "4no"):
        members = [(fn, d) for fn, d in rows if fn.startswith(f"{cat}__")]
        index_file.write(f"\n## {cat}  ({len(members)})\n\n")
        for fn, d in members:
            index_file.write(f"- `{fn}` — {_cell(d, 'year')} {_cell(d, 'firm')} / "
                             f"{_cell(d, 'substance')} — {_cell(d, 'context')}\n")
print("wrote _INDEX.md")
