#!/usr/bin/env python3
"""Download the key primary sources the dossier relies on (beyond ./papers and ./toxicdocs)."""
import json, os, subprocess, urllib.request

# Destination directory for every file this script downloads or writes.
BASE = "/mnt/data/Frawley/sources"
os.makedirs(BASE, exist_ok=True)  # runs at module level; no error if the directory already exists
# One (name, status text, size in bytes, url-or-document-id) tuple per source;
# filled in by the download helpers and printed/written by the report at the end.
status = []

def curl(name, url, timeout=120, headers=None):
    """Download *url* to ``BASE/name`` with the ``curl`` CLI and log the result.

    The transfer is written to a ``.part`` scratch file that is promoted to
    the final name only on success, so a failed attempt (HTTP error page,
    tiny stub, unreachable host) neither leaves junk under the real file
    name nor overwrites a good copy from an earlier run.

    Args:
        name: File name to create inside ``BASE``.
        url: Address to fetch; redirects are followed (``-L``).
        timeout: Value passed to curl's ``--max-time``, in seconds.
        headers: Optional iterable of raw ``"Key: value"`` header strings,
            passed to curl in the order given.

    Returns:
        True when curl reported HTTP 200 and more than 500 bytes were saved,
        False otherwise.

    Side effects:
        Appends ``(name, status_text, size, url)`` to the module-level
        ``status`` list. For a successful ``.pdf`` download, also tries to
        write a ``.txt`` sidecar next to it with ``pdftotext``.
    """
    path = f"{BASE}/{name}"
    part = f"{path}.part"
    # Clear a leftover scratch file so its size cannot be mistaken for this run's.
    if os.path.exists(part):
        os.remove(part)

    cmd = ["curl", "-sSL", "--max-time", str(timeout), "--retry", "2",
           "-A", "Mozilla/5.0 (research archive)"]
    for header in headers or ():
        cmd += ["-H", header]
    cmd += ["-o", part, "-w", "%{http_code} %{size_download}", url]

    r = subprocess.run(cmd, capture_output=True, text=True)
    fields = (r.stdout or "").split()
    code = fields[0] if fields else "ERR"
    size = os.path.getsize(part) if os.path.exists(part) else 0
    # Anything of 500 bytes or less is treated as an error stub, not a real document.
    ok = code == "200" and size > 500

    if ok:
        os.replace(part, path)
        verdict = "OK"
    else:
        if os.path.exists(part):
            os.remove(part)
        detail = code
        if code in ("000", "ERR"):
            # No HTTP status was obtained; curl's last stderr line says why.
            err_lines = (r.stderr or "").strip().splitlines()
            if err_lines:
                detail = f"{code}:{err_lines[-1][:40]}"
        verdict = f"FAIL({detail})"
    status.append((name, verdict, size, url))

    # text sidecar for PDFs
    if ok and name.endswith(".pdf"):
        try:
            subprocess.run(["pdftotext", "-q", path, path[:-4] + ".txt"], capture_output=True)
        except OSError:
            # pdftotext is missing or not runnable: the PDF itself was saved
            # and logged, so skip the optional sidecar instead of raising.
            pass
    return ok

def documentcloud(name, docid):
    """Fetch a DocumentCloud document's PDF, text and basic metadata.

    Queries the DocumentCloud API for the document's slug and asset host,
    downloads ``<name>.pdf`` and ``<name>.txt`` through ``curl()``, and
    writes ``<name>.meta.json`` (id, title, slug) into ``BASE``.

    Any exception raised along the way is recorded in ``status`` as a
    ``FAIL(api:...)`` entry instead of propagating, so one bad document
    does not abort the rest of the run.

    Args:
        name: Base file name (without extension) for the saved files.
        docid: DocumentCloud document id (the calls in this file pass a string).
    """
    try:
        api_url = f"https://api.www.documentcloud.org/api/documents/{docid}/"
        with urllib.request.urlopen(api_url, timeout=30) as r:
            d = json.load(r)
        slug = d.get("slug")
        if not slug:
            # Without a slug the asset URLs would end in "None.pdf"; report it instead.
            raise ValueError("no slug in API response")
        # Tolerate a null or slash-less asset_url in the API payload.
        asset = (d.get("asset_url") or "https://s3.documentcloud.org/").rstrip("/") + "/"
        pdf = f"{asset}documents/{docid}/{slug}.pdf"
        txt = f"{asset}documents/{docid}/{slug}.txt"
        curl(f"{name}.pdf", pdf)
        # This lands on the same path as the pdftotext sidecar curl() writes for the PDF.
        curl(f"{name}.txt", txt)
        meta = {"id": docid, "title": d.get("title"), "slug": slug}
        # Context manager so the handle is flushed and closed deterministically.
        with open(f"{BASE}/{name}.meta.json", "w") as f:
            f.write(json.dumps(meta, indent=1))
    except Exception as e:
        status.append((name, f"FAIL(api:{str(e)[:40]})", 0, docid))

# --- Poison Papers (DocumentCloud) ---
# Each row: (base file name, DocumentCloud document id).
for dc_name, dc_id in (
    ("PoisonPapers_B1575_Rowe-to-Frawley_1965-03-19", "3253794"),   # crown jewel
    ("PoisonPapers_1A-1-563_Boehringer-Monsanto_dioxin", "3418422"),
):
    documentcloud(dc_name, dc_id)

# Direct downloads, fetched in the order listed.
# Each row: (file name, URL, extra keyword arguments for curl()).
for dl_name, dl_url, dl_opts in (
    # --- Strand I: FDA disposition / threshold lineage ---
    ("FDA_Threshold-of-Regulation_60FR36582_1995.pdf",
     "https://www.govinfo.gov/content/pkg/FR-1995-07-17/pdf/95-17435.pdf",
     {}),
    ("Oser-Hall_GRAS_Substances_IFT_1972_quotes-NAS-1.0ppm.pdf",
     "https://www.femaflavor.org/sites/default/files/5.%20GRAS%20Substances%20(3250-3325).pdf",
     {}),
    # --- Strand II: congressional record ---
    ("Senate_2,4,5-T_hearing_1970_witness-index_Frawley-ABSENT.pdf",
     "https://www.nal.usda.gov/exhibits/speccoll/files/original/f10d95c8bf5c82674a141435ef70ba84.pdf",
     {"timeout": 180}),
    ("CongRec_DailyDigest_D406_1967-08-10_Goddard.pdf",
     "https://www.congress.gov/crecb/1967/GPO-CRECB-1967-pt29-Pages406-410.pdf",
     {}),
    # --- Court opinions (HTML/text via reader proxies) ---
    ("565_F.Supp.1263_AgentOrange_1983_via-jina.txt",
     "https://r.jina.ai/https://law.justia.com/cases/federal/district-courts/FSupp/565/1263/1458052/",
     {}),
    ("516_US_417_Hercules-v-US_1996_CornellLII.html",
     "https://www.law.cornell.edu/supct/html/94-818.ZO.html",
     {}),
):
    curl(dl_name, dl_url, **dl_opts)

# --- Blocked from this sandbox (record for operator) ---
# Each row: (label, why it cannot be fetched here, URL for a manual download).
_NEEDS_OPERATOR = [
    (
        "Rulis_1987_DeMinimis-Threshold-of-Regulation",
        "regulations.gov 403 to curl",
        "https://downloads.regulations.gov/EPA-HQ-OPP-2013-0821-0008/content.pdf",
    ),
    (
        "UCSF_Darby_rxfb0228_1971-Bionetics-Frawley-Hart-to-Upholt",
        "download.industrydocuments.ucsf.edu DNS-blocked",
        "https://www.industrydocuments.ucsf.edu/docs/rxfb0228",
    ),
    (
        "UCSF_Darby_yhgd0228_1968-Frawley-to-NAS-Insignificant-Levels",
        "UCSF DNS-blocked",
        "https://www.industrydocuments.ucsf.edu/docs/yhgd0228",
    ),
    (
        "UCSF_tobacco_zqxb0104_1994-Frawley-statement-DRAFT",
        "UCSF DNS-blocked",
        "https://www.industrydocuments.ucsf.edu/docs/zqxb0104",
    ),
    (
        "NAP_1969_Toxicologically-Insignificant-Levels_monograph",
        "NAP image-only, login-gated",
        "https://nap.nationalacademies.org/catalog/20376/",
    ),
]
# No download is attempted for these; they are logged with size 0 so the
# report lists them alongside the fetched sources.
status.extend(
    (label, f"NEEDS-OPERATOR ({reason})", 0, link)
    for label, reason, link in _NEEDS_OPERATOR
)

# --- report ---
# Console summary: one fixed-width row per recorded source.
print(f"{'STATUS':28} {'SIZE':>9}  NAME")
for name, st, size, url in status:
    print(f"{st:28} {size:>9}  {name}")
# The index text contains a non-ASCII em dash, so pin UTF-8 rather than
# depending on the locale's default encoding (which may be unable to encode it).
with open(f"{BASE}/_SOURCES_INDEX.md", "w", encoding="utf-8") as f:
    f.write("# Local archive of key sources (beyond ./papers and ./toxicdocs)\n\n")
    f.write("Downloaded for offline reference. PDFs have `.txt` sidecars. Items marked NEEDS-OPERATOR\n")
    f.write("could not be fetched from this sandbox (DNS/403/login) — URLs given for manual download.\n\n")
    # One markdown bullet per source: status, file name, byte size, origin.
    for name, st, size, url in status:
        f.write(f"- **{st}** — `{name}` ({size:,} B) — {url}\n")
print("\nwrote _SOURCES_INDEX.md")
