#!/usr/bin/env python3
"""Rulis Threshold-of-Regulation reproduction RESTRICTED to the 1984 CPDB roster.

Rulis (1987) drew his potency distribution from the ORIGINAL 1984 Carcinogenic
Potency Database (Gold et al., EHP 58:9-319) — ~770 test compounds. The NLM file
we hold (CPDBChemical.xls) is the FINAL combined version (~1547 chemicals, lit
through 2001 / NTP through 2004). This script restricts the NLM carcinogen set to
the chemicals that were already in the 1984 edition, then re-runs the same
reproduction, and prints it side by side with the full-database run.

How the 1984 roster is recovered: the Gold 1984 paper's Appendix 1 ("Chemical
names and synonyms") and Appendix 2 ("Chemical names listed by CAS number") list
every chemical in that edition with its CAS number. We OCR-extract those, keep
only CAS numbers that pass the CAS check-digit test (OCR-mangled ones drop out —
conservative), and also keep normalized chemical names. An NLM carcinogen counts
as "in the 1984 edition" if its CAS is in the 1984 CAS set OR its normalized name
matches a 1984 name.

CAVEAT (documented in FINDINGS.md): this restricts the chemical SET to 1984's
scope, but uses the NLM harmonized TD50 VALUES (the 1984 per-chemical TD50 live
only in the OCR-mangled Part IV plot and are not reliably machine-readable). So
this answers "does Rulis's result hold on the 1984-size carcinogen set?", not
"recompute every 1984 TD50".
"""
import re, subprocess, numpy as np, pandas as pd
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
GOLD = ROOT/"papers/lateral/Gold-etal_1984_Carcinogenic-Potency-Database_EHP58_9-319.pdf"
CPDB = ROOT/"papers/cpdb/CPDBChemical.xls"

# ---------------------------------------------------------------- helpers
def num(x):
    if x is None: return np.nan
    s=str(x).strip()
    if not s or s[0] in ".–-—": return np.nan
    m=re.match(r"([0-9]+\.?[0-9]*(?:[eE][-+]?[0-9]+)?)", s)
    return float(m.group(1)) if m else np.nan

def norm(s):
    s=str(s).lower()
    s=re.sub(r"\(.*?\)"," ",s)                 # drop parentheticals/synonyms
    s=re.sub(r"[^a-z0-9]+"," ",s)
    s=re.sub(r"\b\d+\b"," ",s)                 # drop bare numbers/superscripts
    return re.sub(r"\s+"," ",s).strip()

def cas_valid(cas):
    m=re.fullmatch(r"(\d{2,7})-(\d{2})-(\d)", cas)
    if not m: return False
    digits=m.group(1)+m.group(2); chk=int(m.group(3))
    s=sum(int(d)*(i+1) for i,d in enumerate(reversed(digits)))
    return s%10==chk

# ---------------------------------------------------------------- 1984 roster
txt=subprocess.run(["pdftotext","-layout",str(GOLD),"-"],
                   capture_output=True,text=True).stdout.splitlines()
# locate the appendix block (App1 starts "APPENDIX 1", App3 ends our region)
def find(pat):
    for i,l in enumerate(txt):
        if re.search(pat,l): return i
    return -1
a1=find(r"APPENDIX 1:\s*CHEMICAL NAMES AND SYNONYMS")
a3=find(r"APPENDIX 3:\s*STRAIN CODES")
region=txt[a1:a3]
print(f"1984 appendix region: lines {a1}..{a3}  ({len(region)} lines)")

cas84=set(); names84=set()
CASRE=re.compile(r"\b(\d{2,7}-\d{2}-\d)\b")
for ln in region:
    for c in CASRE.findall(ln):
        if cas_valid(c): cas84.add(c)
    # names: strip CAS tokens, split on 2+ spaces, keep alpha-ish fragments
    stripped=CASRE.sub("  ",ln)
    for frag in re.split(r"\s{2,}",stripped):
        n=norm(frag)
        if len(n)>=4 and re.search(r"[a-z]",n): names84.add(n)
print(f"1984 roster: {len(cas84)} check-digit-valid CAS, {len(names84)} normalized name fragments")

# ---------------------------------------------------------------- NLM CPDB carcinogens
rm=pd.read_excel(CPDB,sheet_name="Rats and Mice",engine="calamine",header=None,skiprows=2)
rm=rm.rename(columns={0:"name",1:"cas",2:"salm",3:"td50_rat",4:"td50_mouse"})
rm=rm[["name","cas","salm","td50_rat","td50_mouse"]].dropna(subset=["name"])
rm["td50"]=rm[["td50_rat","td50_mouse"]].apply(
    lambda r:np.nanmin([num(r.td50_rat),num(r.td50_mouse)]),axis=1)
rm["cas"]=rm["cas"].astype(str).str.strip()
rm["nname"]=rm["name"].map(norm)
carc=rm.dropna(subset=["td50"]).copy()                 # positive carcinogens (have TD50)

# membership test: CAS in 1984, or normalized name in 1984, or substring (len>6)
names84_long=[n for n in names84 if len(n)>6]
def in1984(row):
    if row.cas in cas84: return "cas"
    if row.nname in names84: return "name"
    for n in names84_long:
        if len(row.nname)>6 and (row.nname in n or n in row.nname): return "substr"
    return None
carc["src84"]=carc.apply(in1984,axis=1)
sub=carc[carc.src84.notna()].copy()
print(f"\nNLM carcinogens (numeric TD50): {len(carc)}")
print(f"  of these, in 1984 roster: {len(sub)}  "
      f"(by CAS {sum(sub.src84=='cas')}, by name {sum(sub.src84=='name')}, by substr {sum(sub.src84=='substr')})")

# ---------------------------------------------------------------- Rulis reproduction (both sets)
K=5e-5; RISK=1e-6                       # mg/kg/day per ppb dietary ; one-in-a-million lifetime
def report(df,tag):
    df=df.copy()
    df["conc_1e6_ppb"]=RISK*df.td50/(0.5*K)        # ppb giving exactly RISK
    pe=lambda C: float((df.conc_1e6_ppb<C).mean())
    print(f"\n========== {tag}  (N={len(df)} carcinogens) ==========")
    print(f"TD50 mg/kg/day: median={df.td50.median():.3g}  geomean={np.exp(np.log(df.td50).mean()):.3g}  "
          f"min={df.td50.min():.3g}  max={df.td50.max():.3g}")
    for C,lab in [(5,"5 ppb"),(1,"1 ppb (Rulis: ~half either side)"),
                  (0.5,"0.5 ppb (1995 ToR rule)"),(0.05,"0.05 ppb = 50 ppt (Rulis example)")]:
        print(f"  at {lab:<38}: {pe(C)*100:5.1f}% exceed 1e-6  (excluded {100-pe(C)*100:.1f}%)")
    f05=pe(0.5); f005=pe(0.05)
    print(f"  bookkeeping 1-in-5-carcinogen @0.5 ppb : 0.2*{f05:.3f} = {0.2*f05*100:.1f}% of decisions >1e-6 "
          f"(~{100-0.2*f05*100:.0f}/100 <=1e-6)")
    print(f"  bookkeeping 1-in-5-carcinogen @50 ppt  : 0.2*{f005:.3f} = {0.2*f005*100:.1f}% of decisions >1e-6 "
          f"(~{100-0.2*f005*100:.0f}/100 <=1e-6)")
    return df

report(carc,"FULL NLM combined database")
report(sub ,"RESTRICTED to 1984 roster")

sub[["name","cas","td50","salm","src84"]].sort_values("td50").to_csv(
    ROOT/"analysis/cpdb_1984_restricted_carcinogens.csv",index=False)
print(f"\nwrote analysis/cpdb_1984_restricted_carcinogens.csv ({len(sub)} rows)")