#!/usr/bin/env python3
"""Cross-reference Frawley's 1967 'safe' compounds against the CPDB carcinogens.
(a) CAS lookup of the compounds named in the essay/dossier (verified);
(b) cleaned automated name-match over the whole 220-compound appendix for scale."""
import re, numpy as np, pandas as pd

def num(x):
    """Parse the leading unsigned number out of a spreadsheet cell.

    The value is stringified and stripped; the leading run shaped like
    ``digits[.digits][e+/-digits]`` is converted to ``float`` and whatever
    follows it (for example a trailing letter) is ignored.

    Returns ``np.nan`` when *x* is ``None``, is blank, starts with ``.`` or
    with a hyphen / en dash / em dash, or does not start with a digit.
    """
    if x is None:
        return np.nan
    text = str(x).strip()
    # Cells that open with a dot or any dash are treated as "no value".
    if text == "" or text[0] in ".–-—":
        return np.nan
    found = re.match(r"([0-9]+\.?[0-9]*(?:[eE][-+]?[0-9]+)?)", text)
    if found is None:
        return np.nan
    return float(found.group(1))

# Load the "Rats and Mice" sheet of the CPDB chemical workbook.  The first two
# spreadsheet rows are skipped and no header row is read, so the columns are
# addressed by position and named here; rows without a chemical name are dropped.
rm = pd.read_excel(
    "papers/cpdb/CPDBChemical.xls",
    sheet_name="Rats and Mice",
    engine="calamine",
    header=None,
    skiprows=2,
)
rm = rm.rename(
    columns={0: "name", 1: "cas", 2: "salm", 3: "td50_rat", 4: "td50_mouse"}
).dropna(subset=["name"])

# Per-chemical TD50 = the smaller of the parsed rat and mouse values.
# np.fmin ignores a NaN on one side and yields NaN only when both sides are
# NaN, so rows with no TD50 at all no longer trigger the
# "All-NaN slice encountered" RuntimeWarning that np.nanmin raised per row.
rat_td50 = rm["td50_rat"].map(num).to_numpy(dtype=float)
mouse_td50 = rm["td50_mouse"].map(num).to_numpy(dtype=float)
rm["td50"] = np.fmin(rat_td50, mouse_td50)

rm["cas"] = rm["cas"].astype(str).str.strip()

# Rows with a usable TD50 form the working set; cas2 is the same set keyed by
# CAS string (the index may contain duplicates).
carc = rm.dropna(subset=["td50"])
cas2 = carc.set_index("cas")

# ---------- (a) curated CAS lookup (Frawley ppm + flag) ----------
# Each KEY entry is (display name, Frawley 1967 no-effect ppm as text,
# Frawley flag, CAS number used for the lookup in cas2).
KEY = [
    ("Vinyl chloride", "120,000", "safest entry", "75-01-4"),
    ("Acrylamide", "40", "named exception", "79-06-1"),
    ("DEHP (phthalate plasticizer)", "1,300", "packaging case", "117-81-7"),
    ("Butylated hydroxyanisole (BHA)", "5,000", "antioxidant", "25013-16-5"),
    ("Catechol", "1,250", "T = tumours @ higher levels", "120-80-9"),
    ("Hydroquinone", "10,000", "T = tumours @ higher levels", "123-31-9"),
    ("Thiourea", "—", "ref-70 'tumours in rats'", "62-56-6"),
    ("Citrus Red No. 2", "500", "safe colorant", "6358-53-8"),
    ("Ponceau 3R", "5,000", "safe colorant", "3564-09-8"),
    ("Sodium cyclamate", "10,000", "safe sweetener", "139-05-9"),
    ("DDT", "1", "benchmark 'toxic'", "50-29-3"),
    ("Dieldrin", "0.5", "benchmark 'toxic'", "60-57-1"),
    ("Aldrin", "<0.5", "benchmark 'toxic'", "309-00-2"),
]

# One output row per KEY entry, in KEY order.
rows = []
for nm, ppm, flag, cas in KEY:
    if cas not in cas2.index:
        rows.append((nm, ppm, flag, cas, "—", "—", "not in CPDB"))
        continue
    hit = cas2.loc[cas]
    if isinstance(hit, pd.DataFrame):
        # The CAS number labels several rows of cas2: use the first one.
        hit = hit.iloc[0]
    rows.append((nm, ppm, flag, cas, f"{hit.td50:.3g}", str(hit.salm), "YES"))

out = pd.DataFrame(
    rows,
    columns=[
        "Frawley compound",
        "Frawley no-effect (ppm)",
        "Frawley flag",
        "CAS",
        "CPDB TD50 (mg/kg/day)",
        "Salmonella",
        "CPDB carcinogen?",
    ],
)

# Widen the console rendering, print the table, and save it as CSV.
pd.set_option("display.width", 200)
pd.set_option("display.max_colwidth", 40)
print("=== Frawley's 'safe' compounds, looked up in the CPDB by CAS ===")
print(out.to_string(index=False))
out.to_csv("analysis/frawley_cpdb_keycompounds.csv", index=False)

# ---------- (b) cleaned automated match over the whole appendix ----------
def norm(s):
    """Reduce a compound name to a lowercase key for exact-string matching.

    Applied in order to ``str(s).lower()``: parenthesised spans are removed,
    every run of characters other than ``a-z``/``0-9`` becomes a space,
    stand-alone digit tokens are removed, and whitespace is collapsed.
    The result is stripped of leading and trailing spaces.
    """
    text = str(s).lower()
    # Every stage substitutes a single space, so the stages share one loop.
    for pattern in (r"\(.*?\)", r"[^a-z0-9]+", r"\b\d+\b", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()
# Normalized names of every row in carc; keys shorter than five characters
# are discarded before matching.
cpdb_norm = {n for n in (norm(x) for x in carc.name) if len(n) >= 5}

# Read the Frawley text line by line.  The context manager closes the file
# handle, which the previous bare open(...).read() left to the garbage collector.
with open("papers/f1967.txt", encoding="utf-8", errors="ignore") as fh:
    txt = fh.read().splitlines()

# Loop-invariant patterns, compiled once.
column_gap_re = re.compile(r"\s{2,}")  # two or more spaces separate chunks on a line
stop_line_re = re.compile(r"\s*1\.\s")  # a line starting with "1. " ends the scan
# A chunk that starts with a name followed by an (optionally "<"-prefixed) number.
entry_re = re.compile(r"\s*([A-Za-z][A-Za-z0-9,\-'() ]{3,55}?)\s+<?\s*[0-9][0-9,\.]*")

fraw = set()
grab = False
for ln in txt:
    # Any line mentioning APPENDIX switches collection on and is itself skipped.
    if "APPENDIX" in ln.upper():
        grab = True
        continue
    # NOTE(review): presumably the "1. " line is the start of a numbered list
    # that follows the appendix -- confirm against the source text.
    if grab and stop_line_re.match(ln):
        break
    if not grab:
        continue
    for chunk in column_gap_re.split(ln):
        m = entry_re.match(chunk)
        if m:
            nm = norm(m.group(1))
            if len(nm) >= 5:
                fraw.add(nm)

# Names present, after normalization, in both sets.
exact = sorted(fraw & cpdb_norm)
print("\n=== Automated whole-appendix match ===")
print(f"Frawley names parsed (>=5 chars, normalized): {len(fraw)}")
print(f"CPDB carcinogens (distinct normalized names): {len(cpdb_norm)}")
print(f"Frawley names that exactly match a CPDB carcinogen: {len(exact)}")
for e in exact:
    print("   =", e)