#!/usr/bin/env python3
"""Reproduce Rulis's Threshold-of-Regulation number from the CPDB potency distribution,
and cross-reference Frawley's 220-compound 'safe' table against the CPDB carcinogens.

Data:
  papers/cpdb/CPDBChemical.xls  (NLM Carcinogenic Potency Database; sheet 'Rats and Mice')
  papers/f1967.txt              (OCR of Frawley 1967, incl. the 220-compound Appendix)
"""
import re, numpy as np, pandas as pd

# ---------------------------------------------------------------- CPDB load
def num(x):
    if x is None: return np.nan
    s = str(x).strip()
    if not s or s[0] in ".–-—": return np.nan
    m = re.match(r"([0-9]+\.?[0-9]*(?:[eE][-+]?[0-9]+)?)", s)
    return float(m.group(1)) if m else np.nan

rm = pd.read_excel("papers/cpdb/CPDBChemical.xls", sheet_name="Rats and Mice",
                   engine="calamine", header=None, skiprows=2)
rm = rm.rename(columns={0:"name",1:"cas",2:"salm",3:"td50_rat",4:"td50_mouse"})
rm = rm[["name","cas","salm","td50_rat","td50_mouse"]].dropna(subset=["name"])
rm["td50_rat_n"]  = rm["td50_rat"].map(num)
rm["td50_mouse_n"]= rm["td50_mouse"].map(num)
rm["td50"] = rm[["td50_rat_n","td50_mouse_n"]].min(axis=1)          # most potent of the two
carc = rm.dropna(subset=["td50"]).copy()                            # positive carcinogens
print(f"CPDB 'Rats and Mice' rows: {len(rm)} | positive carcinogens (numeric TD50): {len(carc)}")
print(f"TD50 (mg/kg/day): median={carc.td50.median():.3g}  geomean={np.exp(np.log(carc.td50).mean()):.3g}  "
      f"min={carc.td50.min():.3g}  max={carc.td50.max():.3g}")

# ---------------------------------------------------------------- Rulis reproduction
# Rulis/Flamm: low-dose risk = potency x dose, potency = slope from TD50 to zero = 0.5/TD50  (per mg/kg/day).
# Diet->dose: FDA convention 60 kg person eating 3 kg food+water/day:
#   1 ppb diet (= ug/kg food) -> 3 ug/day -> /60 kg = 0.05 ug/kg/day = 5e-5 mg/kg/day.
#   (Flamm/Rulis: "1 ppt corresponds to 1e-7..1e-8 mg/kg/day"; here 1 ppt = 5e-8 mg/kg/day, in range.)
K = 5e-5            # mg/kg/day per ppb dietary
RISK = 1e-6        # one-in-a-million lifetime

# concentration (ppb) at which carcinogen i gives exactly RISK:  C_i = RISK / ((0.5/TD50)*K) = RISK*TD50/(0.5*K)
carc["conc_1e6_ppb"] = RISK * carc.td50 / (0.5*K)

def pct_exceed(C_ppb):
    """fraction of carcinogens whose lifetime risk at dietary conc C exceeds RISK."""
    return float((carc.conc_1e6_ppb < C_ppb).mean())

print("\n--- Reproduction of Rulis's curve 5 (fraction of carcinogens with lifetime risk > 1e-6) ---")
for C,label in [(5,"5 ppb (a level 'suggested by some')"),
                (1,"1 ppb (Rulis: ~half on either side)"),
                (0.5,"0.5 ppb (the 1995 ToR rule)"),
                (0.05,"0.05 ppb = 50 ppt diet (Rulis's worked example)")]:
    print(f"  at {label:<46}: {pct_exceed(C)*100:5.1f}% exceed 1e-6   (excluded: {100-pct_exceed(C)*100:.1f}%)")

# The level that keeps all but the most potent p% of carcinogens under 1e-6 = the p-th percentile of conc_1e6
for p in [50,15,10,5]:
    lvl = np.percentile(carc.conc_1e6_ppb, p)
    print(f"  level holding the least-potent {100-p:.0f}% of carcinogens under 1e-6: {lvl:.3g} ppb "
          f"(the most-potent {p}% would exceed)")

# Rulis's 1-in-5-is-a-carcinogen bookkeeping at the 1995 ToR (0.5 ppb):
f_exceed_05 = pct_exceed(0.5)
share_decisions_over = 0.2 * f_exceed_05
print(f"\n  Rulis bookkeeping at 0.5 ppb: if 1 in 5 unknown migrants is a carcinogen, share of ToR decisions "
      f">1e-6 = 0.2 x {f_exceed_05:.2f} = {share_decisions_over*100:.1f}%  "
      f"(=> ~{100-share_decisions_over*100:.0f} of 100 decisions <= 1e-6)")

# ---------------------------------------------------------------- Frawley 220 cross-reference
def norm(s):
    s = str(s).lower()
    s = re.sub(r"\(.*?\)", " ", s)                 # drop parentheticals
    s = re.sub(r"[^a-z0-9]+", " ", s).strip()
    s = re.sub(r"\b\d+\b", " ", s)                 # drop bare numbers/superscripts
    return re.sub(r"\s+", " ", s).strip()

# parse Frawley appendix (two-column): pull (name, ppm) pairs
txt = open("papers/f1967.txt", encoding="utf-8", errors="ignore").read().splitlines()
ap = []
grab = False
for ln in txt:
    if "APPENDIX" in ln: grab = True; continue
    if grab and re.match(r"\s*1\.\s", ln): break            # references start
    if not grab: continue
    # split the two columns on big gaps, then find 'name .... number' in each chunk
    for chunk in re.split(r"\s{3,}", ln):
        m = re.match(r"\s*([A-Za-z].{2,60}?)\s+<?\s*([0-9][0-9,\.]*)\s*[\*†T'\"]*\s*$", chunk)
        if m:
            nm = norm(m.group(1))
            if len(nm) >= 4: ap.append(nm)
fraw = set(ap)
print(f"\nFrawley appendix: parsed ~{len(fraw)} distinct compound names (of 220)")

cpdb_names = {norm(n): n for n in carc["name"]}
cpdb_norm = set(cpdb_names)
exact = sorted(fraw & cpdb_norm)
# also substring (one normalized name contained in the other, len>5) for synonyms
partial = []
for fn in fraw - cpdb_norm:
    for cn in cpdb_norm:
        if len(fn) > 5 and (fn in cn or cn in fn):
            partial.append((fn, cn)); break
print(f"Frawley compounds that are CPDB carcinogens — exact-name matches: {len(exact)}")
for e in exact: print("   =", e)
print(f"...additional likely matches (substring): {len(partial)}")
for fn,cn in partial[:25]: print("   ~", fn, "->", cn)