#!/usr/bin/env python3
import os, re, time, socket, subprocess
import smtplib

import pymysql

# ----------------- ENV CONFIG -----------------
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_NAME = os.getenv("DB_NAME", "email_cleaner")
DB_USER = os.getenv("DB_USER", "")
DB_PASS = os.getenv("DB_PASS", "")

VERIFY_MAIL_FROM = os.getenv("VERIFY_MAIL_FROM", "verify@archclinbiomedres.com")
VERIFY_HELO = os.getenv("VERIFY_HELO", "archclinbiomedres.com")

LIMIT_PER_RUN = int(os.getenv("SMTP_LIMIT", "60"))          # per cron run
SMTP_TIMEOUT = int(os.getenv("SMTP_TIMEOUT", "12"))         # seconds
PER_DOMAIN_LIMIT = int(os.getenv("PER_DOMAIN_LIMIT", "8"))  # per run per domain
SLEEP_BETWEEN = float(os.getenv("SLEEP_BETWEEN", "0.3"))    # seconds between checks

EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")

def mx_lookup(domain: str):
    """
    Return MX hosts sorted by priority.
    dnspython if available, else dig fallback.
    """
    try:
        import dns.resolver  # type: ignore
        answers = dns.resolver.resolve(domain, "MX", lifetime=5)
        mx = sorted([(r.preference, str(r.exchange).rstrip(".")) for r in answers], key=lambda x: x[0])
        return [h for _, h in mx]
    except Exception:
        pass

    try:
        out = subprocess.check_output(["dig", "+short", "MX", domain], timeout=6).decode("utf-8", "ignore")
        mx = []
        for line in out.splitlines():
            parts = line.strip().split()
            if len(parts) >= 2:
                try:
                    mx.append((int(parts[0]), parts[1].rstrip(".")))
                except Exception:
                    continue
        mx.sort(key=lambda x: x[0])
        return [h for _, h in mx]
    except Exception:
        return []

def smtp_rcpt_check(mx_host: str, rcpt: str):
    """
    Returns: (status, reason_code, smtp_code, smtp_detail)
    """
    try:
        server = smtplib.SMTP(mx_host, 25, timeout=SMTP_TIMEOUT)
        server.set_debuglevel(0)
        server.ehlo(VERIFY_HELO)

        server.mail(VERIFY_MAIL_FROM)
        code, msg = server.rcpt(rcpt)

        try:
            server.quit()
        except Exception:
            pass

        detail = (msg.decode("utf-8", "ignore") if isinstance(msg, (bytes, bytearray)) else str(msg))[:240]

        if 200 <= code < 300:
            return ("VALID", "SMTP_250_OK", int(code), detail)
        if code in (550, 551, 553):
            return ("INVALID", "SMTP_550_NOUSER", int(code), detail)
        if 400 <= code < 500:
            return ("RISKY", "SMTP_4XX_TEMP", int(code), detail)
        return ("RISKY", "SMTP_OTHER", int(code), detail)

    except (socket.timeout, TimeoutError):
        return ("RISKY", "SMTP_TIMEOUT", None, "timeout")
    except Exception as e:
        return ("RISKY", "SMTP_ERROR", None, str(e)[:240])

def db_connect():
    return pymysql.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASS,
        database=DB_NAME,
        charset="utf8mb4",
        autocommit=False,
        cursorclass=pymysql.cursors.DictCursor,
    )

def refresh_job_counts(cur, job_id: int):
    cur.execute("""
      UPDATE jobs SET
        processed_emails = (SELECT COUNT(*) FROM job_emails WHERE job_id=%s AND reason_code<>'PENDING'),
        count_valid      = (SELECT COUNT(*) FROM job_emails WHERE job_id=%s AND status='VALID'),
        count_invalid    = (SELECT COUNT(*) FROM job_emails WHERE job_id=%s AND status='INVALID'),
        count_risky      = (SELECT COUNT(*) FROM job_emails WHERE job_id=%s AND status='RISKY'),
        count_disposable = (SELECT COUNT(*) FROM job_emails WHERE job_id=%s AND status='DISPOSABLE'),
        count_role       = (SELECT COUNT(*) FROM job_emails WHERE job_id=%s AND status='ROLE'),
        count_duplicate  = (SELECT COUNT(*) FROM job_emails WHERE job_id=%s AND status='DUPLICATE'),
        count_suppressed = (SELECT COUNT(*) FROM job_emails WHERE job_id=%s AND status='SUPPRESSED'),
        updated_at=NOW()
      WHERE id=%s
    """, (job_id, job_id, job_id, job_id, job_id, job_id, job_id, job_id, job_id))

def main():
    if not DB_USER or not DB_PASS:
        print("Missing DB_USER/DB_PASS env vars.")
        return

    cn = db_connect()
    cur = cn.cursor()

    # pick one smtp job at a time
    cur.execute("""
        SELECT id FROM jobs
        WHERE mode='smtp' AND status IN ('queued','running')
        ORDER BY id ASC
        LIMIT 1
    """)
    job = cur.fetchone()
    if not job:
        print("No SMTP jobs.")
        cn.close()
        return

    job_id = int(job["id"])

    # ensure running
    cur.execute("UPDATE jobs SET status='running', updated_at=NOW() WHERE id=%s", (job_id,))
    cn.commit()

    # load SMTP_PENDING emails
    cur.execute("""
        SELECT id, email_normalized, domain
        FROM job_emails
        WHERE job_id=%s AND reason_code='SMTP_PENDING'
        ORDER BY id ASC
        LIMIT %s
    """, (job_id, LIMIT_PER_RUN))
    rows = cur.fetchall()

    if not rows:
        cur.execute("UPDATE jobs SET status='done', updated_at=NOW() WHERE id=%s", (job_id,))
        cn.commit()
        print("Job {} done (no SMTP_PENDING).".format(job_id))
        cn.close()
        return

    domain_used = {}
    processed = 0

    for r in rows:
        email_id = int(r["id"])
        email = (r["email_normalized"] or "").strip().lower()
        domain = (r["domain"] or "").strip().lower()

        if not EMAIL_RE.match(email) or not domain:
            cur.execute("""
                UPDATE job_emails
                SET status='INVALID', reason_code='SYNTAX_FAIL', score=0,
                    smtp_code=NULL, smtp_detail='bad format',
                    updated_at=NOW()
                WHERE id=%s
            """, (email_id,))
            processed += 1
            continue

        domain_used.setdefault(domain, 0)
        if domain_used[domain] >= PER_DOMAIN_LIMIT:
            continue
        domain_used[domain] += 1

        mx_hosts = mx_lookup(domain)
        if not mx_hosts:
            cur.execute("""
                UPDATE job_emails
                SET status='INVALID', reason_code='MX_MISSING', score=0,
                    smtp_code=NULL, smtp_detail='no mx found',
                    updated_at=NOW()
                WHERE id=%s
            """, (email_id,))
            processed += 1
            continue

        best = None
        for mx in mx_hosts[:2]:
            best = smtp_rcpt_check(mx, email)
            if best[0] in ("VALID", "INVALID"):
                break

        status, reason, smtp_code, smtp_detail = best

        if status == "VALID":
            score = 95
        elif status == "INVALID":
            score = 0
        else:
            score = 55

        cur.execute("""
            UPDATE job_emails
            SET status=%s, reason_code=%s, score=%s,
                smtp_code=%s, smtp_detail=%s,
                updated_at=NOW()
            WHERE id=%s
        """, (status, reason, score, smtp_code, smtp_detail, email_id))

        processed += 1
        if processed % 20 == 0:
            cn.commit()

        time.sleep(SLEEP_BETWEEN)

    cn.commit()

    refresh_job_counts(cur, job_id)
    cn.commit()

    cur.execute("SELECT COUNT(*) c FROM job_emails WHERE job_id=%s AND reason_code='SMTP_PENDING'", (job_id,))
    pending = int(cur.fetchone()["c"])
    if pending == 0:
        cur.execute("UPDATE jobs SET status='done', updated_at=NOW() WHERE id=%s", (job_id,))
        cn.commit()

    print("Job {}: processed={}, pending_left={}".format(job_id, processed, pending))
    cn.close()

if __name__ == "__main__":
    main()