import os
import time
import math
import re
import torch
import nltk
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from nltk.tokenize import sent_tokenize
from typing import Dict

def download_nltk_data():
    """Make sure the NLTK sentence-tokenizer data is available locally.

    Looks up ``tokenizers/punkt`` and ``tokenizers/punkt_tab`` with
    ``nltk.data.find`` and quietly downloads whichever one is missing.
    If the ``punkt_tab`` lookup fails with anything other than
    ``LookupError``, ``punkt`` is downloaded instead.
    """
    # Classic punkt models.
    punkt_missing = False
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        punkt_missing = True
    if punkt_missing:
        nltk.download('punkt', quiet=True)

    # punkt_tab: pick the package to fetch first, download afterwards.
    package_to_fetch = None
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        package_to_fetch = 'punkt_tab'
    except Exception:
        # Any other lookup failure falls back to the classic package.
        package_to_fetch = 'punkt'
    if package_to_fetch is not None:
        nltk.download(package_to_fetch, quiet=True)

# Runs at import time so the tokenizer data is checked before any call to
# sent_tokenize further down in this module.
download_nltk_data()

from .pdf_processor import extract_section

def _normalize_sts(raw: float) -> float:
    """Map a raw STS cross-encoder score onto [0, 1].

    The scale of the input is guessed from its magnitude:

    * ``r > 1.5``        -> treated as a 0..5 score: ``r / 5`` capped at 1.0
    * ``0 <= r <= 1``    -> already normalized, returned unchanged
    * anything else      -> treated as a logit and passed through a sigmoid
      (this covers negative values and the gap ``1 < r <= 1.5``)

    ``None``, values that cannot be converted to ``float`` and NaN all
    yield 0.0.
    """
    if raw is None:
        return 0.0
    try:
        r = float(raw)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if math.isnan(r):
        # NaN fails every comparison below and would otherwise leak out of
        # the sigmoid; callers that clamp with min()/max() would then turn
        # it into a perfect 1.0.
        return 0.0
    if r > 1.5:
        return min(1.0, r / 5.0)
    if 0.0 <= r <= 1.0:
        return r
    if r < 0.0:
        # Stable sigmoid for negative logits: exp(r) only underflows to 0,
        # whereas 1 / (1 + exp(-r)) raises OverflowError for r below ~-709.
        e = math.exp(r)
        return e / (1.0 + e)
    return 1.0 / (1.0 + math.exp(-r))


def _normalize_cosine(val: float) -> float:
    """Map cosine similarity (-1..1) to [0,1] and clamp.

    ``None``, values that cannot be converted to ``float`` and NaN all
    yield 0.0.
    """
    try:
        v = float(val)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if math.isnan(v):
        # min(1.0, nan) evaluates to 1.0, so without this guard a NaN
        # similarity would be reported as a perfect match.
        return 0.0
    return max(0.0, min(1.0, (v + 1.0) / 2.0))


# Default calibration window; overridable through the SEMANTIC_MIN and
# SEMANTIC_MAX environment variables, which are read once at import time.
_DEFAULT_MINMAX_LOW = float(os.environ.get("SEMANTIC_MIN", 0.5))
_DEFAULT_MINMAX_HIGH = float(os.environ.get("SEMANTIC_MAX", 0.75))


def calibrate_minmax(raw: float, low: float = _DEFAULT_MINMAX_LOW, high: float = _DEFAULT_MINMAX_HIGH) -> float:
    """Stretch an empirical range [low, high] to [0,1] with clamping.

    Args:
        raw: Score to calibrate. ``None`` and NaN yield 0.0; any other value
            is converted with ``float()`` (which raises for non-numeric
            input).
        low: Raw value that maps to 0.0.
        high: Raw value that maps to 1.0.

    Returns:
        The calibrated score in [0, 1]. When ``high <= low`` the window is
        degenerate and ``raw`` itself is clamped to [0, 1] instead.
    """
    if raw is None:
        return 0.0
    value = float(raw)
    if math.isnan(value):
        # min(1.0, nan) is 1.0, so an unguarded NaN would score as perfect.
        return 0.0
    if high <= low:
        return max(0.0, min(1.0, value))
    return max(0.0, min(1.0, (value - low) / (high - low)))


# Lower-cased filler words that extract_keywords() drops. The entry with an
# apostrophe can never match, because the token pattern excludes "'".
_STOPWORDS = set(
    "the a an and or of to in for with on at by from as is are was were be been being this that these those your our their his her its it you we they i not no but if then so than into over under within about across after before during without while do does did done can could should would may might must will shall use using used able ability experience experiences role roles strong hands-on solid good great excellent knowledge understanding understandingly understanding's having have has had including include includes demonstrated proven track record working work works".split()
)


def extract_keywords(text: str, max_k: int = 20) -> list:
    """Crude keyword extractor.

    Tokens are runs of 2..30 characters drawn from letters, digits and
    ``+ # . -`` (so ``c++``, ``c#``, ``node.js`` and ``.net`` survive). Each
    token is lower-cased, stripped of trailing ``.``/``-`` and leading ``-``
    (sentence punctuation and bullet dashes), then dropped if it is shorter
    than two characters, has no alphanumeric character, is a stopword or is
    purely numeric. Survivors are de-duplicated with first-seen order kept.

    Args:
        text: Free text to scan; falsy input yields ``[]``.
        max_k: Maximum number of keywords to return; values <= 0 yield ``[]``.

    Returns:
        Up to ``max_k`` lower-cased keywords.
    """
    if not text or max_k <= 0:
        return []
    seen = set()
    out = []
    for raw_token in re.findall(r"[A-Za-z0-9+#\.\-]{2,30}", text):
        # Without this trim "python." and "python" would count as different
        # keywords and "experience." would slip past the stopword filter.
        token = raw_token.lower().rstrip(".-").lstrip("-")
        if len(token) < 2 or not any(ch.isalnum() for ch in token):
            continue
        if token in _STOPWORDS or token.isdigit() or token in seen:
            continue
        seen.add(token)
        out.append(token)
        if len(out) >= max_k:
            break
    return out

def calibrate_fit_score(raw_score: float) -> float:
    """Stretch a raw blended score onto [0, 1].

    Raw scores at or below 0.45 map to 0.0, raw scores at or above 0.75 map
    to 1.0, and values in between are interpolated linearly.

    Args:
        raw_score: Raw score. ``None`` and NaN yield 0.0; any other value is
            converted with ``float()`` (which raises for non-numeric input).

    Returns:
        The calibrated score in [0, 1].
    """
    if raw_score is None:
        return 0.0
    value = float(raw_score)
    if math.isnan(value):
        # min(1.0, nan) is 1.0, so an unguarded NaN would be reported as a
        # perfect fit.
        return 0.0
    stretched = (value - 0.45) / 0.30
    return max(0.0, min(1.0, stretched))


class ResumeScorer:
    """Score a resume against a job description, one section at a time.

    Each section is compared with the job description using a bi-encoder
    (cosine similarity of sentence embeddings) and, optionally, an STS
    cross-encoder. The two signals are blended into a score in [0, 1].
    """

    # Headings handed to extract_section() for each scored section.
    SECTION_HEADINGS = {
        "skills": ["skills", "technical skills", "tech stack"],
        "projects": ["projects", "portfolio"],
        "experience": ["experience", "internship", "work experience"],
    }
    # Weight of each section in the raw (pre-calibration) final score.
    # Must have one entry per key of SECTION_HEADINGS.
    SECTION_WEIGHTS = {"skills": 0.35, "projects": 0.40, "experience": 0.25}
    # Number of best-matching sentences averaged per model.
    TOP_K = 3

    def __init__(self, model_name: str = None, use_cross_encoder: bool = True):
        """Resume scoring with Bi-Encoder + STS Cross-Encoder blending.
        - Bi-Encoder: fast semantic retrieval (BGE default)
        - STS Cross-Encoder: precise semantic similarity (stsb-roberta-base)

        Args:
            model_name: SentenceTransformer model name. When None, the
                MODEL_NAME environment variable is used, falling back to
                "BAAI/bge-large-en-v1.5".
            use_cross_encoder: When True, also load the cross-encoder named
                by CROSS_ENCODER_STS_MODEL (default
                "cross-encoder/stsb-roberta-base").
        """
        if model_name is None:
            model_name = os.environ.get("MODEL_NAME", "BAAI/bge-large-en-v1.5")

        self.use_cross_encoder = use_cross_encoder
        device = "cuda" if torch.cuda.is_available() else "cpu"

        self.model = SentenceTransformer(model_name, device=device)

        self.sts_ce = None
        if self.use_cross_encoder:
            sts_name = os.environ.get("CROSS_ENCODER_STS_MODEL", "cross-encoder/stsb-roberta-base")
            self.sts_ce = CrossEncoder(sts_name, device=device)

        # Larger encode batches when a GPU is available.
        self.batch_size = 64 if device == "cuda" else 32

    @staticmethod
    def _clamp_unit(value: float) -> float:
        """Clamp *value* to [0, 1], mapping NaN to 0.0.

        A bare ``max(0.0, min(1.0, nan))`` evaluates to 1.0, which would
        turn a failed computation into a perfect score.
        """
        value = float(value)
        if math.isnan(value):
            return 0.0
        return max(0.0, min(1.0, value))

    @classmethod
    def _blend(cls, bi_norm: float, ce_norm) -> float:
        """Blend both signals (60% cross-encoder, 40% bi-encoder).

        Falls back to the bi-encoder score alone when ``ce_norm`` is None
        or NaN. The result is clamped to [0, 1].
        """
        if ce_norm is None or math.isnan(ce_norm):
            return cls._clamp_unit(bi_norm)
        return cls._clamp_unit(0.6 * ce_norm + 0.4 * bi_norm)

    def _bi_encoder_score(self, jd_text: str, sentences: list) -> float:
        """Mean of the top-k sentence cosine similarities, normalized to [0, 1]."""
        # Normalize embeddings so cosine behaves nicely.
        jd_emb = self.model.encode(
            [jd_text],
            convert_to_tensor=True,
            show_progress_bar=False,
            normalize_embeddings=True,
        )
        res_embs = self.model.encode(
            sentences,
            convert_to_tensor=True,
            show_progress_bar=False,
            batch_size=self.batch_size,
            normalize_embeddings=True,
        )
        sims = util.cos_sim(jd_emb, res_embs)[0]
        topk = min(self.TOP_K, len(sims))
        bi_top_mean = float(torch.topk(sims, k=topk).values.mean().item())
        return _normalize_cosine(bi_top_mean)

    def _cross_encoder_score(self, jd_text: str, sentences: list):
        """Mean of the top-k cross-encoder scores, normalized to [0, 1].

        Returns None when the cross-encoder is disabled, fails, or yields
        no usable (non-NaN) score, so the caller can fall back to the
        bi-encoder alone.
        """
        if not self.use_cross_encoder or self.sts_ce is None:
            return None
        pairs = [[jd_text, s] for s in sentences]
        try:
            sts_scores = self.sts_ce.predict(pairs, show_progress_bar=False, batch_size=16)
            # ensure list of floats
            if hasattr(sts_scores, "tolist"):
                sts_scores = sts_scores.tolist()
            sts_scores = [float(x) for x in sts_scores]
            # NaN would make the sort order, and therefore the top-k, arbitrary.
            sts_scores = [x for x in sts_scores if not math.isnan(x)]
            if not sts_scores:
                return None
            topk_ce = min(self.TOP_K, len(sts_scores))
            ce_top_mean = sum(sorted(sts_scores, reverse=True)[:topk_ce]) / topk_ce
            ce_norm = _normalize_sts(ce_top_mean)
        except Exception as e:
            # Best effort: scoring continues with the bi-encoder only.
            print(f"⚠️ STS Cross-Encoder failed: {e}")
            return None
        return None if math.isnan(ce_norm) else ce_norm

    def smart_similarity(self, jd_text: str, resume_text: str) -> float:
        """Compute similarity for a section using blended Bi-Encoder + STS CE.

        Args:
            jd_text: Job description text.
            resume_text: Text of one resume section.

        Returns:
            A normalized score in [0,1], rounded to 3 decimals. 0.0 when
            either text is empty/None or when the section has no sentence
            longer than 20 characters.
        """
        if not jd_text or not resume_text:
            # Nothing to compare; also keeps None away from the tokenizer.
            return 0.0

        sentences = [s.strip() for s in sent_tokenize(resume_text) if len(s.strip()) > 20]
        if not sentences:
            return 0.0

        bi_norm = self._bi_encoder_score(jd_text, sentences)
        ce_norm = self._cross_encoder_score(jd_text, sentences)

        # Blend scores (prefer CE when available)
        score = self._blend(bi_norm, ce_norm)
        print(f"🔎 bi={bi_norm:.3f}, sts={(ce_norm if ce_norm is not None else 'n/a')}, → combined={score:.3f}")
        return round(score, 3)

    # -------------------------------------------------------------
    # 🧮 score_resume() — section-wise scoring and final result
    # -------------------------------------------------------------
    def score_resume(self, job_description: str, resume_text: str) -> Dict[str, float]:
        """Score every configured resume section and combine the results.

        Args:
            job_description: Job description text.
            resume_text: Full resume text; sections are pulled out with
                extract_section().

        Returns:
            A dict with ``<section>_score`` and ``final_score`` as ratios in
            [0, 1] (3 decimals), plus the matching ``*_percent`` keys in
            [0, 100] (1 decimal). The final score is the weighted section
            sum passed through calibrate_fit_score().
        """
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # when the system clock is adjusted.
        start = time.perf_counter()

        sections = {
            # Pass a copy so the shared class-level lists cannot be mutated.
            name: extract_section(resume_text, list(headings))
            for name, headings in self.SECTION_HEADINGS.items()
        }

        scores = {}
        for name, content in sections.items():
            print(f"\n🧩 Scoring section: {name}")
            scores[name] = self.smart_similarity(job_description, content)

        # Defensive sanitising in case smart_similarity is overridden and
        # returns a percentage or a non-numeric value.
        for k, v in list(scores.items()):
            try:
                fv = float(v)
            except Exception:
                fv = 0.0
            if fv > 1.5:
                fv = fv / 100.0
                print(f"⚠️ Sanitized {k} from percent to ratio: {v} -> {fv}")
            scores[k] = round(self._clamp_unit(fv), 3)

        raw_final = sum(self.SECTION_WEIGHTS[name] * score for name, score in scores.items())
        calibrated_final = calibrate_fit_score(raw_final)

        elapsed = time.perf_counter() - start
        print(f"✅ Raw={raw_final:.3f} → Calibrated={calibrated_final:.3f}  |  Time: {elapsed:.2f}s")

        # Ratio keys first, then percent keys.
        result = {f"{name}_score": round(score, 3) for name, score in scores.items()}
        result["final_score"] = round(calibrated_final, 3)
        for name, score in scores.items():
            result[f"{name}_score_percent"] = round(score * 100.0, 1)
        result["final_score_percent"] = round(calibrated_final * 100.0, 1)
        return result