import os
import time
import math
import re

import torch
import nltk
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from nltk.tokenize import sent_tokenize
from typing import Dict, List, Optional


def download_nltk_data():
    """Ensure the NLTK tokenizer resources used by ``sent_tokenize`` exist.

    Looks up ``tokenizers/punkt`` and ``tokenizers/punkt_tab`` and downloads
    whichever is missing. If the ``punkt_tab`` lookup fails with anything
    other than ``LookupError``, ``punkt`` is downloaded as a fallback.
    """
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)

    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        nltk.download('punkt_tab', quiet=True)
    except Exception:
        # Best-effort fallback: any other failure falls back to plain punkt.
        nltk.download('punkt', quiet=True)


# Runs at import time so sent_tokenize() is usable as soon as the module loads.
download_nltk_data()

from .pdf_processor import extract_section


def _normalize_sts(raw: float) -> float:
    """Map a raw STS cross-encoder score onto [0, 1].

    The scale of ``raw`` is guessed from its magnitude:

    * ``raw > 1.5``        -> treated as a 0..5 score, divided by 5 and clamped
    * ``0 <= raw <= 1``    -> returned unchanged
    * anything else        -> passed through a logistic sigmoid

    ``None``, non-numeric values and NaN all map to ``0.0``.
    """
    if raw is None:
        return 0.0
    try:
        r = float(raw)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if math.isnan(r):
        # NaN fails every comparison below and would otherwise leak out.
        return 0.0
    if r > 1.5:
        return max(0.0, min(1.0, r / 5.0))
    if 0.0 <= r <= 1.0:
        return r
    try:
        return 1.0 / (1.0 + math.exp(-r))
    except OverflowError:
        # math.exp overflows for very negative r; the sigmoid limit is 0.
        return 0.0


def _normalize_cosine(val: float) -> float:
    """Map cosine similarity (-1..1) to [0, 1] and clamp.

    ``None``, non-numeric values and NaN all map to ``0.0``.
    """
    try:
        v = float(val)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if math.isnan(v):
        # min(1.0, nan) evaluates to 1.0, which would turn NaN into a
        # perfect score; treat it as "no similarity" instead.
        return 0.0
    return max(0.0, min(1.0, (v + 1.0) / 2.0))


# Default calibration window, overridable through the environment.
_DEFAULT_MINMAX_LOW = float(os.environ.get("SEMANTIC_MIN", 0.5))
_DEFAULT_MINMAX_HIGH = float(os.environ.get("SEMANTIC_MAX", 0.75))


def calibrate_minmax(raw: float,
                     low: float = _DEFAULT_MINMAX_LOW,
                     high: float = _DEFAULT_MINMAX_HIGH) -> float:
    """Stretch an empirical range [low, high] to [0, 1] with clamping.

    If ``high <= low`` the window is degenerate and ``raw`` is only clamped.
    ``None`` maps to ``0.0`` (same convention as ``calibrate_fit_score``).
    """
    if raw is None:
        return 0.0
    value = float(raw)
    if high <= low:
        return max(0.0, min(1.0, value))
    return max(0.0, min(1.0, (value - low) / (high - low)))


_STOPWORDS = set(
    "the a an and or of to in for with on at by from as is are was were be been being this that these those your our their his her its it you we they i not no but if then so than into over under within about across after before during without while do does did done can could should would may might must will shall use using used able ability experience experiences role roles strong hands-on solid good great excellent knowledge understanding understandingly understanding's having have has had including include includes demonstrated proven track record working work works".split()
)

# Candidate keyword tokens: letters/digits plus the punctuation that occurs
# inside tech terms such as "c++", "c#", "node.js" or "scikit-learn".
_KEYWORD_TOKEN_RE = re.compile(r"[A-Za-z0-9+#\.\-]{2,30}")


def extract_keywords(text: str, max_k: int = 20) -> List[str]:
    """Crude keyword extractor.

    Tokens of 2..30 characters (alphanumerics plus ``+ # . -``) are
    lower-cased, stripped of sentence punctuation at the edges, filtered
    against ``_STOPWORDS`` and pure-digit tokens, and de-duplicated with
    first-seen order preserved.

    Args:
        text: Free text to scan; empty or ``None`` yields ``[]``.
        max_k: Maximum number of keywords to return; ``<= 0`` yields ``[]``.

    Returns:
        Up to ``max_k`` lower-case keywords.
    """
    if not text or max_k <= 0:
        return []

    seen = set()
    keywords: List[str] = []
    for token in _KEYWORD_TOKEN_RE.findall(text):
        # Drop trailing "." / "-" (sentence ends, dashes) and leading "-"
        # (bullets). A leading "." is kept so ".net" survives.
        word = token.lower().rstrip(".-").lstrip("-")
        if len(word) < 2 or word in _STOPWORDS or word.isdigit():
            continue
        if word in seen:
            continue
        seen.add(word)
        keywords.append(word)
        if len(keywords) >= max_k:
            break
    return keywords


# Window used by calibrate_fit_score: raw 0.45 -> 0.0, raw 0.75 -> 1.0.
_FIT_SCORE_FLOOR = 0.45
_FIT_SCORE_SPAN = 0.30


def calibrate_fit_score(raw_score: float) -> float:
    """Linearly stretch a raw blended score onto [0, 1] with clamping.

    ``None`` maps to ``0.0``.
    """
    if raw_score is None:
        return 0.0
    stretched = (float(raw_score) - _FIT_SCORE_FLOOR) / _FIT_SCORE_SPAN
    return max(0.0, min(1.0, stretched))


class ResumeScorer:
    """Scores a resume against a job description, section by section."""

    # Sentences at or below this many characters are ignored when scoring.
    _MIN_SENTENCE_CHARS = 20
    # Number of best-matching sentences averaged per section.
    _TOP_K = 3
    # Blend weights used when the cross-encoder score is available.
    _CE_WEIGHT = 0.6
    _BI_WEIGHT = 0.4
    _CE_BATCH_SIZE = 16
    # Heading aliases passed to extract_section() for each scored section.
    _SECTION_HEADINGS = {
        "skills": ["skills", "technical skills", "tech stack"],
        "projects": ["projects", "portfolio"],
        "experience": ["experience", "internship", "work experience"],
    }
    # Contribution of each section to the raw final score.
    _SECTION_WEIGHTS = {"skills": 0.35, "projects": 0.40, "experience": 0.25}

    def __init__(self, model_name: Optional[str] = None, use_cross_encoder: bool = True):
        """Resume scoring with Bi-Encoder + STS Cross-Encoder blending.

        - Bi-Encoder: fast semantic retrieval (BGE default)
        - STS Cross-Encoder: precise semantic similarity (stsb-roberta-base)

        Args:
            model_name: Bi-encoder model id; defaults to the ``MODEL_NAME``
                environment variable, else ``BAAI/bge-large-en-v1.5``.
            use_cross_encoder: When true, also load the cross-encoder named by
                ``CROSS_ENCODER_STS_MODEL`` (default
                ``cross-encoder/stsb-roberta-base``).
        """
        if model_name is None:
            model_name = os.environ.get("MODEL_NAME", "BAAI/bge-large-en-v1.5")
        self.use_cross_encoder = use_cross_encoder

        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = SentenceTransformer(model_name, device=device)

        self.sts_ce = None
        if self.use_cross_encoder:
            sts_name = os.environ.get("CROSS_ENCODER_STS_MODEL", "cross-encoder/stsb-roberta-base")
            self.sts_ce = CrossEncoder(sts_name, device=device)

        self.batch_size = 64 if device == "cuda" else 32

    def smart_similarity(self, jd_text: str, resume_text: str) -> float:
        """Compute similarity for a section using blended Bi-Encoder + STS CE.

        The section is split into sentences; each sentence is compared with
        the job description and the mean of the top matches is used. When the
        cross-encoder is available its score is blended 60/40 with the
        bi-encoder score; if it fails, the bi-encoder score is used alone.

        Args:
            jd_text: Job description text.
            resume_text: Text of one resume section; may be empty or ``None``
                when the section was not found.

        Returns:
            A normalized score in [0, 1], rounded to 3 decimals. ``0.0`` when
            either input is empty or no sentence is long enough to score.
        """
        # A missing section must score 0 instead of crashing sent_tokenize().
        if not jd_text or not resume_text:
            return 0.0

        sentences = [
            s.strip()
            for s in sent_tokenize(resume_text)
            if len(s.strip()) > self._MIN_SENTENCE_CHARS
        ]
        if not sentences:
            return 0.0

        # Bi-Encoder cosine (normalize embeddings so cosine behaves nicely)
        jd_emb = self.model.encode(
            [jd_text],
            convert_to_tensor=True,
            show_progress_bar=False,
            normalize_embeddings=True,
        )
        res_embs = self.model.encode(
            sentences,
            convert_to_tensor=True,
            show_progress_bar=False,
            batch_size=self.batch_size,
            normalize_embeddings=True,
        )
        sims = util.cos_sim(jd_emb, res_embs)[0]
        topk = min(self._TOP_K, len(sims))
        bi_top_mean = float(torch.topk(sims, k=topk).values.mean().item())
        bi_norm = _normalize_cosine(bi_top_mean)

        # STS Cross-Encoder (if available)
        ce_norm = None
        if self.use_cross_encoder and self.sts_ce is not None:
            pairs = [[jd_text, s] for s in sentences]
            try:
                sts_scores = self.sts_ce.predict(
                    pairs, show_progress_bar=False, batch_size=self._CE_BATCH_SIZE
                )
                # ensure list of floats
                if hasattr(sts_scores, "tolist"):
                    sts_scores = sts_scores.tolist()
                sts_scores = [float(x) for x in sts_scores]
                topk_ce = min(self._TOP_K, len(sts_scores))
                ce_top_mean = sum(sorted(sts_scores, reverse=True)[:topk_ce]) / topk_ce
                ce_norm = _normalize_sts(ce_top_mean)
            except Exception as e:
                # Deliberate best-effort: fall back to the bi-encoder score.
                print(f"⚠️ STS Cross-Encoder failed: {e}")
                ce_norm = None

        # Blend scores (prefer CE when available)
        if ce_norm is not None:
            combined = self._CE_WEIGHT * ce_norm + self._BI_WEIGHT * bi_norm
        else:
            combined = bi_norm
        score = max(0.0, min(1.0, float(combined)))

        print(f"🔎 bi={bi_norm:.3f}, sts={(ce_norm if ce_norm is not None else 'n/a')}, → combined={score:.3f}")
        return round(score, 3)

    # -------------------------------------------------------------
    # 🧮 score_resume() — section-wise scoring and final result
    # -------------------------------------------------------------
    def score_resume(self, job_description: str, resume_text: str) -> Dict[str, float]:
        """Score each resume section against the job description.

        Sections are pulled out with ``extract_section`` (imported from
        ``.pdf_processor``; presumably returns the text under the first
        matching heading — confirm against that module), scored with
        ``smart_similarity`` and combined as
        ``0.35*skills + 0.40*projects + 0.25*experience`` before calibration
        with ``calibrate_fit_score``.

        Returns:
            A dict with ``<section>_score`` and ``final_score`` in [0, 1]
            (3 decimals) plus matching ``*_percent`` entries (1 decimal).
        """
        start = time.time()

        sections = {
            name: extract_section(resume_text, headings)
            for name, headings in self._SECTION_HEADINGS.items()
        }

        scores = {}
        for name, content in sections.items():
            print(f"\n🧩 Scoring section: {name}")
            scores[name] = self.smart_similarity(job_description, content)

        # Defensive sanitation: coerce to float, convert values that look like
        # percentages back to ratios, then clamp to [0, 1].
        for k, v in list(scores.items()):
            try:
                fv = float(v)
            except (TypeError, ValueError):
                fv = 0.0
            if fv > 1.5:
                fv = fv / 100.0
                print(f"⚠️ Sanitized {k} from percent to ratio: {v} -> {fv}")
            fv = max(0.0, min(1.0, fv))
            scores[k] = round(fv, 3)

        raw_final = sum(
            weight * scores[name] for name, weight in self._SECTION_WEIGHTS.items()
        )
        calibrated_final = calibrate_fit_score(raw_final)
        final_score = calibrated_final

        elapsed = time.time() - start
        print(f"✅ Raw={raw_final:.3f} → Calibrated={calibrated_final:.3f} | Time: {elapsed:.2f}s")

        return {
            "skills_score": round(scores["skills"], 3),
            "projects_score": round(scores["projects"], 3),
            "experience_score": round(scores["experience"], 3),
            "final_score": round(final_score, 3),
            "skills_score_percent": round(scores["skills"] * 100.0, 1),
            "projects_score_percent": round(scores["projects"] * 100.0, 1),
            "experience_score_percent": round(scores["experience"] * 100.0, 1),
            "final_score_percent": round(final_score * 100.0, 1),
        }