import os
import time

import requests
from bs4 import BeautifulSoup
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
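# NOTE: assumed dependencies (the original source does not pin versions):
#   pip install requests beautifulsoup4 langchain langchain-community \
#       langchain-huggingface langchain-text-splitters chromadb docx2txt \
#       sentence-transformers
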

def search_jobs(query: str, num: int = 5):
    """Scrape 104 Job Bank (104.com.tw) for postings matching a keyword and
    return details for each posting, including the description and links.

    Args:
        query: Job-search keyword, e.g. "資料分析" ("data analysis").
        num: Number of postings requested (default: 5).

    Returns:
        Up to (num + 5) postings; the extra 5 compensate for sponsored
        (advertising) listings. Each posting is a dict with:
            "title": job title
            "company": company name
            "industry": company industry
            "location": location
            "experience": experience requirement
            "education": education requirement
            "salary": salary
            "description": job description
            "job_link": link to the job posting
            "company_link": link to the company page
    """
    print('!! search_jobs() is called !!')

    # Construct the search URL
    url = f"https://www.104.com.tw/jobs/search/?ro=0&keyword={query}&order=16"
    # A browser-like User-Agent helps avoid being rejected as a bot
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to retrieve page")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    job_list = []
    seen_jobs = set()

    # Extract job postings from the search results; the data-gtm-joblist
    # labels are Chinese (地區 region, 經歷 experience, 學歷 education, 薪資 salary)
    job_elements = soup.find_all("div", class_="col")
    for job in job_elements:
        # Extract job details
        title_elem = job.find("a", class_="info-job__text")
        company_elem = job.find("a", class_="info-company__text")
        industry_elem = job.find("span", class_="info-company-addon-type")
        location_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "地區" in x})
        experience_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "經歷" in x})
        education_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "學歷" in x})
        salary_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "薪資" in x})
        description_elem = job.find("div", class_="info-description")

        job_info = {
            "title": title_elem.text.strip() if title_elem else None,
            "company": company_elem.text.strip() if company_elem else None,
            "industry": industry_elem.text.strip() if industry_elem else None,
            "location": location_elem.text.strip() if location_elem else None,
            "experience": experience_elem.text.strip() if experience_elem else None,
            "education": education_elem.text.strip() if education_elem else None,
            "salary": salary_elem.text.strip() if salary_elem else None,
            "description": description_elem.text.strip() if description_elem else None,
            "job_link": title_elem["href"] if title_elem else None,
            "company_link": company_elem["href"] if company_elem else None,
        }

        # Skip incomplete entries and avoid duplicate job postings
        if None not in job_info.values() and job_info["title"] not in seen_jobs:
            seen_jobs.add(job_info["title"])
            job_list.append(job_info)

        # Stop once enough postings are collected (num + 5, per the docstring)
        if len(job_list) >= num + 5:
            break

    # Retrieve the full job description from each individual job link
    for job in job_list:
        if job["job_link"]:
            try:
                job_response = requests.get(job["job_link"], headers=headers)
                if job_response.status_code == 200:
                    job_soup = BeautifulSoup(job_response.text, "html.parser")
                    description_div = job_soup.find(
                        "div", class_="job-description col-12")
                    job["description"] = "\n".join(
                        p.text.strip() for p in description_div.find_all("p")
                    ) if description_div else "Cannot retrieve"
                # Throttle requests to avoid triggering IP blocking
                time.sleep(1)
            except Exception as e:
                print(f"Failed to fetch job description: {e}")
                job["description"] = "Cannot retrieve"

    return job_list
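
# A usage sketch (illustrative, not from the original file) of the structure
# search_jobs returns; the actual values depend on 104.com.tw's live markup:
#
#     jobs = search_jobs("資料分析", num=3)
#     # -> [{"title": "...", "company": "...", "industry": "...",
#     #      "location": "...", "experience": "...", "education": "...",
#     #      "salary": "...", "description": "...",
#     #      "job_link": "https://...", "company_link": "https://..."}, ...]
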
def retrieve_resume_info(query: str):
    """Retrieve resume-related content.

    When the user asks a resume-related question, e.g. about "education",
    "work experience", "previous employers", "job titles", "project
    experience", or "portfolio", this function retrieves the most similar
    sentences from the resume vector database.

    Notes:
        - **This function only returns raw content found in the resume; it
          does not generate new answers.**
        - If the resume contains no relevant information, the result may be
          empty.
        - **Every question about the personal resume (e.g. "Which companies
          have you worked for?", "Describe your work experience.", "What is
          your education?") should be answered by calling this function for
          retrieval rather than letting the model generate an answer on its
          own.**

    Args:
        query (str): The user's question; it must relate to the resume, e.g.:
            - "What is your education?"
            - "Which companies have you worked for?"
            - "Describe your work experience in one sentence."
            - "Which projects have you worked on?"
            - "Please share your portfolio."

    Returns:
        list[str]: The most relevant sentences retrieved from the resume
        (at most 5).
    """
    print('!! retrieve_resume_info() is called !!')

    # Redirect the Hugging Face cache to a writable location
    os.environ["HF_HOME"] = "/tmp/huggingface"
    os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"

    cv_path = "./CV.docx"
    vector_dir = "./vector"
    model_name = "BAAI/bge-large-zh-v1.5"

    # Embedding model settings
    model_kwargs = {'device': 'cpu'}  # Switch to 'cuda' if a GPU is available
    encode_kwargs = {'normalize_embeddings': False}  # Whether to normalize vectors

    # Initialize the Hugging Face embedding function
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )

    # Reuse the vector database if it already exists
    if os.path.exists(vector_dir):
        db = Chroma(persist_directory=vector_dir,
                    embedding_function=embeddings)
    else:
        # Load the resume document
        loader = Docx2txtLoader(cv_path)
        data = loader.load()

        # Split the document so Chroma stores small paragraphs
        # instead of the entire resume
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                       chunk_overlap=100)
        docs = text_splitter.split_documents(data)

        # Create and persist the vector database
        db = Chroma.from_documents(docs,
                                   embeddings,
                                   persist_directory=vector_dir)
        db.persist()

    # Set up the retriever and fetch the top-5 most similar chunks
    retriever = db.as_retriever(search_kwargs={"k": 5})
    retrieved_docs = retriever.get_relevant_documents(query)

    # Return only the retrieved text
    return [doc.page_content for doc in retrieved_docs]
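
# A minimal smoke test, assuming network access, that CV.docx sits next to
# this script, and that the embedding model can be downloaded. The sample
# query strings below are illustrative assumptions, not part of the original
# file.
if __name__ == "__main__":
    for job in search_jobs("資料分析", num=3):  # "資料分析" = "data analysis"
        print(job["title"], "|", job["company"], "|", job["salary"])
    # "你在哪些公司工作過?" = "Which companies have you worked for?"
    for sentence in retrieve_resume_info("你在哪些公司工作過?"):
        print("-", sentence)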