import requests
from bs4 import BeautifulSoup
import time
import os

from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def search_jobs(query: str, num: int = 5):
    """Scrape the 104 Job Bank (104人力銀行) for postings matching a keyword
    and return the job details, including the description and links.

    Args:
        query: Job-search keyword, e.g. "資料分析" (data analysis).
        num: Number of postings requested (default 5).

    Returns:
        A list of (num + 5) postings; the extra 5 act as a buffer against
        sponsored/advertising listings. Each posting is a dict with:
            "title": job title
            "company": company name
            "industry": company industry
            "location": location
            "experience": required experience
            "education": required education
            "salary": salary
            "description": job description
            "job_link": link to the job posting
            "company_link": link to the company page
    """
    print('!! search_jobs() is called !!')

    # Construct the search URL
    url = f"https://www.104.com.tw/jobs/search/?ro=0&keyword={query}&order=16"
    response = requests.get(url)

    # Check whether the request was successful
    if response.status_code != 200:
        print("Failed to retrieve page")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    job_list = []
    seen_jobs = set()

    # Extract job postings from the search results
    job_elements = soup.find_all("div", class_="col")
    for job in job_elements:
        # Extract job details; the data-gtm-joblist attributes carry 104's
        # Chinese labels ("地區" = location, "經歷" = experience,
        # "學歷" = education, "薪資" = salary)
        title_elem = job.find("a", class_="info-job__text")
        company_elem = job.find("a", class_="info-company__text")
        industry_elem = job.find("span", class_="info-company-addon-type")
        location_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "地區" in x})
        experience_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "經歷" in x})
        education_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "學歷" in x})
        salary_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "薪資" in x})
        description_elem = job.find("div", class_="info-description")

        job_info = {
            "title": title_elem.text.strip() if title_elem else None,
            "company": company_elem.text.strip() if company_elem else None,
            "industry": industry_elem.text.strip() if industry_elem else None,
            "location": location_elem.text.strip() if location_elem else None,
            "experience": experience_elem.text.strip() if experience_elem else None,
            "education": education_elem.text.strip() if education_elem else None,
            "salary": salary_elem.text.strip() if salary_elem else None,
            "description": description_elem.text.strip() if description_elem else None,
            "job_link": title_elem["href"] if title_elem else None,
            "company_link": company_elem["href"] if company_elem else None,
        }

        # Skip incomplete cards (typically ads) and duplicate postings
        if None not in job_info.values() and job_info["title"] not in seen_jobs:
            seen_jobs.add(job_info["title"])
            job_list.append(job_info)

        # Stop once enough postings have been collected
        if len(job_list) >= num + 5:
            break

    # Retrieve the full job description from each individual job link
    for job in job_list:
        if job["job_link"]:
            try:
                job_response = requests.get(job["job_link"])
                if job_response.status_code == 200:
                    job_soup = BeautifulSoup(job_response.text, "html.parser")
                    description_div = job_soup.find(
                        "div", class_="job-description col-12")
                    job["description"] = "\n".join(
                        p.text.strip() for p in description_div.find_all("p")
                    ) if description_div else "Cannot retrieve"
                    time.sleep(1)  # Throttle requests to avoid triggering an IP block
            except Exception as e:
                print(f"Failed to fetch job description: {e}")
                job["description"] = "Cannot retrieve"

    return job_list


def retrieve_resume_info(query: str):
    """Retrieve resume-related content.

    When the user asks a resume-related question (e.g. about education, work
    experience, past employers, job titles, project experience, or portfolio),
    this function retrieves the most similar passages from the resume vector
    store.

    Notes:
        - **This function only returns raw passages found in the resume; it
          does not generate new answers.**
        - If the resume contains no relevant information, the result may be
          empty.
        - **Every question about the personal resume (e.g. "Which companies
          have you worked for?", "Describe your work experience.", "What is
          your education?") should be answered by calling this function,
          not by letting the model generate an answer on its own.**

    Args:
        query (str): The user's question, which must relate to the resume,
            e.g.:
            - "What is your education?"
            - "Which companies have you worked for?"
            - "Describe your work experience in one sentence."
            - "Which projects have you worked on?"
            - "Please share your portfolio."

    Returns:
        list[str]: The most relevant passages retrieved from the resume
        (at most 5).
    """
    print('!! retrieve_resume_info() is called !!')

    # Redirect the Hugging Face caches to a writable location
    os.environ["HF_HOME"] = "/tmp/huggingface"
    os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"

    cv_path = "./CV.docx"
    vector_dir = "./vector"
    model_name = "BAAI/bge-large-zh-v1.5"

    # Model parameters
    model_kwargs = {'device': 'cpu'}  # switch to 'cuda' if a GPU is available
    encode_kwargs = {'normalize_embeddings': False}  # whether to normalize embeddings

    # Initialize the Hugging Face embedding model
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )

    # Reuse the vector database if it already exists
    if os.path.exists(vector_dir):
        db = Chroma(persist_directory=vector_dir, embedding_function=embeddings)
    else:
        # Load the resume document
        loader = Docx2txtLoader(cv_path)
        data = loader.load()

        # Split the document so Chroma stores small paragraphs instead of
        # the entire resume
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                       chunk_overlap=100)
        docs = text_splitter.split_documents(data)

        # Create and persist the vector database
        db = Chroma.from_documents(docs, embeddings, persist_directory=vector_dir)
        db.persist()

    # Set up the retriever
    retriever = db.as_retriever(search_kwargs={"k": 5})

    # Perform retrieval
    retrieved_docs = retriever.get_relevant_documents(query)

    # Return only the retrieved text
    return [doc.page_content for doc in retrieved_docs]
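

# A minimal usage sketch, assuming network access to 104 and a CV.docx next
# to this module; the keyword and question below are illustrative only.
if __name__ == "__main__":
    # Fetch a few postings for a keyword and print a one-line summary of each
    for job in search_jobs("資料分析", num=3):
        print(f"{job['title']} @ {job['company']} ({job['salary']})")
        print(f"  {job['job_link']}")

    # Ask the resume retriever a question and print the matched passages
    for passage in retrieve_resume_info("你在哪些公司工作過?"):
        print(passage)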