import os
import time

import requests
from bs4 import BeautifulSoup
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
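# NOTE: assumed dependencies (the original source does not pin versions):
#   pip install requests beautifulsoup4 langchain langchain-community \
#       langchain-huggingface langchain-text-splitters chromadb docx2txt \
#       sentence-transformers
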

def search_jobs(query: str, num: int = 5):
    """Scrape 104 Job Bank (104.com.tw) for postings matching a keyword and
    return details for each posting, including the description and links.

    Args:
        query: Job-search keyword, e.g. "資料分析" ("data analysis").
        num: Number of postings requested (default: 5).

    Returns:
        Up to (num + 5) postings; the extra 5 compensate for sponsored
        (advertising) listings. Each posting is a dict with:
            "title": job title
            "company": company name
            "industry": company industry
            "location": location
            "experience": experience requirement
            "education": education requirement
            "salary": salary
            "description": job description
            "job_link": link to the job posting
            "company_link": link to the company page
    """
    print('!! search_jobs() is called !!')

    # Construct the search URL
    url = f"https://www.104.com.tw/jobs/search/?ro=0&keyword={query}&order=16"
    # A browser-like User-Agent helps avoid being rejected as a bot
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to retrieve page")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    job_list = []
    seen_jobs = set()

    # Extract job postings from the search results; the data-gtm-joblist
    # labels are Chinese (地區 region, 經歷 experience, 學歷 education, 薪資 salary)
    job_elements = soup.find_all("div", class_="col")
    for job in job_elements:
        # Extract job details
        title_elem = job.find("a", class_="info-job__text")
        company_elem = job.find("a", class_="info-company__text")
        industry_elem = job.find("span", class_="info-company-addon-type")
        location_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "地區" in x})
        experience_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "經歷" in x})
        education_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "學歷" in x})
        salary_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "薪資" in x})
        description_elem = job.find("div", class_="info-description")

        job_info = {
            "title": title_elem.text.strip() if title_elem else None,
            "company": company_elem.text.strip() if company_elem else None,
            "industry": industry_elem.text.strip() if industry_elem else None,
            "location": location_elem.text.strip() if location_elem else None,
            "experience": experience_elem.text.strip() if experience_elem else None,
            "education": education_elem.text.strip() if education_elem else None,
            "salary": salary_elem.text.strip() if salary_elem else None,
            "description": description_elem.text.strip() if description_elem else None,
            "job_link": title_elem["href"] if title_elem else None,
            "company_link": company_elem["href"] if company_elem else None,
        }

        # Skip incomplete entries and avoid duplicate job postings
        if None not in job_info.values() and job_info["title"] not in seen_jobs:
            seen_jobs.add(job_info["title"])
            job_list.append(job_info)

        # Stop once enough postings are collected (num + 5, per the docstring)
        if len(job_list) >= num + 5:
            break

    # Retrieve the full job description from each individual job link
    for job in job_list:
        if job["job_link"]:
            try:
                job_response = requests.get(job["job_link"], headers=headers)
                if job_response.status_code == 200:
                    job_soup = BeautifulSoup(job_response.text, "html.parser")
                    description_div = job_soup.find(
                        "div", class_="job-description col-12")
                    job["description"] = "\n".join(
                        p.text.strip() for p in description_div.find_all("p")
                    ) if description_div else "Cannot retrieve"
                # Throttle requests to avoid triggering IP blocking
                time.sleep(1)
            except Exception as e:
                print(f"Failed to fetch job description: {e}")
                job["description"] = "Cannot retrieve"

    return job_list
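
# A usage sketch (illustrative, not from the original file) of the structure
# search_jobs returns; the actual values depend on 104.com.tw's live markup:
#
#     jobs = search_jobs("資料分析", num=3)
#     # -> [{"title": "...", "company": "...", "industry": "...",
#     #      "location": "...", "experience": "...", "education": "...",
#     #      "salary": "...", "description": "...",
#     #      "job_link": "https://...", "company_link": "https://..."}, ...]
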
def retrieve_resume_info(query: str):
    """Retrieve resume-related content.

    When the user asks a resume-related question, e.g. about "education",
    "work experience", "previous employers", "job titles", "project
    experience", or "portfolio", this function retrieves the most similar
    sentences from the resume vector database.

    Notes:
        - **This function only returns raw content found in the resume; it
          does not generate new answers.**
        - If the resume contains no relevant information, the result may be
          empty.
        - **Every question about the personal resume (e.g. "Which companies
          have you worked for?", "Describe your work experience.", "What is
          your education?") should be answered by calling this function for
          retrieval rather than letting the model generate an answer on its
          own.**

    Args:
        query (str): The user's question; it must relate to the resume, e.g.:
            - "What is your education?"
            - "Which companies have you worked for?"
            - "Describe your work experience in one sentence."
            - "Which projects have you worked on?"
            - "Please share your portfolio."

    Returns:
        list[str]: The most relevant sentences retrieved from the resume
        (at most 5).
    """
    print('!! retrieve_resume_info() is called !!')

    # Redirect the Hugging Face cache to a writable location
    os.environ["HF_HOME"] = "/tmp/huggingface"
    os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"

    cv_path = "./CV.docx"
    vector_dir = "./vector"
    model_name = "BAAI/bge-large-zh-v1.5"

    # Embedding model settings
    model_kwargs = {'device': 'cpu'}  # Switch to 'cuda' if a GPU is available
    encode_kwargs = {'normalize_embeddings': False}  # Whether to normalize vectors

    # Initialize the Hugging Face embedding function
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )

    # Reuse the vector database if it already exists
    if os.path.exists(vector_dir):
        db = Chroma(persist_directory=vector_dir,
                    embedding_function=embeddings)
    else:
        # Load the resume document
        loader = Docx2txtLoader(cv_path)
        data = loader.load()

        # Split the document so Chroma stores small paragraphs
        # instead of the entire resume
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                       chunk_overlap=100)
        docs = text_splitter.split_documents(data)

        # Create and persist the vector database
        db = Chroma.from_documents(docs,
                                   embeddings,
                                   persist_directory=vector_dir)
        db.persist()

    # Set up the retriever and fetch the top-5 most similar chunks
    retriever = db.as_retriever(search_kwargs={"k": 5})
    retrieved_docs = retriever.get_relevant_documents(query)

    # Return only the retrieved text
    return [doc.page_content for doc in retrieved_docs]
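
# A minimal smoke test, assuming network access, that CV.docx sits next to
# this script, and that the embedding model can be downloaded. The sample
# query strings below are illustrative assumptions, not part of the original
# file.
if __name__ == "__main__":
    for job in search_jobs("資料分析", num=3):  # "資料分析" = "data analysis"
        print(job["title"], "|", job["company"], "|", job["salary"])
    # "你在哪些公司工作過?" = "Which companies have you worked for?"
    for sentence in retrieve_resume_info("你在哪些公司工作過?"):
        print("-", sentence)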