import requests
from bs4 import BeautifulSoup
import time

import os

from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def search_jobs(query: str, num: int = 5):
    """根據職缺關鍵字,爬蟲取得104人力銀行上相符的職缺,並提供職缺相關資訊,包含職務描述、連結等。
    參數:
        query: 職缺搜尋關鍵字,例如 "資料分析"
        num: 需要的職缺數量 (預設為 5)
    輸出:
        會輸出 (num + 5) 個職缺,避免抓到是打廣告的職缺
        "title": 職務名稱
        "company": 公司
        "industry": 公司產業
        "location": 地點
        "experience": 經驗要求
        "education": 教育程度要求
        "salary": 薪資
        "description": 工作描述
        "job_link": 職務連結
        "company_link": 公司連結,
    """
    print('!! search_jobs() is called !!')

    # Construct search URL
    url = f"https://www.104.com.tw/jobs/search/?ro=0&keyword={query}&order=16"
    response = requests.get(url, timeout=10)  # timeout guards against hangs

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve page (status {response.status_code})")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    job_list = []
    seen_jobs = set()

    # Extract job postings from the search results
    job_elements = soup.find_all("div", class_="col")

    for job in job_elements:
        # Extract job details
        title_elem = job.find("a", class_="info-job__text")
        company_elem = job.find("a", class_="info-company__text")
        industry_elem = job.find("span", class_="info-company-addon-type")
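        # The data-gtm-joblist attribute values on 104 are Chinese labels:
        # 地區 = location, 經歷 = experience, 學歷 = education, 薪資 = salary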
        location_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "地區" in x})
        experience_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "經歷" in x})
        education_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "學歷" in x})
        salary_elem = job.find(
            "a", attrs={"data-gtm-joblist": lambda x: x and "薪資" in x})
        description_elem = job.find("div", class_="info-description")

        job_info = {
            "title": title_elem.text.strip() if title_elem else None,
            "company": company_elem.text.strip() if company_elem else None,
            "industry": industry_elem.text.strip() if industry_elem else None,
            "location": location_elem.text.strip() if location_elem else None,
            "experience": experience_elem.text.strip() if experience_elem else None,
            "education": education_elem.text.strip() if education_elem else None,
            "salary": salary_elem.text.strip() if salary_elem else None,
            "description": description_elem.text.strip() if description_elem else None,
            "job_link": title_elem["href"] if title_elem else None,
            "company_link": company_elem["href"] if company_elem else None,
        }

        # Keep only fully populated postings and skip duplicate titles
        if None not in job_info.values() and job_info["title"] not in seen_jobs:
            seen_jobs.add(job_info["title"])
            job_list.append(job_info)

        # Stop when enough job postings are collected
        if len(job_list) >= num + 5:
            break

    # Retrieve full job descriptions from individual job links
    for job in job_list:
        if job["job_link"]:
            try:
                job_response = requests.get(job["job_link"], timeout=10)
                if job_response.status_code == 200:
                    job_soup = BeautifulSoup(job_response.text, "html.parser")
                    description_div = job_soup.find(
                        "div", class_="job-description col-12")
                    if description_div:
                        job["description"] = "\n".join(
                            p.text.strip()
                            for p in description_div.find_all("p"))
                    else:
                        job["description"] = "Cannot retrieve"

                # Throttle requests to avoid triggering IP blocking
                time.sleep(1)

            except Exception as e:
                print(f"Failed to fetch job description: {e}")
                job["description"] = "Cannot retrieve"

    return job_list
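

# Example usage of search_jobs() -- a hedged sketch, not executed on import.
# It assumes the 104 markup classes targeted above are still current and
# that outbound network access is available:
#
#     jobs = search_jobs("資料分析", num=5)   # "data analysis"
#     for job in jobs:
#         print(job["title"], "|", job["company"], "|", job["job_link"])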


def retrieve_resume_info(query: str):
    """
    檢索履歷相關內容。
    當使用者詢問與履歷相關的問題時,例如「學歷」、「工作經歷」、「曾任職公司」、「職位名稱」、「專案經驗」、「作品展示」,此函數將從履歷資料庫中檢索最相似的幾句話。
    注意:
    - **此函數僅返回履歷中找到的原始內容,不會生成新答案**。
    - 若履歷中無相關資訊,則可能返回空結果。
    - **所有與個人履歷相關的問題(如「你在哪些公司工作過?」「請描述你的工作經歷」「你的學歷是?」),都應調用此函數進行檢索,而非讓模型自行生成答案**。
    參數:
    - `query (str)`: 使用者的提問,必須與履歷內容相關,例如:
    - 「你的學歷是什麼?」
    - 「你在哪些公司工作過?」
    - 「請用一句話描述你的工作經歷。」
    - 「你做過哪些專案?」
    - 「請提供你的作品集。」
    返回:
    - `list[str]`: 從履歷中檢索出的最相關句子(最多 5 條)。
    """

    print('!! retrieve_resume_info() is called !!')

    # Redirect Hugging Face caches to a writable location (useful when the
    # deployment filesystem is read-only); os is imported at module level
    os.environ["HF_HOME"] = "/tmp/huggingface"
    os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"


    # Paths and embedding model
    cv_path = "./CV.docx"
    vector_dir = "./vector"
    model_name = "BAAI/bge-large-zh-v1.5"

    # Model settings
    model_kwargs = {'device': 'cpu'}  # switch to 'cuda' if a GPU is available
    encode_kwargs = {'normalize_embeddings': False}  # whether to normalize vectors

    # Initialize the Hugging Face embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
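
    # Quick sanity check (hedged sketch; not required for retrieval):
    # embed_query() returns a plain list of floats -- 1024 dimensions for
    # BAAI/bge-large-zh-v1.5.
    #   vec = embeddings.embed_query("測試")   # "test"
    #   assert len(vec) == 1024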



    # Check if the vector database already exists
    if os.path.exists(vector_dir):
        db = Chroma(persist_directory=vector_dir,
                    embedding_function=embeddings)
    else:
        # Load the resume document
        loader = Docx2txtLoader(cv_path)
        data = loader.load()

        # Split the resume into small overlapping chunks so that Chroma
        # stores paragraphs rather than the entire document
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                       chunk_overlap=100)
        docs = text_splitter.split_documents(data)

        # Create the vector database
        db = Chroma.from_documents(docs,
                                   embeddings,
                                   persist_directory=vector_dir)
        db.persist()

    # Set up the retriever
    retriever = db.as_retriever(search_kwargs={"k": 5})

    # Perform retrieval
    retrieved_docs = retriever.get_relevant_documents(query)

    # Return only the retrieved text
    return [doc.page_content for doc in retrieved_docs]
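

if __name__ == "__main__":
    # Minimal smoke test -- a hedged sketch, not part of the module's API.
    # It assumes outbound network access, that ./CV.docx sits next to this
    # file, and that the embedding model can be downloaded on first run.
    for job in search_jobs("資料分析", num=3):  # "data analysis"
        print(job["title"], "|", job["company"], "|", job["salary"])

    # "Which companies have you worked for?"
    for passage in retrieve_resume_info("你在哪些公司工作過?"):
        print("-", passage)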