Spaces:

jonathanjordan21
/

CV_Parser

Sleeping

App Files Files Community

jonathanjordan21 committed on Sep 17

Commit

6c402fd

verified ·

1 Parent(s): 5ab73a5

Create parser.py

Browse files

Files changed (1) hide show

parser.py +66 -0

parser.py ADDED Viewed

	@@ -0,0 +1,66 @@

+from transformers import AutoModelForCausalLM, AutoTokenizer
+import json
+# Hugging Face checkpoint id used for both the tokenizer and the causal LM.
+model_name = "Qwen/Qwen3-0.6B"
+
+# Tokenizer and model are created once, at import time; parse_resume() reads
+# them as module-level globals.
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
+
+# Field schema written as (name, type, description) rows and expanded into the
+# list-of-dicts shape that parse_resume() indexes by "name", "type" and
+# "description". Presumably callers pass `parsing` as that function's first
+# argument -- confirm against the app code.
+_FIELD_ROWS = (
+    ("education", "List[str]", "attended school, university, and other education programs"),
+    ("experience", "float", "years of experience"),
+    ("skills", "List[str]", "list of skills"),
+    ("name", "str", "name of the person"),
+    ("location", "str", "location of the person"),
+    ("email", "str", "email of the person"),
+    ("websites", "List[str]", "urls related of the person"),
+    ("certifications", "List[str]", "list of certifications"),
+    ("languages", "List[str]", "list of languages"),
+    ("projects", "List[str]", "list of projects"),
+    ("note", "str", "additional note which highlight the best or uniqueness of the person"),
+)
+parsing = [
+    {"name": field, "type": kind, "description": about}
+    for field, kind, about in _FIELD_ROWS
+]
+def _load_json_object(text):
+    """Parse ``text`` as JSON, tolerating a markdown fence or prose around the object.
+
+    Strict ``json.loads`` is tried first. If that fails, the span from the
+    first ``{`` to the last ``}`` is parsed instead.
+
+    Raises:
+        json.JSONDecodeError: if neither attempt yields valid JSON.
+    """
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        start, end = text.find("{"), text.rfind("}")
+        if start == -1 or end < start:
+            # Nothing that looks like an object: surface the original error.
+            raise
+        return json.loads(text[start:end + 1])
+
+
+def parse_resume(parsing, resume, *, max_new_tokens=32768):
+    """Ask the module-level chat model to extract structured fields from a resume.
+
+    Args:
+        parsing: iterable of dicts, each with "name", "type" and "description"
+            keys, describing the fields the model should return.
+        resume: resume text; it is appended verbatim to the prompt.
+        max_new_tokens: upper bound on generated tokens, forwarded to
+            ``model.generate``. Defaults to the previously hard-coded 32768.
+
+    Returns:
+        A ``(thinking_content, parsed)`` tuple: the decoded text up to and
+        including the last ``</think>`` token (empty string if that token
+        never appears), and the JSON value decoded from the remaining output.
+
+    Raises:
+        json.JSONDecodeError: if the model's answer contains no parseable JSON.
+    """
+    # One "name : type = description" line per field. The lines are joined into
+    # plain text; interpolating the list itself would put its repr (brackets,
+    # quotes, escaped "\n") into the prompt.
+    schema = "\n    ".join(
+        f"{x['name']} : {x['type']} = {x['description']}" for x in parsing
+    )
+    prompt = f"""Based on the below resume, tell me the summary details of skills, name, experience years, education, etc in short
+    The Output must be the JSON object with the following format:
+    {schema}
+    RESUME:\n""" + resume
+    messages = [
+        {"role": "user", "content": prompt}
+    ]
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
+    )
+    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+    # conduct text completion
+    generated_ids = model.generate(
+        **model_inputs,
+        max_new_tokens=max_new_tokens
+    )
+    # Drop the prompt tokens so only newly generated ids remain.
+    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+    # Token id of "</think>" in this tokenizer's vocabulary.
+    think_end_id = 151668
+    try:
+        # Reverse search: index just past the LAST occurrence of </think>.
+        index = len(output_ids) - output_ids[::-1].index(think_end_id)
+    except ValueError:
+        # No thinking section was emitted; treat the whole output as the answer.
+        index = 0
+    thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
+    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
+    return thinking_content, _load_json_object(content)