Question about transformers implementation

#2 by wangchuan98 - opened

Great work! I wanted to run embedding inference with the transformers framework, but the results differ noticeably from the Sentence-Transformers ones. My implementation code is below; could you tell me which part is wrong?

# Sentence-Transformers implementation
import torch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize  # assumed source of `normalize` used below (L2 normalization)

def get_prompteol_input(text: str) -> str:
    return f"This sentence: <|im_start|>“{text}” means in one word: “"

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery:{query}'

model = SentenceTransformer(
    "/home/jovyan/mnt-xck-notebook-zzzc/pretrained_models/Kingsoft-LLM/QZhou-Embedding-Zh",
    model_kwargs={"device_map": "cuda", "trust_remote_code": True, "torch_dtype": torch.bfloat16},
    tokenizer_kwargs={"padding_side": "left", "trust_remote_code": True},
    trust_remote_code=True,
)

task = "Given a web search query, retrieve relevant passages that answer the query"
queries = [
    get_prompteol_input(get_detailed_instruct(task, "光合作用是什么?")),
    get_prompteol_input(get_detailed_instruct(task, "电话是谁发明的?")),
]

documents = [
    get_prompteol_input("光合作用是绿色植物利用阳光、二氧化碳和水生成葡萄糖和氧气的过程。这一生化反应发生在叶绿体中。"),
    get_prompteol_input("亚历山大·格拉汉姆·贝尔(Alexander Graham Bell)因于1876年发明了第一台实用电话而广受认可,并为此设备获得了美国专利第174,465号。"),
]

query_embeddings = model.encode(queries, normalize_embeddings=False)
document_embeddings = model.encode(documents, normalize_embeddings=False)

dim = 1792  # embedding dimension to keep; options: 128, 256, 512, 768, 1024, 1280, 1536, 1792
query_embeddings = normalize(query_embeddings[:, :dim])
document_embeddings = normalize(document_embeddings[:, :dim])

similarity = model.similarity(query_embeddings, document_embeddings)
print(f'cos_sim:{similarity}')

# transformers implementation
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from safetensors.torch import load_file
from sklearn.preprocessing import normalize  # assumed source of `normalize` (L2 normalization)

model_path = '/home/jovyan/mnt-xck-notebook-zzzc/pretrained_models/Kingsoft-LLM/QZhou-Embedding-Zh'
llm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left')

# Manually re-create the 2_Dense projection (4096 -> 1792) as a plain nn.Linear
mlp = nn.Linear(4096, 1792, dtype=torch.bfloat16)
state_dict = load_file(f'{model_path}/2_Dense/model_bak.safetensors')
mlp.load_state_dict(state_dict)

llm = llm.to("cuda")
mlp = mlp.to("cuda")
query_inputs = tokenizer(queries, padding=True, truncation=True, return_tensors="pt").to("cuda")
doc_inputs = tokenizer(documents, padding=True, truncation=True, return_tensors="pt").to("cuda")

llm.eval()
mlp.eval()
with torch.no_grad():
    # Last-token pooling (padding_side='left', so index -1 is the final real token),
    # then the dense projection, then L2 normalization
    query_emb = normalize(mlp(llm(**query_inputs).last_hidden_state[:, -1]).detach().cpu().float().numpy())
    doc_emb = normalize(mlp(llm(**doc_inputs).last_hidden_state[:, -1]).detach().cpu().float().numpy())
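
(A quick way to see what the Sentence-Transformers pipeline actually computes, and therefore what the manual code above has to reproduce, is to print the loaded model. The sketch below assumes the `model` object from the first snippet and the usual Transformer -> Pooling -> Dense layout; the module indices are not confirmed for this particular checkpoint.)

# Sketch: inspect the Sentence-Transformers module stack (assumes `model` from the first snippet)
print(model)          # typically lists (0) Transformer, (1) Pooling(...), (2) Dense(...)

pooling = model[1]    # only valid if the layout really is Transformer -> Pooling -> Dense
dense = model[2]
print(pooling)        # check the pooling mode (last-token vs. mean, etc.)
print(dense)          # check in_features/out_features, bias and activation_function of the 1792-d projection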

Sorry, I only just found time to look at your code. I see your second implementation is done with the transformers framework, with the post-projection module written as a standalone Linear. In general, when an embedding model ships extra sentence_transformers modules (pooling, dense), we recommend loading it through the sentence_transformers pipeline itself, i.e. your first implementation. We haven't tried the second approach ourselves; my guess is that the dense tensor loading works differently. Please check whether those nn.Linear / load_file lines really match sentence_transformers' internal logic 😂😂😂
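
To illustrate the suggestion above: instead of hard-coding nn.Linear(4096, 1792), one can rebuild the projection from 2_Dense/config.json the way sentence_transformers itself describes it. This is a minimal, untested sketch, not the model authors' reference code; it assumes the usual Dense config keys (in_features, out_features, bias, activation_function) and that the safetensors file uses sentence_transformers' usual 'linear.weight' / 'linear.bias' key names (the model_bak.safetensors file above may already have renamed them, which is exactly the kind of difference worth checking).

# Sketch: rebuild the 2_Dense projection from its own config instead of a hard-coded nn.Linear
import json
import torch
import torch.nn as nn
from safetensors.torch import load_file

dense_dir = '/home/jovyan/mnt-xck-notebook-zzzc/pretrained_models/Kingsoft-LLM/QZhou-Embedding-Zh/2_Dense'
with open(f'{dense_dir}/config.json') as f:
    cfg = json.load(f)
print(cfg)  # a non-Identity activation_function here would already explain part of the mismatch

state = load_file(f'{dense_dir}/model_bak.safetensors')
print(list(state.keys()))  # sentence_transformers normally saves 'linear.weight' / 'linear.bias'

mlp = nn.Linear(cfg['in_features'], cfg['out_features'], bias=cfg.get('bias', True), dtype=torch.bfloat16)
mlp.load_state_dict({k.removeprefix('linear.'): v for k, v in state.items()})
# (If your sentence_transformers version exposes it, models.Dense.load(dense_dir) does the equivalent.)

If the key names, bias, and activation all line up with the manual code, the remaining suspect would be the pooling step rather than the dense projection.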
