"""
ๆ–‡ๅญ—่ฝฌๅ‘้‡็š„ๅ…ทไฝ“ๅฎž็Žฐๆญฅ้ชค๏ผˆไปฃ็ ๅฑ‚้ข๏ผ‰
ๅฑ•็คบ HuggingFace Embeddings ๅ†…้ƒจ็š„ๅฎž้™…ๆ“ไฝœ
"""
print("=" * 80)
print("ๆ–‡ๅญ— โ†’ ๅ‘้‡็š„ๅ…ทไฝ“ๅฎž็Žฐๆญฅ้ชค")
print("=" * 80)
# ============================================================================
# Preparation: simulating the complete vectorization process
# ============================================================================
print("\n" + "=" * 80)
print("🔧 Preparation: install and import the required libraries")
print("=" * 80)
print("""
Required libraries:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
pip install transformers torch sentence-transformers

Imports:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
""")
# ============================================================================
# Step 1: Load the model and tokenizer
# ============================================================================
print("\n" + "=" * 80)
print("Step 1: Load the pretrained model and tokenizer")
print("=" * 80)
print("""
Code:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
from transformers import AutoTokenizer, AutoModel

model_name = "sentence-transformers/all-MiniLM-L6-v2"

# 1. Load the tokenizer (handles text → IDs)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Load the model (handles IDs → vectors)
model = AutoModel.from_pretrained(model_name)
model.eval()  # evaluation mode (no training)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

What do these two objects do?
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Tokenizer:
├─ Vocabulary: 30,000+ entries
│  e.g. {"hello": 1, "world": 2, "machine": 3456, ...}
└─ Tokenization rules: how to split text into tokens

Model:
├─ Embedding layer: vocabulary → initial vectors
│  a 30,522 × 384 matrix (one 384-dim vector per vocabulary entry)
├─ Transformer layers: 6 BERT encoder layers
│  each with Self-Attention + Feed Forward
└─ Parameter count: ~22M (22 million numbers)
""")
# ============================================================================
# Step 2: Tokenization
# ============================================================================
print("\n" + "=" * 80)
print("Step 2: Tokenization - text to Token IDs")
print("=" * 80)
print("""
Input text:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
text = "Machine learning is a subset of artificial intelligence"

Code:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Tokenize and convert to the model's input format
encoded_input = tokenizer(
    text,
    padding=True,        # pad to the same length
    truncation=True,     # truncate if too long
    max_length=512,      # maximum length
    return_tensors='pt'  # return PyTorch tensors
)
print(encoded_input)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Output (encoded_input contains):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{
    'input_ids': tensor([[
        101,   # [CLS] special token
        3698,  # "machine"
        4083,  # "learning"
        2003,  # "is"
        1037,  # "a"
        2042,  # "subset"
        1997,  # "of"
        7976,  # "artificial"
        4454,  # "intelligence"
        102    # [SEP] special token
    ]]),
    'attention_mask': tensor([[
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1  # all positions are real (1 = attend, 0 = ignore)
    ]])
}

Detailed explanation:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
input_ids:
    Each number corresponds to one token
    101  = [CLS] (start-of-sentence marker)
    3698 = "machine"
    102  = [SEP] (end-of-sentence marker)

attention_mask:
    Tells the model which positions are real content (1) and which are padding (0)
    e.g. [1, 1, 1, 0, 0] means the first 3 positions are real tokens and the last 2 are padding
""")
# ============================================================================
# Step 3: Get initial vectors from the Embedding layer
# ============================================================================
print("\n" + "=" * 80)
print("Step 3: Token IDs → initial vectors (Embedding layer)")
print("=" * 80)
print("""
This step happens inside the model:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
input_ids = [101, 3698, 4083, 2003, ...]
        ↓
Embedding table lookup
        ↓

Embedding table (simplified):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
This is a huge matrix: 30,522 × 384
(30,522 is the vocabulary size, 384 is the vector dimension)

  ID   | dim 1  dim 2  dim 3  ...  dim 384
  ─────────────────────────────────────────
  101  |  0.12  -0.34   0.56  ...   0.78    ← [CLS]
  3698 |  0.23   0.45  -0.67  ...   0.89    ← "machine"
  4083 |  0.34  -0.56   0.78  ...  -0.90    ← "learning"
  2003 |  0.45   0.67  -0.89  ...   0.12    ← "is"
  ...

Lookup process (like a dictionary lookup):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
ID 101  → look up → [0.12, -0.34, 0.56, ..., 0.78]
ID 3698 → look up → [0.23, 0.45, -0.67, ..., 0.89]
ID 4083 → look up → [0.34, -0.56, 0.78, ..., -0.90]
...

Result:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
token_embeddings = [
    [0.12, -0.34, 0.56, ..., 0.78],   # [CLS]
    [0.23, 0.45, -0.67, ..., 0.89],   # "machine"
    [0.34, -0.56, 0.78, ..., -0.90],  # "learning"
    [0.45, 0.67, -0.89, ..., 0.12],   # "is"
    ...
]
Shape: (10, 384)  # 10 tokens, 384 dimensions each

⚠️ Note: these are NOT the final vectors yet! They still have to go through the Transformer!
""")
# ============================================================================
# Step 4: Transformer processing (the core!)
# ============================================================================
print("\n" + "=" * 80)
print("Step 4: Transformer processing - Self-Attention (the core step)")
print("=" * 80)
print("""
Code:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
with torch.no_grad():  # no gradients (we are not training)
    outputs = model(**encoded_input)

# outputs.last_hidden_state is the Transformer's output
token_embeddings = outputs.last_hidden_state
print(token_embeddings.shape)  # torch.Size([1, 10, 384])
                               #             batch tokens dim
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

What does the Transformer do internally? (6 layers of processing)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Input: initial embeddings
    [CLS]:    [0.12, -0.34, 0.56, ...]
    machine:  [0.23, 0.45, -0.67, ...]
    learning: [0.34, -0.56, 0.78, ...]
    is:       [0.45, 0.67, -0.89, ...]
    ...
        ↓
┌────────────────────────────────────────────────────────┐
│  Layer 1: Self-Attention                               │
│  ────────────────────────────────────────────────────  │
│                                                        │
│  Each token "looks at" every other token and updates   │
│  its own vector:                                       │
│                                                        │
│  "machine" sees "learning"    → part of one phrase     │
│  "learning" sees "artificial" → related to AI          │
│  "is" sees its neighbours     → a linking word         │
│                                                        │
│  The updated vectors now carry context information     │
└────────────────────────────────────────────────────────┘
        ↓
┌────────────────────────────────────────────────────────┐
│  Layer 2: Self-Attention                               │
│  ────────────────────────────────────────────────────  │
│  Understanding deepens...                              │
│  "machine learning" is now understood as one concept   │
└────────────────────────────────────────────────────────┘
        ↓
    ... (Layers 3, 4, 5) ...
        ↓
┌────────────────────────────────────────────────────────┐
│  Layer 6: Self-Attention (last layer)                  │
│  ────────────────────────────────────────────────────  │
│  Each token's vector now encodes:                      │
│    - its own meaning                                   │
│    - its context                                       │
│    - the meaning of the whole sentence                 │
└────────────────────────────────────────────────────────┘
        ↓
Final output:
    [CLS]:    [0.234, 0.567, -0.890, ...]  # updated, carries whole-sentence info
    machine:  [0.345, -0.678, 0.123, ...]  # carries info about "learning"
    learning: [0.456, 0.789, -0.234, ...]  # carries info about "machine"
    ...
Shape: (1, 10, 384)
        batch tokens dim
""")
# ============================================================================
# Step 5: Mean Pooling - merge into a single sentence vector
# ============================================================================
print("\n" + "=" * 80)
print("Step 5: Mean Pooling - merge many token vectors into one sentence vector")
print("=" * 80)
print("""
Problem: we now have 10 tokens, each with its own vector.
How do we turn them into 1 sentence vector?
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Code:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
def mean_pooling(token_embeddings, attention_mask):
    \"\"\"
    Average all token vectors (taking attention_mask into account)
    \"\"\"
    # token_embeddings: (1, 10, 384)
    # attention_mask:   (1, 10)

    # Expand the mask dimensions to match the embeddings
    # (1, 10) → (1, 10, 1) → (1, 10, 384)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(
        token_embeddings.size()
    ).float()

    # Multiply embeddings by the mask (ignoring padded positions),
    # then sum over all tokens
    sum_embeddings = torch.sum(
        token_embeddings * input_mask_expanded,
        dim=1  # sum over the token dimension
    )

    # Count the number of valid tokens
    sum_mask = torch.clamp(
        input_mask_expanded.sum(dim=1),
        min=1e-9  # avoid division by zero
    )

    # Average
    mean_embeddings = sum_embeddings / sum_mask
    return mean_embeddings

# Usage
sentence_embedding = mean_pooling(
    token_embeddings,
    encoded_input['attention_mask']
)
print(sentence_embedding.shape)  # torch.Size([1, 384])
                                 #             batch dim
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Concrete calculation (simplified example):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
10 token vectors, 384 dimensions each:
Token 1:  [0.234, 0.567, -0.890, ..., 0.123]
Token 2:  [0.345, -0.678, 0.123, ..., 0.234]
Token 3:  [0.456, 0.789, -0.234, ..., 0.345]
...
Token 10: [0.567, 0.890, 0.345, ..., 0.456]

Average (each dimension averaged independently):
dim 1:   (0.234 + 0.345 + 0.456 + ... + 0.567) / 10 = 0.412
dim 2:   (0.567 - 0.678 + 0.789 + ... + 0.890) / 10 = 0.523
dim 3:   (-0.890 + 0.123 - 0.234 + ... + 0.345) / 10 = -0.089
...
dim 384: (0.123 + 0.234 + 0.345 + ... + 0.456) / 10 = 0.289

Sentence vector = [0.412, 0.523, -0.089, ..., 0.289]  (384 dimensions)
""")
# ============================================================================
# Step 6: Normalization
# ============================================================================
print("\n" + "=" * 80)
print("Step 6: L2 normalization - scale the vector to length 1")
print("=" * 80)
print("""
Code:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
import torch.nn.functional as F

# L2 normalization
sentence_embedding = F.normalize(
    sentence_embedding,
    p=2,    # L2 norm
    dim=1   # normalize along the feature dimension
)
print(sentence_embedding.shape)  # torch.Size([1, 384])
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

What normalization does:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Vector before normalization:
v = [0.412, 0.523, -0.089, ..., 0.289]
Length ||v|| = √(0.412² + 0.523² + ... + 0.289²) = 2.37

Vector after normalization:
v_norm = v / ||v||
v_norm = [0.412/2.37, 0.523/2.37, ..., 0.289/2.37]
       = [0.174, 0.221, -0.038, ..., 0.122]
Length ||v_norm|| = 1 ✓

Benefits:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✅ All vectors have the same length (1), so they are easy to compare
✅ Cosine similarity = dot product (cheaper to compute)
     cos_sim(a, b) = a·b / (||a|| × ||b||)
     after normalization: cos_sim(a, b) = a·b  ← simplified!
✅ The vector's length no longer matters, only its direction
""")
# ============================================================================
# Step 7: Final output
# ============================================================================
print("\n" + "=" * 80)
print("Step 7: The final sentence vector")
print("=" * 80)
print("""
Final result:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Convert to a numpy array (easier to work with)
final_vector = sentence_embedding.cpu().numpy()[0]
print(final_vector.shape)  # (384,)
print(final_vector[:5])    # first 5 numbers
# [0.174, 0.221, -0.038, 0.095, 0.312]
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

This is the final sentence vector!
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Input:  "Machine learning is a subset of artificial intelligence"
Output: [0.174, 0.221, -0.038, ..., 0.122]  (384 numbers)

This vector encodes:
✅ the meaning of each word
✅ the relationships between the words
✅ the meaning of the whole sentence

It can be used to:
✅ compute similarity with other sentences (see the sketch below)
✅ store in a vector database
✅ run semantic search
""")
# ============================================================================
# Complete code summary
# ============================================================================
print("\n" + "=" * 80)
print("📝 Complete code summary (actually runnable)")
print("=" * 80)
print("""
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np

def text_to_vector(text):
    \"\"\"
    The complete text-to-vector pipeline
    \"\"\"
    # Step 1: load the model
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    # Step 2: tokenize
    encoded_input = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    # Steps 3 & 4: run the model (Embedding + Transformer)
    with torch.no_grad():
        outputs = model(**encoded_input)
    token_embeddings = outputs.last_hidden_state

    # Step 5: mean pooling
    attention_mask = encoded_input['attention_mask']
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(
        token_embeddings.size()
    ).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, dim=1)
    sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    sentence_embedding = sum_embeddings / sum_mask

    # Step 6: normalize
    sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)

    # Step 7: convert to numpy
    return sentence_embedding.cpu().numpy()[0]

# Usage example:
text = "Machine learning is a subset of artificial intelligence"
vector = text_to_vector(text)
print(f"Input: {text}")
print(f"Vector shape: {vector.shape}")             # (384,)
print(f"First 10 numbers: {vector[:10]}")
print(f"Vector length: {np.linalg.norm(vector)}")  # should be 1.0
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The simplified call in your project:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vector = embeddings.embed_query(text)
# ↑ this one line runs all 7 steps above internally!
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
""")
# ============================================================================
# Timing breakdown of the key steps
# ============================================================================
print("\n" + "=" * 80)
print("⏱️ How long each step takes")
print("=" * 80)
print("""
Processing one sentence (10 tokens), roughly:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Step 1: load the model        0.5-2 s    (only once, then reused)
Step 2: tokenize              <1 ms      (very fast)
Step 3: embedding lookup      <1 ms      (matrix indexing)
Step 4: Transformer forward   10-50 ms   (6 layers of computation, the slowest part)
Step 5: mean pooling          <1 ms      (simple average)
Step 6: normalization         <1 ms      (simple division)
Step 7: format conversion     <1 ms

Total (excluding the one-time model load): 10-50 ms (GPU) or 50-200 ms (CPU)

Batch processing (20 sentences):
One at a time: 20 × 50 ms = 1000 ms
In one batch:  ~100 ms  ← about 10× faster! (GPU parallelism)

That is why vectorization should be done in batches!
""")
print("\n" + "=" * 80)
print("โœ… ๆ–‡ๅญ—่ฝฌๅ‘้‡็š„ๅฎž็Žฐๆญฅ้ชค่ฎฒ่งฃๅฎŒๆฏ•๏ผ")
print("=" * 80)
print("""
ๆ ธๅฟƒๆญฅ้ชคๅ›ž้กพ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ๆ–‡ๅญ—
โ†“ Step 1: ๅŠ ่ฝฝๆจกๅž‹
Tokenizer + Model
โ†“ Step 2: ๅˆ†่ฏ
Token IDs: [101, 3698, 4083, ...]
โ†“ Step 3: Embedding ๆŸฅ่กจ
ๅˆๅง‹ๅ‘้‡: [(10, 384)]
โ†“ Step 4: Transformer ๅค„็†
ๆ›ดๆ–ฐๅ‘้‡: [(10, 384)] ๅŒ…ๅซไธŠไธ‹ๆ–‡ไฟกๆฏ
โ†“ Step 5: Mean Pooling
ๅฅๅญๅ‘้‡: [(1, 384)]
โ†“ Step 6: ๅฝ’ไธ€ๅŒ–
ๅฝ’ไธ€ๅŒ–ๅ‘้‡: [(1, 384)] ้•ฟๅบฆ=1
โ†“ Step 7: ่พ“ๅ‡บ
ๆœ€็ปˆๅ‘้‡: [0.174, 0.221, ..., 0.122]
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
็Žฐๅœจไฝ ็Ÿฅ้“ไบ†ๆฏไธ€ๆญฅ็š„ๅ…ทไฝ“ๆ“ไฝœ๏ผ
""")
print()