# 向量重排(Vector Reranking)原理详解
## 🎯 什么是向量重排
向量重排是检索增强生成(RAG)系统中的一种高级技术,用于在初始向量检索后对候选文档进行重新排序,以提高最终检索结果的质量和相关性。
## 🔍 为什么需要重排
### 初始检索的局限性
1. **语义距离偏差**
- 向量相似度可能无法完全捕捉语义相关性
- 某些相关文档可能因为表达方式不同而排名靠后
2. **上下文理解不足**
- 简单的余弦相似度无法理解复杂的查询意图
- 缺乏对查询和文档交互关系的深度理解
3. **多样性问题**
- 初始检索可能返回内容相似的重复文档
- 缺乏结果的多样性和全面性
## 🧠 重排的核心原理
### 1. 双阶段检索架构
```
查询 → 粗排(向量检索) → 精排(重排模型) → 最终结果
            ↓                  ↓
        召回候选集          重新排序打分
       (100-1000篇)         (选择前k篇)
```
### 2. 重排模型类型
#### A. 交叉编码器(Cross-Encoder)
```python
# Schematic illustration of the principle
def cross_encoder_rerank(query, documents):
    """Rank documents by jointly encoding each (query, document) pair.

    NOTE(review): `model` is assumed to be a cross-encoder in scope that maps
    the paired input text to a scalar relevance score — not defined here.
    """
    scores = []
    for doc in documents:
        # Query and document are encoded together in one input, so the
        # model can attend across both texts.
        input_text = f"[CLS] {query} [SEP] {doc} [SEP]"
        score = model(input_text)  # directly outputs a relevance score
        scores.append(score)
    # Highest relevance first
    return sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
```
#### B. 双编码器重排(Bi-Encoder Reranking)
```python
def bi_encoder_rerank(query, documents):
    """Rank documents by encoding query and documents independently.

    NOTE(review): `query_encoder`, `doc_encoder` and `complex_similarity`
    are assumed to be in scope — not defined in this snippet.
    """
    query_embedding = query_encoder(query)
    doc_embeddings = [doc_encoder(doc) for doc in documents]
    # Use a more sophisticated similarity than plain cosine
    scores = []
    for doc_emb in doc_embeddings:
        score = complex_similarity(query_embedding, doc_emb)
        scores.append(score)
    # Highest relevance first
    return sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
```
## 🔬 重排算法详解
### 1. 基于机器学习的重排
#### Learning to Rank (LTR)
```python
class LearnToRankReranker:
    """Learning-to-Rank reranker driven by hand-engineered query-document features."""

    def __init__(self):
        # Trained ranking model, e.g. XGBoost or LambdaMART; set before use.
        self.model = None

    def extract_features(self, query, document):
        """Build the feature vector for one query-document pair.

        NOTE(review): `query_emb`/`doc_emb` are not defined in this snippet —
        in real code compute the embeddings here before scoring.
        """
        features = [
            # Text-matching features
            jaccard_similarity(query, document),
            tf_idf_score(query, document),
            bm25_score(query, document),
            # Semantic features
            cosine_similarity(query_emb, doc_emb),
            bert_score(query, document),
            # Document features
            document_length(document),
            document_quality_score(document),
            # Query features
            query_complexity(query),
            query_type_classification(query),
        ]
        return features

    def rerank(self, query, documents):
        """Score every candidate with the trained model and sort descending."""
        features_matrix = [self.extract_features(query, doc) for doc in documents]
        scores = self.model.predict(features_matrix)
        return sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
```
### 2. 基于深度学习的重排
#### Transformer重排模型
```python
class TransformerReranker:
    """Cross-encoder reranker backed by a HuggingFace sequence-classification model.

    NOTE(review): the default checkpoint "microsoft/DialoGPT-medium" is a
    dialogue LM, not a reranking model — a cross-encoder checkpoint such as
    "cross-encoder/ms-marco-MiniLM-L-6-v2" would be a more realistic default.
    Kept unchanged here to preserve the snippet's interface.
    """

    def __init__(self, model_name="microsoft/DialoGPT-medium"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def rerank(self, query, documents, top_k=5):
        """Return the top_k (document, score) pairs, most relevant first."""
        scores = []
        for doc in documents:
            # Build a paired (query, document) input for the cross-encoder
            inputs = self.tokenizer(
                query, doc,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            # Relevance score = softmax probability of the positive class
            with torch.no_grad():
                outputs = self.model(**inputs)
                score = torch.softmax(outputs.logits, dim=-1)[0][1]
            scores.append(score.item())
        # Re-sort candidates by score, descending
        ranked_results = sorted(
            zip(documents, scores),
            key=lambda x: x[1],
            reverse=True,
        )
        return ranked_results[:top_k]
```
### 3. 多策略融合重排
```python
class MultiStrategyReranker:
    """Fuse semantic, lexical, diversity and freshness signals with fixed weights.

    The four weights sum to 1.0; each compute_* method is expected to return
    one score per document, aligned with the `documents` list.
    """

    def __init__(self):
        self.semantic_weight = 0.4
        self.lexical_weight = 0.3
        self.diversity_weight = 0.2
        self.freshness_weight = 0.1

    def rerank(self, query, documents):
        """Return (document, fused_score) pairs sorted by score, descending."""
        # 1. Semantic relevance scores
        semantic_scores = self.compute_semantic_scores(query, documents)
        # 2. Lexical match scores
        lexical_scores = self.compute_lexical_scores(query, documents)
        # 3. Diversity scores (documents only — query-independent)
        diversity_scores = self.compute_diversity_scores(documents)
        # 4. Freshness scores
        freshness_scores = self.compute_freshness_scores(documents)
        # 5. Weighted linear fusion of the four signals
        final_scores = [
            self.semantic_weight * semantic_scores[i]
            + self.lexical_weight * lexical_scores[i]
            + self.diversity_weight * diversity_scores[i]
            + self.freshness_weight * freshness_scores[i]
            for i in range(len(documents))
        ]
        return sorted(zip(documents, final_scores), key=lambda x: x[1], reverse=True)
```
## 🎛️ 重排特征工程
### 1. 文本匹配特征
```python
def extract_text_features(query, document):
    """Lexical-matching features for a query-document pair.

    NOTE(review): helpers (`exact_match_count`, `fuzz`, `ngram_overlap`,
    `tfidf_cosine_similarity`, `compute_bm25`) are not defined in this snippet.
    """
    return {
        # Exact token matches, normalised by query length
        'exact_match_ratio': exact_match_count(query, document) / len(query.split()),
        # Fuzzy string similarity, rescaled from [0, 100] to [0, 1]
        'fuzzy_match_score': fuzz.ratio(query, document) / 100,
        # N-gram overlap
        'bigram_overlap': ngram_overlap(query, document, n=2),
        'trigram_overlap': ngram_overlap(query, document, n=3),
        # TF-IDF cosine similarity
        'tfidf_similarity': tfidf_cosine_similarity(query, document),
        # BM25 score
        'bm25_score': compute_bm25(query, document),
    }
```
### 2. 语义特征
```python
def extract_semantic_features(query, document, embeddings):
    """Embedding-based semantic features for a query-document pair.

    `embeddings` is expected to hold precomputed vectors under the keys
    'query' and 'document' (raises KeyError otherwise).
    """
    query_emb = embeddings['query']
    doc_emb = embeddings['document']
    return {
        # Cosine similarity
        'cosine_similarity': cosine_sim(query_emb, doc_emb),
        # Euclidean (L2) distance
        'euclidean_distance': euclidean_distance(query_emb, doc_emb),
        # Manhattan (L1) distance
        'manhattan_distance': manhattan_distance(query_emb, doc_emb),
        # BERTScore F1 over the raw texts
        'bert_score': bert_score_f1(query, document),
        # Angle between the two embedding vectors
        'semantic_angle': semantic_angle(query_emb, doc_emb),
    }
```
### 3. 文档质量特征
```python
def extract_quality_features(document):
    """Document-quality features independent of the query.

    NOTE(review): relies on `sent_tokenize` (nltk), `textstat`, and a
    `stopword_count` helper that are not defined in this snippet; an empty
    document would divide by zero — guard at the call site.
    """
    # Tokenise once instead of re-splitting per feature
    words = document.split()
    return {
        # Length features
        'doc_length': len(words),
        'sentence_count': len(sent_tokenize(document)),
        # Readability features
        'readability_score': textstat.flesch_reading_ease(document),
        'complexity_score': textstat.flesch_kincaid_grade(document),
        # Information density
        'unique_word_ratio': len(set(words)) / len(words),
        'stopword_ratio': stopword_count(document) / len(words),
        # Structure features (Markdown headers / list bullets)
        'has_headers': bool(re.search(r'^#+\s', document, re.MULTILINE)),
        'has_lists': bool(re.search(r'^\s*[-*+]\s', document, re.MULTILINE)),
    }
```
## 🚀 实际应用示例
### 集成到RAG系统中
```python
class AdaptiveRAGWithReranking:
    """RAG pipeline with a reranking stage between retrieval and generation.

    NOTE(review): `calculate_confidence` is referenced but not defined in
    this snippet — it must be provided elsewhere.
    """

    def __init__(self):
        self.initial_retriever = VectorRetriever()
        self.reranker = TransformerReranker()
        self.generator = LanguageModel()

    def query(self, question, top_k=5, rerank_candidates=20):
        """Answer `question`: over-fetch rerank_candidates docs, keep top_k after reranking."""
        # 1. Initial retrieval — fetch more candidates than needed so the
        #    reranker has something to choose from
        initial_docs = self.initial_retriever.retrieve(
            question,
            top_k=rerank_candidates,
        )
        # 2. Rerank down to the final context set
        reranked_docs = self.reranker.rerank(
            question,
            initial_docs,
            top_k=top_k,
        )
        # 3. Generate the answer from the reranked context;
        #    the reranker returns (document, score) pairs, doc[0] is the text
        context = "\n\n".join([doc[0] for doc in reranked_docs])
        answer = self.generator.generate(question, context)
        return {
            'answer': answer,
            'sources': reranked_docs,
            'confidence': self.calculate_confidence(reranked_docs),
        }
```
## 📊 性能评估指标
### 1. 排序质量指标
```python
def evaluate_reranking(original_ranking, reranked_results, ground_truth):
    """Ranking-quality metrics comparing the reranked list against ground truth.

    NOTE(review): the metric helpers (`ndcg_score`, `mean_average_precision`,
    `mean_reciprocal_rank`, `kendall_tau`) are not defined in this snippet.
    """
    return {
        # NDCG (Normalized Discounted Cumulative Gain) at cut-offs 5 and 10
        'ndcg@5': ndcg_score(ground_truth, reranked_results, k=5),
        'ndcg@10': ndcg_score(ground_truth, reranked_results, k=10),
        # MAP (Mean Average Precision)
        'map': mean_average_precision(ground_truth, reranked_results),
        # MRR (Mean Reciprocal Rank)
        'mrr': mean_reciprocal_rank(ground_truth, reranked_results),
        # Rank-order agreement between the pre- and post-rerank orderings
        'ranking_improvement': kendall_tau(original_ranking, reranked_results),
    }
```
### 2. 端到端效果评估
```python
def evaluate_rag_with_reranking(test_questions, ground_truth_answers):
    """Compare answer quality with and without reranking over a test set.

    Returns one dict per question with both scores and their difference.
    NOTE(review): `rag_without_rerank`, `rag_with_rerank` and
    `evaluate_answer` are not defined in this snippet.
    """
    results = []
    for question, gt_answer in zip(test_questions, ground_truth_answers):
        # Score each pipeline once (the original snippet re-evaluated the
        # reranked answer twice when computing the improvement)
        original_score = evaluate_answer(rag_without_rerank(question), gt_answer)
        reranked_score = evaluate_answer(rag_with_rerank(question), gt_answer)
        results.append({
            'question': question,
            'original_score': original_score,
            'reranked_score': reranked_score,
            'improvement': reranked_score - original_score,
        })
    return results
```
## 💡 最佳实践
### 1. 重排策略选择
- **实时性要求高**: 使用轻量级规则或简单ML模型
- **精度要求高**: 使用深度学习重排模型
- **平衡性能**: 多策略融合 + 缓存优化
### 2. 特征选择原则
- **相关性特征**: 语义相似度、词汇匹配
- **质量特征**: 文档权威性、完整性
- **多样性特征**: 避免结果冗余
- **时效性特征**: 信息新鲜度
### 3. 系统优化
```python
class OptimizedReranker:
    """Sketch of throughput optimisations: result caching and batched scoring."""

    def __init__(self):
        # NOTE(review): `LRUCache` is not defined in this snippet
        self.cache = LRUCache(maxsize=1000)
        self.batch_size = 32

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # every instance alive for the cache's lifetime (ruff B019); a
    # per-instance cache or module-level function would be safer in real code.
    @lru_cache(maxsize=1000)
    def cached_rerank(self, query_hash, doc_hashes):
        """Cache rerank results keyed by hashes of the query and documents."""
        pass

    def batch_rerank(self, queries, documents):
        """Rerank many queries at once to amortise per-call model overhead."""
        pass
```
向量重排是提升RAG系统检索精度的关键技术,通过多层次的相关性评估和智能排序,显著提高了最终答案的质量和准确性。