Spaces:
Paused
Paused
lanny xu
committed on
Commit
·
be297c2
1
Parent(s):
db5bfaa
modify reranker
Browse files - reranker.py +22 -3
reranker.py
CHANGED
|
@@ -39,7 +39,12 @@ class TFIDFReranker(DocumentReranker):
|
|
| 39 |
def __init__(self):
|
| 40 |
super().__init__()
|
| 41 |
self.name = "TFIDFReranker"
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
def rerank(self, query: str, documents: List[dict], top_k: int = 5) -> List[Tuple[dict, float]]:
|
| 45 |
"""使用TF-IDF重新排序文档"""
|
|
@@ -77,8 +82,22 @@ class BM25Reranker(DocumentReranker):
|
|
| 77 |
self.b = b
|
| 78 |
|
| 79 |
def _tokenize(self, text: str) -> List[str]:
|
| 80 |
-
"""
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
def _compute_idf(self, documents: List[str], query_terms: List[str]) -> Dict[str, float]:
|
| 84 |
"""计算IDF值"""
|
|
|
|
| 39 |
def __init__(self):
|
| 40 |
super().__init__()
|
| 41 |
self.name = "TFIDFReranker"
|
| 42 |
+
# 移除 stop_words 以支持中文,使用 char_wb 分词器
|
| 43 |
+
self.vectorizer = TfidfVectorizer(
|
| 44 |
+
analyzer='char_wb', # 字符级分词,支持中文
|
| 45 |
+
ngram_range=(2, 4), # 2-4 字符 n-gram
|
| 46 |
+
max_features=5000
|
| 47 |
+
)
|
| 48 |
|
| 49 |
def rerank(self, query: str, documents: List[dict], top_k: int = 5) -> List[Tuple[dict, float]]:
|
| 50 |
"""使用TF-IDF重新排序文档"""
|
|
|
|
| 82 |
self.b = b
|
| 83 |
|
| 84 |
def _tokenize(self, text: str) -> List[str]:
|
| 85 |
+
"""
|
| 86 |
+
改进的分词,支持中英文
|
| 87 |
+
中文使用字符级分词,英文使用单词分词
|
| 88 |
+
"""
|
| 89 |
+
# 检测是否包含中文
|
| 90 |
+
has_chinese = any('\u4e00' <= char <= '\u9fff' for char in text)
|
| 91 |
+
|
| 92 |
+
if has_chinese:
|
| 93 |
+
# 中文:使用字符级 + 2-gram
|
| 94 |
+
chars = list(text.lower())
|
| 95 |
+
# 生成 unigram 和 bigram
|
| 96 |
+
tokens = chars + [chars[i] + chars[i+1] for i in range(len(chars)-1)]
|
| 97 |
+
return [t for t in tokens if t.strip()] # 移除空格
|
| 98 |
+
else:
|
| 99 |
+
# 英文:使用单词分词
|
| 100 |
+
return re.findall(r'\b\w+\b', text.lower())
|
| 101 |
|
| 102 |
def _compute_idf(self, documents: List[str], query_terms: List[str]) -> Dict[str, float]:
|
| 103 |
"""计算IDF值"""
|