Spaces:
Paused
Paused
"""
Test the CrossEncoder reranking functionality.
Compare the results of Bi-Encoder vs CrossEncoder.
"""
| from reranker import create_reranker, TFIDFReranker, BM25Reranker, SemanticReranker, CrossEncoderReranker | |
class MockDoc:
    """Mock document class.

    Carries only the two attributes assigned in ``__init__``:
    ``page_content`` and ``metadata``.
    """

    def __init__(self, content, metadata=None):
        """Store *content* as ``page_content`` and *metadata* as ``metadata``.

        A falsy *metadata* (``None``, an empty dict, ...) is replaced by a
        fresh empty dict.
        """
        self.metadata = metadata if metadata else {}
        self.page_content = content
class MockEmbeddings:
    """Mock Embeddings class (for the Semantic Reranker).

    Produces deterministic vectors from character code points; intended for
    tests only.
    """

    # Number of components in every vector returned by this mock.
    DIM = 10

    def embed_query(self, text):
        """Return a ``DIM``-length vector derived from the start of *text*.

        Each of the first ``DIM`` characters contributes ``ord(char) / 100.0``.
        Shorter texts (including the empty string) are zero-padded so that
        every vector has the same length and vectors stay comparable.
        """
        # Simple character-level vectorization (for testing only).
        vector = [ord(ch) / 100.0 for ch in text[:self.DIM]]
        vector.extend([0.0] * (self.DIM - len(vector)))
        return vector

    def embed_documents(self, texts):
        """Return one ``embed_query`` vector per entry of *texts*, in order."""
        return [self.embed_query(text) for text in texts]
def create_test_documents():
    """Create the test document set.

    Returns a new list of ``MockDoc`` objects, one per sample text, in the
    order the texts are listed below.
    """
    sample_texts = (
        "ไบบๅทฅๆบ่ฝๆฏ่ฎก็ฎๆบ็งๅญฆ็ไธไธชๅๆฏ๏ผ่ดๅไบๅๅปบ่ฝๅคๆง่ก้ๅธธ้่ฆไบบ็ฑปๆบ่ฝ็ไปปๅก็็ณป็ปใ",
        "ๆบๅจๅญฆไน ๆฏไบบๅทฅๆบ่ฝ็ๅญ้ขๅ๏ผไธๆณจไบ่ฎฉ่ฎก็ฎๆบไปๆฐๆฎไธญๅญฆไน ๅนถๆน่ฟใ",
        "ๆทฑๅบฆๅญฆไน ไฝฟ็จๅคๅฑ็ฅ็ป็ฝ็ปๆฅๅค็ๅคๆ็ๆฐๆฎๆจกๅผ๏ผๆฏๆบๅจๅญฆไน ็ไธ็งๆนๆณใ",
        "่ช็ถ่ฏญ่จๅค็๏ผNLP๏ผๆฏไบบๅทฅๆบ่ฝ็ไธไธชๅๆฏ๏ผๅค็่ฎก็ฎๆบไธไบบ็ฑป่ฏญ่จไน้ด็ไบคไบใ",
        "่ฎก็ฎๆบ่ง่งๆฏไบบๅทฅๆบ่ฝ็ๅฆไธไธช้่ฆ้ขๅ๏ผไฝฟๆบๅจ่ฝๅค็่งฃๅ่งฃ้่ง่งไฟกๆฏใ",
        "ไปๅคฉๅคฉๆฐๅพๅฅฝ๏ผ้ๅๅบๅปๆฃๆญฅๅ่ฟๅจใ",
        "Python ๆฏไธ็ง้ซ็บง็ผ็จ่ฏญ่จ๏ผ็ฑ Guido van Rossum ๅจ 1991 ๅนดๅๅปบใ",
        "RAG๏ผๆฃ็ดขๅขๅผบ็ๆ๏ผๆฏไธ็ง็ปๅไฟกๆฏๆฃ็ดขๅๆๆฌ็ๆ็ๆๆฏใ",
    )
    return [MockDoc(text) for text in sample_texts]
def test_tfidf_reranking():
    """Test TF-IDF reranking.

    Builds a ``TFIDFReranker``, reranks the sample documents for a fixed
    query with ``top_k=3``, and prints each returned ``(doc, score)`` pair.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("๐ ๆต่ฏ TF-IDF ้ๆ")
    print(banner)

    query = "ไปไนๆฏไบบๅทฅๆบ่ฝๅๆบๅจๅญฆไน ๏ผ"
    documents = create_test_documents()
    ranked = TFIDFReranker().rerank(query, documents, top_k=3)

    print(f"\nๆฅ่ฏข: {query}")
    print("\nTF-IDF ้ๆ็ปๆ:")
    for position, (doc, score) in enumerate(ranked, start=1):
        # Only the first 50 characters of each document are shown.
        preview = doc.page_content[:50]
        print(f"{position}. ๅๆฐ: {score:.4f} | ๅ ๅฎน: {preview}...")
def test_bm25_reranking():
    """Test BM25 reranking.

    Builds a ``BM25Reranker``, reranks the sample documents for a fixed
    query with ``top_k=3``, and prints each returned ``(doc, score)`` pair.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("๐ ๆต่ฏ BM25 ้ๆ")
    print(banner)

    query = "ไปไนๆฏไบบๅทฅๆบ่ฝๅๆบๅจๅญฆไน ๏ผ"
    documents = create_test_documents()
    ranked = BM25Reranker().rerank(query, documents, top_k=3)

    print(f"\nๆฅ่ฏข: {query}")
    print("\nBM25 ้ๆ็ปๆ:")
    for position, (doc, score) in enumerate(ranked, start=1):
        # Only the first 50 characters of each document are shown.
        preview = doc.page_content[:50]
        print(f"{position}. ๅๆฐ: {score:.4f} | ๅ ๅฎน: {preview}...")
def test_crossencoder_reranking():
    """Test CrossEncoder reranking.

    Builds a ``CrossEncoderReranker`` for a fixed model name, reranks the
    sample documents with ``top_k=3``, and prints each ``(doc, score)`` pair.

    Returns:
        True when construction, reranking and printing all complete;
        False when any of them raises an ``Exception`` (the error and an
        installation hint are printed instead).
    """
    banner = "=" * 60
    print("\n" + banner)
    print("๐ ๆต่ฏ CrossEncoder ้ๆ๏ผๆจ่๏ผ")
    print(banner)

    query = "ไปไนๆฏไบบๅทฅๆบ่ฝๅๆบๅจๅญฆไน ๏ผ"
    documents = create_test_documents()

    try:
        # Use a lightweight model.
        cross_encoder = CrossEncoderReranker(
            model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"
        )
        ranked = cross_encoder.rerank(query, documents, top_k=3)

        print(f"\nๆฅ่ฏข: {query}")
        print("\nCrossEncoder ้ๆ็ปๆ:")
        for position, (doc, score) in enumerate(ranked, start=1):
            preview = doc.page_content[:50]
            print(f"{position}. ๅๆฐ: {score:.4f} | ๅ ๅฎน: {preview}...")
    except Exception as error:
        print(f"\nโ CrossEncoder ๆต่ฏๅคฑ่ดฅ: {error}")
        print("๐ก ๆ็คบ: ่ฏทๅ ๅฎ่ฃ sentence-transformers")
        print(" ๅฝไปค: pip install sentence-transformers")
        return False
    return True
def test_factory_function():
    """Test the reranker factory function.

    For each reranker type name, calls ``create_reranker``, reranks the
    sample documents with ``top_k=2``, and prints the score and a 40-character
    preview of the first result. Any ``Exception`` raised for one type is
    printed and does not stop the remaining types.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("๐ญ ๆต่ฏ้ๆๅจๅทฅๅๅฝๆฐ")
    print(banner)

    query = "ๆทฑๅบฆๅญฆไน ๅ็ฅ็ป็ฝ็ป"
    documents = create_test_documents()

    # Test each type; CrossEncoder goes last, after the lexical rerankers.
    for kind in ('tfidf', 'bm25', 'crossencoder'):
        label = kind.upper()
        try:
            ranked = create_reranker(kind).rerank(query, documents, top_k=2)
            print(f"\nโ {label} ้ๆๅจๅๅปบๆๅ")
            best_doc, best_score = ranked[0][0], ranked[0][1]
            print(f" Top 1: {best_score:.4f} | {best_doc.page_content[:40]}...")
        except Exception as error:
            print(f"\nโ {label} ้ๆๅจๅคฑ่ดฅ: {error}")
def compare_all_methods():
    """Compare all reranking methods.

    Runs every available reranker against the same query and sample
    documents with ``top_k=3`` and prints each method's ``(doc, score)``
    results. ``CrossEncoderReranker`` is included only if it can be
    constructed; a failure in one method is printed and does not stop the
    others.
    """
    print("\n" + "=" * 60)
    print("โ๏ธ ๅฏนๆฏๆๆ้ๆๆนๆณ")
    print("=" * 60)
    query = "่งฃ้ไธไธไบบๅทฅๆบ่ฝใๆบๅจๅญฆไน ๅๆทฑๅบฆๅญฆไน ็ๅ ณ็ณป"
    docs = create_test_documents()
    methods = {
        'TF-IDF': TFIDFReranker(),
        'BM25': BM25Reranker(),
    }
    # Try to add CrossEncoder.
    try:
        methods['CrossEncoder'] = CrossEncoderReranker()
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        print("\nโ ๏ธ CrossEncoder ไธๅฏ็จ๏ผ่ทณ่ฟ")
    print(f"\nๆฅ่ฏข: {query}\n")
    for method_name, reranker in methods.items():
        try:
            results = reranker.rerank(query, docs, top_k=3)
            print(f"\n{'=' * 40}")
            print(f"{method_name} ้ๆ็ปๆ:")
            print('=' * 40)
            for i, (doc, score) in enumerate(results, 1):
                print(f"{i}. [{score:.4f}] {doc.page_content[:60]}...")
        except Exception as e:
            print(f"\n{method_name} ๅคฑ่ดฅ: {e}")
def performance_comparison():
    """Performance comparison.

    Prints a banner followed by a fixed, hard-coded summary table and
    recommendations; nothing is measured or computed here.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("โก ๆง่ฝไธๅ็กฎๆงๅฏนๆฏ")
    print(banner)

    # Static report text, emitted verbatim.
    summary = """
้ๆๆนๆณๅฏนๆฏ๏ผ
โโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโฌโโโโโโโโโโโฌโโโโโโโโโโโฌโโโโโโโโโโโโโ
โ ๆนๆณ โ ๅ็กฎ็ โ ้ๅบฆ โ ๆๆฌ โ ้็จๅบๆฏ โ
โโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโผโโโโโโโโโโโผโโโโโโโโโโโผโโโโโโโโโโโโโค
โ TF-IDF โ โญโญ โ โกโกโก โ ๆไฝ โ ๅ ณ้ฎ่ฏๅน้ โ
โ BM25 โ โญโญโญ โ โกโกโก โ ๆไฝ โ ๆๆฌๆฃ็ดข โ
โ Bi-Encoder โ โญโญโญโญ โ โกโก โ ไฝ โ ่ฏญไนๆฃ็ดข โ
โ CrossEncoder ๐ โ โญโญโญโญโญโ โก โ ไธญ โ ็ฒพๅ้ๆ โ
โ Hybrid โ โญโญโญโญ โ โกโก โ ไฝ โ ็ปผๅๅบๆฏ โ
โโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโดโโโโโโโโโโโดโโโโโโโโโโโดโโโโโโโโโโโโโ
ๆจ่้ ็ฝฎ๏ผ
1๏ธโฃ ไธค้ถๆฎตๆฃ็ดข๏ผBi-Encoder (ๅฟซ้ๅฌๅ) + CrossEncoder (็ฒพๅ้ๆ)
2๏ธโฃ ๅ็กฎ็ไผๅ ๏ผ็บฏ CrossEncoder
3๏ธโฃ ้ๅบฆไผๅ ๏ผBM25 ๆ Hybrid
ๅฝๅ้กน็ฎ้ ็ฝฎ๏ผ
โ ๅทฒๅๆขๅฐ CrossEncoder ้ๆ
๐ ๅ็กฎ็้ขๆๆๅ๏ผ15-20%
โก ้ๅบฆ๏ผๅๆฌก้ๆ 20-100ms (Top 20 ๆๆกฃ)
"""
    print(summary)
if __name__ == "__main__":
    # Script entry point: run every demo in order, then print a final status
    # that depends on whether the CrossEncoder test succeeded.
    separator = "=" * 60
    print("\n๐ ๅผๅงๆต่ฏ CrossEncoder ้ๆๅ่ฝ...\n")

    # 1. TF-IDF test
    test_tfidf_reranking()

    # 2. BM25 test
    test_bm25_reranking()

    # 3. CrossEncoder test (the main focus); its boolean result drives the
    #    closing message below.
    crossencoder_available = test_crossencoder_reranking()

    # 4. Factory function test
    test_factory_function()

    # 5. Compare all methods
    compare_all_methods()

    # 6. Performance comparison summary
    performance_comparison()

    print("\n" + separator)
    if not crossencoder_available:
        print("โ ๏ธ ๆต่ฏๅฎๆ๏ผไฝ CrossEncoder ไธๅฏ็จ")
        print(" ่ฏท่ฟ่ก: pip install sentence-transformers")
    else:
        print("โ ๆๆๆต่ฏๅฎๆ๏ผCrossEncoder ้ๆๅทฒๅฐฑ็ปช")
    print(separator + "\n")