lanny xu committed
Commit 69629dd · 1 Parent(s): 67e46c9
Files changed (2)
  1. document_processor.py +29 -1
  2. main_graphrag.py +7 -5
document_processor.py CHANGED
@@ -49,6 +49,14 @@ from PIL import Image
 import numpy as np
 from typing import List, Dict, Any, Optional, Union
 
+try:
+    from langchain_core.documents import Document
+except ImportError:
+    try:
+        from langchain_core.documents import Document
+    except ImportError:
+        from langchain.schema import Document
+
 
 class CustomEnsembleRetriever:
     """Custom ensemble retriever that combines vector search and BM25 retrieval"""
@@ -264,6 +272,26 @@ class DocumentProcessor:
 
         print(f"✅ Vector store created and persisted to: {persist_directory}")
         return self.vectorstore, self.retriever
+
+    def get_all_documents_from_vectorstore(self, limit: Optional[int] = None) -> List[Document]:
+        """Read all stored contents from the persisted vector store and build a list of Document objects"""
+        if not self.vectorstore:
+            return []
+        try:
+            data = self.vectorstore._collection.get(include=["documents", "metadatas"])  # type: ignore
+            docs_raw = data.get("documents") or []
+            metas = data.get("metadatas") or []
+            docs: List[Document] = []
+            for i, content in enumerate(docs_raw):
+                if content:
+                    meta = metas[i] if i < len(metas) else {}
+                    docs.append(Document(page_content=content, metadata=meta))
+            if limit:
+                return docs[:limit]
+            return docs
+        except Exception as e:
+            print(f"⚠️ Failed to read documents from the vector store: {e}")
+            return []
 
     def setup_knowledge_base(self, urls=None, enable_graphrag=False):
         """Set up the complete knowledge base (load, split, vectorize)
@@ -626,4 +654,4 @@ def initialize_document_processor():
     except Exception as e:
         print(f"⚠️ Failed to save metadata: {e}")
 
-    return processor, vectorstore, retriever, doc_splits
+    return processor, vectorstore, retriever, doc_splits
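
A minimal usage sketch of the new helper, assuming initialize_document_processor() has already restored a persisted Chroma vector store on the processor (the situation the change targets, where doc_splits comes back as None); the surrounding flow is illustrative rather than taken from the repo:

# Sketch only: the names come from this commit's diff, the flow around them is assumed.
processor, vectorstore, retriever, doc_splits = initialize_document_processor()

# When the index was loaded from disk instead of rebuilt, recover the stored
# chunks as Document objects so downstream GraphRAG indexing has input.
if doc_splits is None:
    doc_splits = processor.get_all_documents_from_vectorstore()
    print(f"Recovered {len(doc_splits)} chunks from the persisted vector store")

# The optional limit caps how many chunks are returned, e.g. for a quick smoke test.
sample = processor.get_all_documents_from_vectorstore(limit=100)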
main_graphrag.py CHANGED
@@ -70,14 +70,16 @@ class AdaptiveRAGWithGraph:
         else:
             print("📝 Building the index for the first time...")
 
-            # When a persisted vector store has been loaded, doc_splits may be None; fill in the document chunks for GraphRAG indexing
             if self.doc_splits is None:
-                print("   ℹ️ No document chunks provided, reloading the default data sources for GraphRAG indexing...")
                 try:
-                    docs = self.doc_processor.load_documents()
-                    self.doc_splits = self.doc_processor.split_documents(docs)
+                    docs_from_vs = self.doc_processor.get_all_documents_from_vectorstore()
+                    if docs_from_vs:
+                        self.doc_splits = docs_from_vs
+                    else:
+                        docs = self.doc_processor.load_documents()
+                        self.doc_splits = self.doc_processor.split_documents(docs)
                 except Exception as e:
-                    print(f"   ❌ Failed to reload documents: {e}")
+                    print(f"   ❌ Failed to prepare document chunks for GraphRAG: {e}")
                     raise
 
             # Build the index
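
For context, the data the new method pulls through vectorstore._collection.get() can also be read with the public chromadb client; a rough sketch, where the persist path and collection name are placeholders rather than values from this repo:

import chromadb

try:
    from langchain_core.documents import Document
except ImportError:
    from langchain.schema import Document

# Placeholders: point these at the actual persist directory and collection name.
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_collection("rag_documents")

# Same shape of result the helper consumes: parallel lists of texts and metadatas.
data = collection.get(include=["documents", "metadatas"])
docs = [
    Document(page_content=text, metadata=meta or {})
    for text, meta in zip(data["documents"], data["metadatas"])
    if text
]
print(f"Rebuilt {len(docs)} Document objects from the persisted collection")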