Spaces:
Paused
Paused
lanny xu
committed on
Commit
·
de81fb9
1
Parent(s):
ae2e9ee
delete vectara
Browse files - kaggle_simple_multimodal.py +65 -15
kaggle_simple_multimodal.py
CHANGED
|
@@ -127,17 +127,24 @@ def scan_and_copy_files():
|
|
| 127 |
# 递归扫描所有文件
|
| 128 |
for root, dirs, files in os.walk(input_dir):
|
| 129 |
for file in files:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
src = os.path.join(root, file)
|
| 131 |
dst = os.path.join(working_dir, file)
|
| 132 |
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
if copied_pdfs or copied_images:
|
| 143 |
print(f"\n📁 复制完成: {len(copied_pdfs)} 个 PDF, {len(copied_images)} 张图片")
|
|
@@ -157,17 +164,40 @@ def main():
|
|
| 157 |
|
| 158 |
# 检查文件
|
| 159 |
working_dir = '/kaggle/working'
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
print(f"\n📁 /kaggle/working/ 中的文件:")
|
| 164 |
print(f" - PDF文件: {len(pdf_files)} 个")
|
| 165 |
for pdf in pdf_files:
|
| 166 |
-
|
|
|
|
|
|
|
| 167 |
|
| 168 |
print(f" - 图片文件: {len(image_files)} 个")
|
| 169 |
for img in image_files:
|
| 170 |
-
|
|
|
|
|
|
|
| 171 |
|
| 172 |
if not pdf_files and not image_files:
|
| 173 |
print("\n💡 使用说明:")
|
|
@@ -175,11 +205,31 @@ def main():
|
|
| 175 |
print(" 2. 选择 'Upload' 标签")
|
| 176 |
print(" 3. 上传你的 PDF 和图片文件")
|
| 177 |
print(" 4. 重新运行此脚本")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
return
|
| 179 |
|
| 180 |
-
#
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
rag_system, doc_processor = process_uploaded_files(pdf_path, image_paths)
|
| 185 |
|
|
|
|
| 127 |
# 递归扫描所有文件
|
| 128 |
for root, dirs, files in os.walk(input_dir):
|
| 129 |
for file in files:
|
| 130 |
+
# 跳过无效文件名
|
| 131 |
+
if not file or file.startswith('.') or len(file) < 5:
|
| 132 |
+
continue
|
| 133 |
+
|
| 134 |
src = os.path.join(root, file)
|
| 135 |
dst = os.path.join(working_dir, file)
|
| 136 |
|
| 137 |
+
try:
|
| 138 |
+
if file.endswith('.pdf'):
|
| 139 |
+
shutil.copy(src, dst)
|
| 140 |
+
copied_pdfs.append(file)
|
| 141 |
+
print(f" ✅ 复制 PDF: {file}")
|
| 142 |
+
elif any(file.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']):
|
| 143 |
+
shutil.copy(src, dst)
|
| 144 |
+
copied_images.append(file)
|
| 145 |
+
print(f" ✅ 复制图片: {file}")
|
| 146 |
+
except Exception as e:
|
| 147 |
+
print(f" ⚠️ 复制文件失败 {file}: {e}")
|
| 148 |
|
| 149 |
if copied_pdfs or copied_images:
|
| 150 |
print(f"\n📁 复制完成: {len(copied_pdfs)} 个 PDF, {len(copied_images)} 张图片")
|
|
|
|
| 164 |
|
| 165 |
# 检查文件
|
| 166 |
working_dir = '/kaggle/working'
|
| 167 |
+
|
| 168 |
+
# 过滤有效的PDF文件(排除空文件名和隐藏文件)
|
| 169 |
+
try:
|
| 170 |
+
all_files = os.listdir(working_dir)
|
| 171 |
+
pdf_files = [
|
| 172 |
+
f for f in all_files
|
| 173 |
+
if f.endswith('.pdf')
|
| 174 |
+
and len(f) > 4 # 确保不只是 '.pdf'
|
| 175 |
+
and not f.startswith('.') # 排除隐藏文件
|
| 176 |
+
and os.path.isfile(os.path.join(working_dir, f)) # 确保是文件
|
| 177 |
+
]
|
| 178 |
+
image_files = [
|
| 179 |
+
f for f in all_files
|
| 180 |
+
if any(f.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp'])
|
| 181 |
+
and not f.startswith('.') # 排除隐藏文件
|
| 182 |
+
and os.path.isfile(os.path.join(working_dir, f)) # 确保是文件
|
| 183 |
+
]
|
| 184 |
+
except Exception as e:
|
| 185 |
+
print(f"❌ 扫描文件时出错: {e}")
|
| 186 |
+
pdf_files = []
|
| 187 |
+
image_files = []
|
| 188 |
|
| 189 |
print(f"\n📁 /kaggle/working/ 中的文件:")
|
| 190 |
print(f" - PDF文件: {len(pdf_files)} 个")
|
| 191 |
for pdf in pdf_files:
|
| 192 |
+
pdf_path = os.path.join(working_dir, pdf)
|
| 193 |
+
file_size = os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 0
|
| 194 |
+
print(f" * {pdf} ({file_size/1024:.1f} KB)")
|
| 195 |
|
| 196 |
print(f" - 图片文件: {len(image_files)} 个")
|
| 197 |
for img in image_files:
|
| 198 |
+
img_path = os.path.join(working_dir, img)
|
| 199 |
+
file_size = os.path.getsize(img_path) if os.path.exists(img_path) else 0
|
| 200 |
+
print(f" * {img} ({file_size/1024:.1f} KB)")
|
| 201 |
|
| 202 |
if not pdf_files and not image_files:
|
| 203 |
print("\n💡 使用说明:")
|
|
|
|
| 205 |
print(" 2. 选择 'Upload' 标签")
|
| 206 |
print(" 3. 上传你的 PDF 和图片文件")
|
| 207 |
print(" 4. 重新运行此脚本")
|
| 208 |
+
print("\n🔍 当前目录内容:")
|
| 209 |
+
try:
|
| 210 |
+
print(f" {os.listdir(working_dir)}")
|
| 211 |
+
except:
|
| 212 |
+
pass
|
| 213 |
return
|
| 214 |
|
| 215 |
+
# 处理文件(添加路径验证)
|
| 216 |
+
if pdf_files:
|
| 217 |
+
pdf_path = os.path.join(working_dir, pdf_files[0])
|
| 218 |
+
if not os.path.exists(pdf_path) or not os.path.isfile(pdf_path):
|
| 219 |
+
print(f"❌ PDF 文件路径无效: {pdf_path}")
|
| 220 |
+
pdf_path = None
|
| 221 |
+
else:
|
| 222 |
+
pdf_path = None
|
| 223 |
+
|
| 224 |
+
if image_files:
|
| 225 |
+
image_paths = []
|
| 226 |
+
for img in image_files:
|
| 227 |
+
img_path = os.path.join(working_dir, img)
|
| 228 |
+
if os.path.exists(img_path) and os.path.isfile(img_path):
|
| 229 |
+
image_paths.append(img_path)
|
| 230 |
+
image_paths = image_paths if image_paths else None
|
| 231 |
+
else:
|
| 232 |
+
image_paths = None
|
| 233 |
|
| 234 |
rag_system, doc_processor = process_uploaded_files(pdf_path, image_paths)
|
| 235 |
|