lanny xu committed on
Commit
de81fb9
·
1 Parent(s): ae2e9ee

delete vectara

Browse files
Files changed (1) hide show
  1. kaggle_simple_multimodal.py +65 -15
kaggle_simple_multimodal.py CHANGED
@@ -127,17 +127,24 @@ def scan_and_copy_files():
127
  # 递归扫描所有文件
128
  for root, dirs, files in os.walk(input_dir):
129
  for file in files:
 
 
 
 
130
  src = os.path.join(root, file)
131
  dst = os.path.join(working_dir, file)
132
 
133
- if file.endswith('.pdf'):
134
- shutil.copy(src, dst)
135
- copied_pdfs.append(file)
136
- print(f" ✅ 复制 PDF: {file}")
137
- elif any(file.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']):
138
- shutil.copy(src, dst)
139
- copied_images.append(file)
140
- print(f" ✅ 复制图片: {file}")
 
 
 
141
 
142
  if copied_pdfs or copied_images:
143
  print(f"\n📁 复制完成: {len(copied_pdfs)} 个 PDF, {len(copied_images)} 张图片")
@@ -157,17 +164,40 @@ def main():
157
 
158
  # 检查文件
159
  working_dir = '/kaggle/working'
160
- pdf_files = [f for f in os.listdir(working_dir) if f.endswith('.pdf')]
161
- image_files = [f for f in os.listdir(working_dir) if any(f.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp'])]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  print(f"\n📁 /kaggle/working/ 中的文件:")
164
  print(f" - PDF文件: {len(pdf_files)} 个")
165
  for pdf in pdf_files:
166
- print(f" * {pdf}")
 
 
167
 
168
  print(f" - 图片文件: {len(image_files)} 个")
169
  for img in image_files:
170
- print(f" * {img}")
 
 
171
 
172
  if not pdf_files and not image_files:
173
  print("\n💡 使用说明:")
@@ -175,11 +205,31 @@ def main():
175
  print(" 2. 选择 'Upload' 标签")
176
  print(" 3. 上传你的 PDF 和图片文件")
177
  print(" 4. 重新运行此脚本")
 
 
 
 
 
178
  return
179
 
180
- # 处理文件
181
- pdf_path = os.path.join(working_dir, pdf_files[0]) if pdf_files else None
182
- image_paths = [os.path.join(working_dir, img) for img in image_files] if image_files else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
  rag_system, doc_processor = process_uploaded_files(pdf_path, image_paths)
185
 
 
127
  # 递归扫描所有文件
128
  for root, dirs, files in os.walk(input_dir):
129
  for file in files:
130
+ # 跳过无效文件名
131
+ if not file or file.startswith('.') or len(file) < 5:
132
+ continue
133
+
134
  src = os.path.join(root, file)
135
  dst = os.path.join(working_dir, file)
136
 
137
+ try:
138
+ if file.endswith('.pdf'):
139
+ shutil.copy(src, dst)
140
+ copied_pdfs.append(file)
141
+ print(f" ✅ 复制 PDF: {file}")
142
+ elif any(file.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']):
143
+ shutil.copy(src, dst)
144
+ copied_images.append(file)
145
+ print(f" ✅ 复制图片: {file}")
146
+ except Exception as e:
147
+ print(f" ⚠️ 复制文件失败 {file}: {e}")
148
 
149
  if copied_pdfs or copied_images:
150
  print(f"\n📁 复制完成: {len(copied_pdfs)} 个 PDF, {len(copied_images)} 张图片")
 
164
 
165
  # 检查文件
166
  working_dir = '/kaggle/working'
167
+
168
+ # 过滤有效的PDF文件(排除空文件名和隐藏文件)
169
+ try:
170
+ all_files = os.listdir(working_dir)
171
+ pdf_files = [
172
+ f for f in all_files
173
+ if f.endswith('.pdf')
174
+ and len(f) > 4 # 确保不只是 '.pdf'
175
+ and not f.startswith('.') # 排除隐藏文件
176
+ and os.path.isfile(os.path.join(working_dir, f)) # 确保是文件
177
+ ]
178
+ image_files = [
179
+ f for f in all_files
180
+ if any(f.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp'])
181
+ and not f.startswith('.') # 排除隐藏文件
182
+ and os.path.isfile(os.path.join(working_dir, f)) # 确保是文件
183
+ ]
184
+ except Exception as e:
185
+ print(f"❌ 扫描文件时出错: {e}")
186
+ pdf_files = []
187
+ image_files = []
188
 
189
  print(f"\n📁 /kaggle/working/ 中的文件:")
190
  print(f" - PDF文件: {len(pdf_files)} 个")
191
  for pdf in pdf_files:
192
+ pdf_path = os.path.join(working_dir, pdf)
193
+ file_size = os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 0
194
+ print(f" * {pdf} ({file_size/1024:.1f} KB)")
195
 
196
  print(f" - 图片文件: {len(image_files)} 个")
197
  for img in image_files:
198
+ img_path = os.path.join(working_dir, img)
199
+ file_size = os.path.getsize(img_path) if os.path.exists(img_path) else 0
200
+ print(f" * {img} ({file_size/1024:.1f} KB)")
201
 
202
  if not pdf_files and not image_files:
203
  print("\n💡 使用说明:")
 
205
  print(" 2. 选择 'Upload' 标签")
206
  print(" 3. 上传你的 PDF 和图片文件")
207
  print(" 4. 重新运行此脚本")
208
+ print("\n🔍 当前目录内容:")
209
+ try:
210
+ print(f" {os.listdir(working_dir)}")
211
+ except:
212
+ pass
213
  return
214
 
215
+ # 处理文件(添加路径验证)
216
+ if pdf_files:
217
+ pdf_path = os.path.join(working_dir, pdf_files[0])
218
+ if not os.path.exists(pdf_path) or not os.path.isfile(pdf_path):
219
+ print(f"❌ PDF 文件路径无效: {pdf_path}")
220
+ pdf_path = None
221
+ else:
222
+ pdf_path = None
223
+
224
+ if image_files:
225
+ image_paths = []
226
+ for img in image_files:
227
+ img_path = os.path.join(working_dir, img)
228
+ if os.path.exists(img_path) and os.path.isfile(img_path):
229
+ image_paths.append(img_path)
230
+ image_paths = image_paths if image_paths else None
231
+ else:
232
+ image_paths = None
233
 
234
  rag_system, doc_processor = process_uploaded_files(pdf_path, image_paths)
235