Spaces:
Build error
Build error
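| # Oops: this code was originally written to run on a local machine; the file-path handling has to be refactored before it works with files on the server. | |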
| import streamlit as st | |
| import math | |
| import re | |
| import os | |
| from PyPDF2 import PdfFileReader, PdfFileWriter | |
| import pandas as pd | |
| import pdfplumber | |
| from docx2pdf import convert | |
| import fitz | |
| import base64 | |
# Page title rendered at the top of the Streamlit app.
st.header('PDF文件处理工具测试')
def fx(x):
    """Flatten one level of nesting.

    Args:
        x: An iterable of iterables, e.g. the per-page lists of tables.

    Returns:
        A new list holding the items of every inner iterable, in order.
        An empty input yields an empty list.
    """
    # A single pass keeps this linear; ``sum(x, [])`` re-copies the
    # accumulated list on every addition and only accepts inner lists.
    return [item for inner in x for item in inner]
# Shown when a page specification cannot be parsed or is out of range.
_BAD_PAGES_MSG = '页码格式有误或超出范围,请按“1-5”或“1,3,5”的格式输入'


def _parse_page_spec(spec, total):
    """Convert a 1-based page specification into 0-based page indices.

    Items are separated by commas (ASCII or full-width) or whitespace.
    Each item is either a single page ("3") or an inclusive range ("1-5"),
    so "1-5", "1,3,5" and "1-3,7" are all accepted.

    Args:
        spec: Text typed by the user.
        total: Number of pages in the document, used for bounds checking.

    Returns:
        A list of 0-based page indices in the order they were written.

    Raises:
        ValueError: If an item is not numeric, a range is reversed, a page
            lies outside ``1..total``, or no page was given at all.
    """
    # Tolerate spaces around the dash so "1 - 5" is still one range item.
    normalized = re.sub(r'\s*-\s*', '-', spec.strip())
    indices = []
    for item in re.split(r'[,,\s]+', normalized):
        if not item:
            continue
        first, dash, last = item.partition('-')
        start = int(first)
        stop = int(last) if dash else start
        if not 1 <= start <= stop <= total:
            raise ValueError(f'page item {item!r} is outside 1-{total}')
        indices.extend(range(start - 1, stop))
    if not indices:
        raise ValueError('no pages given')
    return indices


def _save_pdf(writer, directory, filename):
    """Write the pages collected in *writer* to ``directory/filename``."""
    with open(os.path.join(directory, filename), 'wb') as out:
        writer.write(out)


# Top-level mode selector; each branch below renders the widgets of one tool.
fns = st.radio('请选择PDF处理类型:', ['拆分', '合并', '读取', '在线预览', '转换'])

if fns == '拆分':
    uploaded_file = st.text_input("请输入要处理的pdf文件地址:")
    if uploaded_file != '':
        pdf_reader = PdfFileReader(uploaded_file)
        n = pdf_reader.getNumPages()
        che = st.radio('选择拆分类型', ['按固定页数拆分', '截取某几页', '删除指定页面'])
        if che == '按固定页数拆分':
            fn = st.number_input('请输入每组拆分的文档页数:', 1, n, 1)
            stre = st.text_input("请输入拆分后文件存放根目录:")
            zs = math.ceil(n / fn)
            if st.button('开始拆分>>'):
                for page in range(1, zs + 1):
                    # One writer per group: every page of the group goes
                    # into the same output file, which is written once.
                    pdf_writer = PdfFileWriter()
                    for pn in range(fn * (page - 1), min(fn * page, n)):
                        pdf_writer.addPage(pdf_reader.getPage(pn))
                    _save_pdf(pdf_writer, stre, 'test-{}.pdf'.format(page))
        elif che == '截取某几页':
            st_en = st.text_input("请输入截取的起止页码,格式为“1-5”或“1,3,5”:")
            stre2 = st.text_input("请输入截取后pdf文件存放根目录:")
            if st_en != '':
                if st.button('开始截取>>'):
                    try:
                        wanted = _parse_page_spec(st_en, n)
                    except ValueError:
                        st.error(_BAD_PAGES_MSG)
                    else:
                        outw = PdfFileWriter()
                        for r in wanted:
                            outw.addPage(pdf_reader.getPage(r))
                        _save_pdf(outw, stre2, '666.pdf')
        else:
            st_en2 = st.text_input("请输入需要删除的页码,格式为“1-5”或“1,3,5”:")
            stre3 = st.text_input("请输入删除指定页面后的pdf文件存放根目录:")
            if st_en2 != '':
                if st.button('开始删除>>'):
                    try:
                        # A set gives O(1) membership tests in the loop below.
                        dropped = set(_parse_page_spec(st_en2, n))
                    except ValueError:
                        st.error(_BAD_PAGES_MSG)
                    else:
                        outw2 = PdfFileWriter()
                        for r in range(n):
                            if r not in dropped:
                                outw2.addPage(pdf_reader.getPage(r))
                        _save_pdf(outw2, stre3, '666.pdf')
elif fns == '合并':
    path = st.text_input("请输入要处理的pdf文件根目录:")
    scn = st.text_input("请填写输出文件地址及文件名")
    if path != '' and scn != '':
        if st.button('开始合并>>'):
            file_out = PdfFileWriter()
            target = os.path.abspath(scn)
            # Sorted so the merge order is deterministic across platforms.
            for file in sorted(os.listdir(path)):
                docdir = os.path.join(path, file)
                if not file.lower().endswith('.pdf'):
                    continue
                # Skip a previous output living in the same folder so it is
                # not merged into itself.
                if os.path.abspath(docdir) == target:
                    continue
                file_read = PdfFileReader(docdir)
                for pageNum in range(file_read.getNumPages()):
                    file_out.addPage(file_read.getPage(pageNum))
            with open(scn, 'wb') as output:
                file_out.write(output)
elif fns == '读取':
    path3 = st.text_input("请输入要读取的pdf文件地址:")
    if path3 != '':
        ms = st.radio('请选择读取模式:', ['指定页码', '全部'])
        if ms == '指定页码':
            with pdfplumber.open(path3) as p:
                # Bound the selector by the real page count instead of a
                # fixed maximum, so an out-of-range page cannot be chosen.
                ymq = st.number_input("请选择要读取的pdf页码:", 1, len(p.pages), 1)
                dqlx = st.radio('请选择读取类型', ['文本内容', '表格内容'])
                page = p.pages[ymq - 1]
                if dqlx == '文本内容':
                    textdata = page.extract_text()
                    st.write(textdata)
                else:
                    tables = page.extract_tables()
                    if tables:
                        n_table = st.number_input('请选择读取页面中第几个表格:', 1, len(tables), 1)
                        datan = tables[n_table - 1]
                        # First row is used as the header, the rest as data.
                        st.dataframe(pd.DataFrame(datan[1:], columns=datan[0]))
                    else:
                        st.warning('该页面未检测到表格')
        else:
            dqlx2 = st.radio('请选择读取类型', ['文本内容', '表格内容'])
            with pdfplumber.open(path3) as p:
                if dqlx2 == '文本内容':
                    # ``or ''`` guards against pages for which no text is
                    # returned, which would otherwise break ``join``.
                    sz = '\n'.join([page.extract_text() or '' for page in p.pages])
                    st.write(sz)
                else:
                    all_tables = [y for y in fx([page.extract_tables() for page in p.pages]) if y]
                    if all_tables:
                        st.dataframe(pd.concat([pd.DataFrame(data=y[1:], columns=y[0]) for y in all_tables]))
                    else:
                        st.warning('文档中未检测到表格')
elif fns == '在线预览':
    file = st.file_uploader("请上传PDF")
    if file is not None:
        # Embed the uploaded bytes as a base64 data URI so the browser's
        # PDF viewer can render them inline.
        base64_pdf = base64.b64encode(file.read()).decode('utf-8')
        pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="100%" height="1000" type="application/pdf">'
        st.markdown(pdf_display, unsafe_allow_html=True)
else:
    ms1 = st.radio('请选择转换模式:', ['word->pdf', 'ppt->pdf', 'pdf->jpg/png', 'jpg/png->pdf'])
    if ms1 == 'word->pdf':
        path4 = st.text_input("请输入要批量转换的word文件根目录:")
        if path4 != '':
            for name in sorted(os.listdir(path4)):
                # Only Word documents; names starting with "~$" are
                # presumably Word lock files and are skipped.
                if name.startswith('~$') or not name.lower().endswith(('.doc', '.docx')):
                    continue
                file = os.path.join(path4, name)
                # splitext only strips the final extension, so dots
                # elsewhere in the path do not truncate the output name.
                convert(file, os.path.splitext(file)[0] + '.pdf')
            st.success('转换成功!')
    elif ms1 == 'pdf->jpg/png':
        path5 = st.text_input("请输入要转换的pdf文件地址:")
        dir_1 = st.text_input("请输入要输出的图片保存根目录:")
        if path5 != '' and dir_1 != '':
            with fitz.open(path5) as doc:
                for page in doc:
                    pix = page.get_pixmap()
                    pix.save(os.path.join(dir_1, "page-%i.png" % page.number))
    elif ms1 == 'jpg/png->pdf':
        dir_2 = st.text_input("请输入要转换为pdf的图片根目录:")
        path6 = st.text_input("请输入合成的pdf文件存放地址:")
        if path6 != '' and dir_2 != '':
            with fitz.open() as doc:
                # Sorted so the page order follows the file names.
                for f in sorted(os.listdir(dir_2)):
                    src = os.path.join(dir_2, f)
                    if not os.path.isfile(src):
                        continue
                    img = fitz.open(src)
                    rect = img[0].rect
                    pdfbytes = img.convert_to_pdf()
                    img.close()
                    # Re-open the image as a one-page PDF and place it on a
                    # new page of the same size.
                    imgPDF = fitz.open("pdf", pdfbytes)
                    page = doc.new_page(width=rect.width, height=rect.height)
                    page.show_pdf_page(rect, imgPDF, 0)
                doc.save(path6)
    elif ms1 == 'ppt->pdf':
        dir_3 = st.text_input("请输入要转换为pdf的PPT文件地址:")
        path7 = st.text_input("请输入生成的pdf文件存放地址:")
        if path7 != '' and dir_3 != '':
            # NOTE(review): PyMuPDF may not be able to open PowerPoint
            # files at all - confirm this branch works before relying on it.
            with fitz.open(dir_3) as ppt:
                pdfbytes = ppt.convert_to_pdf()
            with fitz.open("pdf", pdfbytes) as pdf:
                pdf.save(path7)