pdf提取腳本
發布時間: 2023-06-02 00:24:20
1. 如何利用python抓取PDF中的某些內容
可以轉換成TXT再抓取
fromcStringIOimportStringIO
frompdfminer.pdfinterp
importPDFResourceManager,PDFPageInterpreter
frompdfminer.converterimportTextConverter
frompdfminer..pdfpage
importPDFPage
defconvert_pdf_2_text(path):
rsrcmgr=PDFResourceManager()
retstr=StringIO()
device=TextConverter(rsrcmgr,retstr,codec='utf-8',laparams=LAParams())
interpreter=PDFPageInterpreter(rsrcmgr,device)
withopen(path,'rb')asfp:
forpageinPDFPage.get_pages(fp,set()):
interpreter.process_page(page)
text=retstr.getvalue()
device.close()
retstr.close()
returntext
熱點內容