pythonでPDFファイルを解析する
1937 ワード
import importlib
import sys
importlib.reload(sys)
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import * #
# Layout object types from pdfminer.layout: LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal (text), etc.
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
'''
Parse a PDF file and save its extracted text to a txt file.
'''
# Path of the input PDF that parse1() opens in binary mode.
# NOTE(review): the value is only ' .pdf' (leading space, no base name) — the
# original file name looks lost, possibly dropped non-ASCII characters; confirm
# the intended file before running.
path = ' .pdf'
def parse1():
    """Extract the text of the PDF at module-level ``path`` into ``22.txt``.

    Each page is run through a pdfminer page interpreter; for every
    ``LTTextBoxHorizontal`` item in the resulting page layout, its text is
    printed to stdout and written to ``22.txt`` followed by a newline.
    ``22.txt`` is created (or overwritten) relative to the current working
    directory.

    Raises:
        PDFTextExtractionNotAllowed: if the document reports that it is not
            extractable. Raised before the output file is opened.
    """
    # Context manager guarantees the PDF handle is closed, including when the
    # extraction check below raises or pdfminer fails mid-document.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link the parser and the document object to each other, then
        # initialize the document (no password argument is passed).
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # Guard clause: refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resource manager, layout-analysis parameters, an aggregating
        # device that collects each page's layout, and the page interpreter
        # that drives the device.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Mode 'w' already truncates the file, so no explicit seek/truncate
        # is needed.
        # NOTE(review): the platform default text encoding is kept; extracted
        # text with characters outside that encoding would raise
        # UnicodeEncodeError — consider an explicit encoding if that occurs.
        with open('22.txt', 'w') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Layout object for the page that was just processed.
                layout = device.get_result()
                for x in layout:
                    # Only horizontal text boxes are written out; other
                    # layout items are skipped.
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    parse1()