Python: PDFをdocに変換
6719 ワード
pdfminerのバージョン: pdfminer 20191125。注: PDFはテキストをコピーできる必要があり、画像形式のPDFはサポートされていない。
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfpage import PDFPage
def parse(read_path, res_path):
    """Append the text of a PDF's horizontal text boxes to a file.

    Each page of the PDF at ``read_path`` is laid out with pdfminer, and the
    text of every ``LTTextBoxHorizontal`` on the page is appended to
    ``res_path``, followed by a newline.

    Args:
        read_path: Path of the PDF file to read (opened in binary mode).
        res_path: Path of the output file. It is opened in append mode with
            UTF-8 encoding, so existing content is kept. The file is only
            opened (and therefore only created) when a page yields text.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable (``doc.is_extractable`` is false).
    """
    # The parser reads lazily from the file object, so the input must stay
    # open for the whole extraction; ``with`` guarantees it is closed after.
    with open(read_path, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resource manager plus a layout-aggregating device; the
        # interpreter renders each page into the device.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # Layout object for the page that was just processed.
            layout = device.get_result()

            # Only horizontal text boxes carry the text we want; other
            # layout items on the page are skipped.
            texts = [
                x.get_text() + "\n"
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ]
            if not texts:
                continue

            # One open per page instead of one per text box. Pages already
            # processed are on disk even if a later page fails.
            with open(res_path, "a", encoding="utf-8") as f:
                f.writelines(texts)
# Script entry point: convert the example PDF when run directly.
if __name__ == '__main__':
    parse(r"E:/read_path", r"E:/res_path.doc")