Python で PDF を doc に変換

6719 ワード

pdfminer のバージョン: pdfminer 20191125。注: PDF はテキストをコピーできるものである必要があり、画像のみの PDF はサポートされていません。
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfpage import PDFPage


def parse(read_path, res_path):
    """Extract the text of a PDF and append it to a text file.

    Each horizontal text box found by pdfminer's layout analysis is written
    to ``res_path`` in iteration order, followed by a newline. The output is
    written as UTF-8 text whatever the extension of ``res_path`` is, and it
    is opened in append mode, so content already in the file is kept.

    Args:
        read_path: Path of the PDF file to read.
        res_path: Path of the file the extracted text is appended to.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that text
            extraction is not permitted.
    """
    # The parser is built on this handle, so it must stay open until every
    # page has been processed; the ``with`` closes it even on error.
    with open(read_path, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        # Guard clause: refuse documents that forbid text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resource manager plus an aggregator device that collects
        # the layout of each page using default layout parameters.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box.
        with open(res_path, "a", encoding="utf-8") as out:
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # Layout of the page that was just processed.
                layout = device.get_result()
                for element in layout:
                    # Only horizontal text boxes are written; every other
                    # layout element is skipped.
                    if isinstance(element, LTTextBoxHorizontal):
                        out.write(element.get_text() + "\n")


if __name__ == '__main__':
    parse(r"E:/read_path", r"E:/res_path.doc")