PythonでPDFファイルを解析する

1937 ワード

import importlib
import sys

# NOTE(review): reloading ``sys`` looks like a leftover of the Python 2
# ``reload(sys)`` / ``sys.setdefaultencoding`` idiom — confirm whether it is
# still needed under Python 3.
importlib.reload(sys)
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *  #           
# The wildcard import above supplies the layout classes (LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ...); text is read from the text-box objects.
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

'''
Parse a PDF file and save its text to a txt file.
'''
# Path of the PDF file that parse1() opens for reading.
# NOTE(review): the file name looks truncated (only spaces remain before
# ".pdf") — set it to a real PDF path before running.
path = '    .pdf'


def parse1(pdf_path=None, out_path='22.txt'):
    """Extract the text of a PDF and write it to a text file.

    The text of every horizontal text box found on each page is printed to
    stdout and written, followed by a newline, to the output file.

    Args:
        pdf_path: PDF file to read. When ``None`` (the default) the
            module-level ``path`` is used.
        out_path: Text file to create or overwrite. Defaults to ``'22.txt'``.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    if pdf_path is None:
        # Resolved at call time so that reassigning ``path`` is honoured.
        pdf_path = path

    # Everything that reads from the document happens inside this with-block,
    # which closes the input handle even if parsing raises.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # The parser and the document must be linked in both directions.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Called without a password argument.
        # NOTE(review): presumably this means "empty password" — confirm
        # against the pdfminer version in use.
        doc.initialize()

        # Guard clause: stop before creating the output file when the
        # document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Mode 'w' already truncates the file, so no seek()/truncate() is
        # needed. UTF-8 is set explicitly so non-ASCII PDF text does not fail
        # on platforms whose default encoding cannot represent it.
        with open(out_path, 'w', encoding='utf-8') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Layout object for the page that was just processed.
                layout = device.get_result()
                for x in layout:
                    # Only horizontal text boxes are exported; other layout
                    # items are skipped.
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')


if __name__ == '__main__':
    parse1()