PythonでPDF、WORD、EXCEL、PPTのテキストを読み込む

9129 ワード

目次
  • シーン
  • PDF
  • WORD
  • 段落の読み込み
  • 表の読み込み
  • EXCEL
  • PPT

  • シーン
    ファイル内のテキストの内容を取得(読み取り専用で書き込みなし)
    PDF
    インストール:pip install pdfminer3k
    from io import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    
    def read_pdf(path_pdf):
        """Extract the text of a PDF file and return it as a single string.

        Args:
            path_pdf: Path of the PDF file to read; it is opened in binary mode.

        Returns:
            The text that the TextConverter wrote into the in-memory buffer
            while the document was processed.

        Raises:
            OSError: If the file cannot be opened.
            Any exception raised by pdfminer while processing propagates;
            the converter and the buffer are still closed in that case.
        """
        # Both the PDF file and the text buffer are closed by the ``with``
        # statement, even when processing fails part-way through.
        with open(path_pdf, 'rb') as pdf, StringIO() as outfp:
            # Resource manager handed to both the converter and process_pdf.
            rsrcmgr = PDFResourceManager()
            # Layout-analysis parameters (library defaults).
            laparams = LAParams()
            # Converter that writes the extracted text into ``outfp``.
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
            try:
                process_pdf(rsrcmgr, device, pdf)
                # Read the buffer before anything is closed.
                return outfp.getvalue()
            finally:
                # Always release the converter, not only on success.
                device.close()
    
    if __name__ == '__main__':
        # Demo run: extract the sample PDF's text and print it.
        pdf_text = read_pdf('P020190716349644060705.pdf')
        print(pdf_text)
    

    WORD
    インストール:pip install python-docx

    段落の読み込み
    from docx import Document
    # Open the Word document.
    d = Document('a.docx')
    # Print the text of each paragraph, in the order d.paragraphs yields them.
    paragraph_texts = (para.text for para in d.paragraphs)
    for text in paragraph_texts:
        print(text)
    

    表の読み込み
    from docx import Document
    # Open the Word document.
    d = Document('a.docx')
    # Visit every cell table by table, row by row, and print its text.
    cell_texts = (
        cell.text
        for table in d.tables
        for row in table.rows
        for cell in row.cells
    )
    for text in cell_texts:
        print(text)
    

    EXCEL
    from pandas import read_excel
    
    def xlsx2df(fname, sheet_name=0):
        """Load a sheet of an Excel workbook through pandas.

        Args:
            fname: Workbook to read; forwarded to ``read_excel`` as its
                first argument.
            sheet_name: Sheet selector forwarded to ``read_excel``;
                defaults to 0.

        Returns:
            Whatever ``read_excel`` returns for the given arguments.
        """
        frame = read_excel(fname, sheet_name=sheet_name)
        return frame
    

    PPT
    インストール:pip install python-pptx
    import pptx
    # Open the presentation file.
    p = pptx.Presentation('a.pptx')
    # Walk every slide and every shape on it, printing any text found.
    for slide in p.slides:
        for shape in slide.shapes:
            # has_text_frame is true for any shape that can hold text, so
            # ordinary text boxes and auto shapes are read as well as
            # placeholders (the former SlidePlaceholder check skipped them).
            if shape.has_text_frame:
                for paragraph in shape.text_frame.paragraphs:
                    print(paragraph.text)
            # A graphic frame can hold a chart or other object instead of a
            # table; only touch .table when the frame really contains one.
            elif (isinstance(shape, pptx.shapes.graphfrm.GraphicFrame)
                    and shape.has_table):
                for cell in shape.table.iter_cells():
                    print(cell.text)