電子請求書情報抽出ツール(Python)を共有

7308 ワード
電子領収書の枚数が多すぎて合計金額を集計するのが非常に大変でしたが、ネット上のツールはどれも使いにくかったため、2時間ほどかけて自作しました。中国石油(中石油)と京東が発行した電子領収書でテスト済みで、問題なく動作します。一部の領収書では名称の抽出に失敗することがありますが、合計金額の集計には影響しません。必要な方は自由に持ち帰って、適宜修正してお使いください。

import cmd
import sys
import json
import pdfplumber
import os
from pprint import pprint


class FapiaoShell(cmd.Cmd):
    """Interactive shell that extracts fields from PDF invoices.

    The ``load`` command walks a directory, reads the first page of every
    ``*.pdf`` file with pdfplumber and dumps the collected fields to a JSON
    file in the current working directory.

    NOTE(review): many string literals in this class consist only of spaces.
    They look like text whose non-ASCII characters were lost before this file
    was saved. They are reproduced unchanged here; several of them are now
    byte-identical to each other (so later ``if`` branches and dictionary
    keys shadow earlier ones) and they need to be restored from the original
    source before the matching logic can work on real data.
    """

    # Line breaks that had been pasted into the literals as physical newlines
    # are written as ``\n`` escapes so the strings are valid Python again.
    intro = '          ，  ?(help)           ，CTRL+C    。\n'
    prompt = '\n    : '
    doc_header = "     (   help ):"
    misc_header = "    :"
    undoc_header = "      :"
    nohelp = "***     (%s)      "

    def __init__(self):
        """Initialise the underlying ``cmd.Cmd`` with its defaults."""
        super().__init__()

    def do_load(self, arg):
        r"""Scan a directory for PDF invoices and write a JSON summary.

        Usage: load D:\invoices

        ``arg`` is the directory to scan (recursively). The summary contains
        the per-file records, the number of records and the sum of the
        amounts that could be converted to ``float``.
        """
        if not os.path.isdir(arg):
            print('       !')
            return

        # Resolve the target *before* changing directory so that a relative
        # argument still points at the same place afterwards.
        target = os.path.abspath(arg)
        base = os.path.dirname(arg)
        if base:
            # A bare directory name has an empty dirname; os.chdir('') would
            # raise, so in that case the working directory is left alone.
            os.chdir(base)

        pdfs = []
        for root, _, files in os.walk(target):
            for fn in files:
                if os.path.splitext(fn)[1].lower() != '.pdf':
                    continue
                fpth = os.path.relpath(os.path.join(root, fn))
                print(f'  pdf  : {fpth}')
                pdfs.append(fpth)

        pdf_ctxs = self._parse_pdfs(pdfs)
        # NOTE(review): the first and third keys are byte-identical as
        # written, so the third entry replaces the first one.
        total = {
            '  ': pdf_ctxs,
            '   ': len(pdf_ctxs),
            '  ': 0,
        }
        for _, info in pdf_ctxs:
            try:
                total['  '] += float(info.get('  '))
            except (TypeError, ValueError):
                # Missing or non-numeric amount (e.g. a page that was not
                # recognised as an invoice): leave it out of the sum.
                continue

        print('\n      .json...')

        with open("  .json", 'w', encoding='utf-8') as json_file:
            json.dump(total,
                      json_file,
                      ensure_ascii=False,
                      sort_keys=True,
                      indent=4,
                      separators=(', ', ': '))

        print('  !')

    def _parse_pdfs(self, pdfs):
        """Parse the first page of each PDF.

        Args:
            pdfs: iterable of PDF file paths.

        Returns:
            A list of ``(path, info)`` tuples, exactly one per input path.
            ``info`` is an empty dict when the file has no pages or its
            first page does not contain the expected marker text.
        """
        result = []
        for fpth in pdfs:
            info = {}
            with pdfplumber.open(fpth) as pdf:
                if not pdf.pages:
                    result.append((fpth, {}))
                    continue

                page = pdf.pages[0]

                # extract_text() may yield nothing for image-only pages.
                text = page.extract_text() or ''
                if '         ' not in text:
                    # Record the file once, with no fields, and move on;
                    # without the ``continue`` it would be parsed anyway
                    # and appended a second time below.
                    result.append((fpth, {}))
                    continue

                info.update(self._extrace_from_words(page.extract_words()))

                tables = page.extract_tables()
                if tables:
                    info.update(self._extrace_from_table(tables[0]))

            result.append((fpth, info))
        return result

    def _extrace_from_words(self, words):
        """Extract fields from the loose words of a page.

        Args:
            words: iterable of mappings with ``'top'``, ``'bottom'`` and
                ``'text'`` entries.

        Returns:
            A dict of the fields that were found; a field whose companion
            tokens are missing from its row is skipped instead of raising.
        """
        info = {}

        # Bucket the words by the integer midpoint of their vertical extent.
        lines = {}
        for word in words:
            pos = (int(word['top']) + int(word['bottom'])) // 2
            lines.setdefault(pos, []).append(word['text'])

        # Merge buckets lying within 10 units of the bucket that opened the
        # current group, so slightly offset words end up in the same row.
        lines_pack = []
        last_pos = None
        for pos in sorted(lines):
            if lines_pack and pos - last_pos <= 10:
                lines_pack[-1].extend(lines[pos])
                continue
            lines_pack.append(list(lines[pos]))
            last_pos = pos

        for pack in lines_pack:
            for idx, line in enumerate(pack):
                if '      ' in line:
                    info['  '] = line
                    continue

                if '    :' in line:
                    info['    '] = line.split(':')[1]
                    continue

                if '    :' in line:
                    info['    '] = line.split(':')[1]
                    continue

                if '    :' in line:
                    # Date: the year follows the colon, the month is the
                    # first all-digit token of the row and the day is the
                    # first two characters of the first token that
                    # contains the marker.
                    year = line.split(':')[1]
                    month = next((ln for ln in pack if ln.isdigit()), None)
                    day = next((ln[:2] for ln in pack if ' ' in ln), None)
                    if month is not None and day is not None:
                        info['    '] = f'{year}-{month}-{day}'
                    continue

                if '    :' in line:
                    number = next((ln for ln in pack
                                   if ln.isdigit() and len(ln) > 10), None)
                    if number is not None:
                        info['    '] = number
                    continue

                if ' :' in line:
                    # The value is split over this token and the next three.
                    rest = pack[idx + 1:idx + 4]
                    if len(rest) == 3:
                        head = line.split(':')[1]
                        info['   '] = ' '.join([head, *rest])
                    continue

                if '   :' in line:
                    info['   '] = line.split(':')[1]
                    continue

                if '   :' in line:
                    info['   '] = line.split(':')[1]
                    continue

        return info

    def _extrace_from_table(self, table):
        """Extract fields from the main table of a page.

        Args:
            table: list of rows, each a list of cell strings (or ``None``
                for empty cells). Exactly four rows are expected.

        Returns:
            A dict of the fields that were found. An empty dict is returned
            when the table does not have four rows, so the caller can pass
            the result straight to ``dict.update``.
        """
        info = {}
        if len(table) != 4:
            return info

        # Row 0
        for cell in table[0]:
            if not cell:
                continue

            for line in cell.splitlines():
                if '          :' in line:
                    info['     '] = line.split(':')[1]
                    continue

                if len(line) == 18 and line.isalnum():
                    info['     '] = line
                    continue

                if len(line) == 27:
                    info.setdefault('  ', []).append(line)
                    continue

        # Row 1
        for cell in table[1]:
            if not cell:
                continue

            lines = cell.splitlines()
            for line in lines:
                if '       、    ' in line:
                    # Everything between the header and the last line.
                    info['  '] = lines[1:-1]
                    break

                if '    ' in line:
                    info['   '] = lines[-1][1:]
                    break

                if '    ' in line:
                    info['   '] = lines[-1][1:]
                    break

        # Row 2
        for cell in table[2]:
            if not cell:
                continue

            for line in cell.splitlines():
                if '¥' in line:
                    # Keep what follows the currency sign even when the
                    # sign is not the first character of the line.
                    info['  '] = line.partition('¥')[2]

        # Row 3
        for cell in table[3]:
            if not cell:
                continue

            for line in cell.splitlines():
                if '          :' in line:
                    info['     '] = line.split(':')[1]
                    continue

                if len(line) == 18 and line.isalnum():
                    info['     '] = line
                    continue

        return info


# Script entry point: run the interactive shell until the user interrupts it.
if __name__ == '__main__':
    try:
        FapiaoShell().cmdloop()
    except KeyboardInterrupt:
        # The farewell literal had been split over several physical lines,
        # which is a syntax error; the line breaks are written as escapes.
        print('\n\n  ！')
転載先:https://www.cnblogs.com/wuyaSama/p/10768002.html
僕のWebdriver(Python)
原生JS実現Promise