電子領収書情報抽出ツール(Python)を共有します

7308 ワード

電子領収書が多すぎて合計金額を集計するのが非常に大変なうえ、ネット上のツールも使いにくかったため、2時間ほどかけて自作しました。中石油(ペトロチャイナ)と京東が発行した電子領収書でテストしたところ、問題なく動作しました。一部の領収書では名称の抽出に失敗することがありますが、金額の集計には影響しません。必要な方は自由に持ち帰って、ご自身で修正してお使いください。

import cmd
import sys
import json
import pdfplumber
import os
from pprint import pprint


class FapiaoShell(cmd.Cmd):
    """Interactive shell that extracts fields from electronic invoice PDFs.

    The single command, ``load <directory>``, walks the directory for ``.pdf``
    files, parses the first page of each one with pdfplumber and writes the
    collected fields plus a grand total to ``结果.json``.

    NOTE(review): in the reviewed copy every Chinese character inside the
    string literals had been lost, which left blank, colliding dictionary
    keys and identical (hence unreachable) match conditions.  The labels and
    keys below are restored from the standard field names printed on Chinese
    VAT e-invoices -- confirm them against real sample PDFs.
    """

    intro = '欢迎使用发票提取工具,输入?(help)获取帮助消息和命令列表,CTRL+C退出程序。\n'
    prompt = '\n输入命令: '
    doc_header = "详细文档 (输入 help <命令>):"
    misc_header = "友情提示:"
    undoc_header = "没有帮助文档:"
    nohelp = "*** 没有命令(%s)的帮助信息 "

    def __init__(self):
        super().__init__()

    def do_load(self, arg):
        r"""Scan a directory for PDF invoices and write a JSON summary.

        Usage: load D:\invoices
        """
        if not os.path.isdir(arg):
            print('参数必须是目录!')
            return

        # Resolve the path before changing directory: walking a relative
        # ``arg`` after ``chdir`` would look in the wrong place, and the
        # dirname of a bare directory name is '' (which ``chdir`` rejects).
        target = os.path.abspath(arg)
        os.chdir(os.path.dirname(target))

        pdfs = []
        for root, _, files in os.walk(target):
            for fn in files:
                if os.path.splitext(fn)[1].lower() != '.pdf':
                    continue
                fpth = os.path.relpath(os.path.join(root, fn))
                print(f'发现pdf文件: {fpth}')
                pdfs.append(fpth)

        pdf_ctxs = self._parse_pdfs(pdfs)
        total = {
            '内容': pdf_ctxs,
            '发票数': len(pdf_ctxs),
            '总计': 0,
        }
        for _, info in pdf_ctxs:
            # Files that were not recognised as invoices, or whose total could
            # not be read, stay in the report but do not contribute to the sum.
            try:
                total['总计'] += float(info['总计'])
            except (KeyError, ValueError):
                continue
        # Avoid binary floating-point noise such as 0.30000000000000004.
        total['总计'] = round(total['总计'], 2)

        print('\n保存到 结果.json...')
        with open("结果.json", 'w', encoding='utf-8') as json_file:
            json.dump(total,
                      json_file,
                      ensure_ascii=False,
                      sort_keys=True,
                      indent=4,
                      separators=(', ', ': '))
        print('完成!')

    def _parse_pdfs(self, pdfs):
        """Parse the first page of every PDF path in *pdfs*.

        Returns a list of ``(path, info)`` tuples, one per input path.
        ``info`` is an empty dict when the page text does not contain the
        invoice marker.
        """
        result = []
        for fpth in pdfs:
            info = {}
            with pdfplumber.open(fpth) as pdf:
                page = pdf.pages[0]
                text = page.extract_text() or ''
                if '发票' not in text:
                    # Record the file once and move on; without ``continue``
                    # it would be parsed anyway and appended a second time.
                    result.append((fpth, {}))
                    continue

                info.update(self._extrace_from_words(page.extract_words()))

                tables = page.extract_tables()
                if tables:
                    info.update(self._extrace_from_table(tables[0]))

            result.append((fpth, info))
        return result

    @staticmethod
    def _after_colon(text):
        """Return the segment between the first and second ':' of *text*."""
        return text.split(':')[1]

    def _extrace_from_words(self, words):
        """Extract header/footer fields from a page's word list.

        *words* is a sequence of mappings with ``'text'``, ``'top'`` and
        ``'bottom'`` entries.  Returns a dict holding the fields that were
        found; fields that cannot be located are simply omitted.
        """
        info = {}

        # Bucket words by the integer vertical midpoint of their box.
        lines = {}
        for word in words:
            pos = (int(word['top']) + int(word['bottom'])) // 2
            lines.setdefault(pos, []).append(word['text'])

        # Merge buckets lying within 10 units of the bucket that opened the
        # current pack (``last_pos`` is only advanced when a pack is opened).
        lines_pack = []
        last_pos = None
        for pos in sorted(lines):
            if lines_pack and pos - last_pos <= 10:
                lines_pack[-1].extend(lines[pos])
                continue
            lines_pack.append(lines[pos])
            last_pos = pos

        for pack in lines_pack:
            for idx, line in enumerate(pack):
                if '电子普通发票' in line:
                    info['标题'] = line
                elif '发票代码:' in line:
                    info['发票代码'] = self._after_colon(line)
                elif '发票号码:' in line:
                    info['发票号码'] = self._after_colon(line)
                elif '开票日期:' in line:
                    year = self._after_colon(line)
                    month = next((w for w in pack if w.isdigit()), None)
                    # The label word itself contains '日', so skip it.
                    day = next(
                        (w[:2] for w in pack
                         if '日' in w and '开票日期' not in w),
                        None,
                    )
                    if month is not None and day is not None:
                        info['开票日期'] = f'{year}-{month}-{day}'
                elif '机器编号:' in line:
                    machine = next(
                        (w for w in pack if w.isdigit() and len(w) > 10),
                        None,
                    )
                    if machine is not None:
                        info['机器编号'] = machine
                elif '码:' in line:
                    # Check code: the labelled word plus up to three following
                    # words.  Slicing tolerates a pack that ends early.
                    groups = [self._after_colon(line), *pack[idx + 1:idx + 4]]
                    info['校验码'] = ' '.join(groups)
                elif '收款人:' in line:
                    info['收款人'] = self._after_colon(line)
                elif '开票人:' in line:
                    info['开票人'] = self._after_colon(line)

        return info

    def _parse_party(self, cells, name_key, tax_key, info, cipher_key=None):
        """Fill *info* with a party's name and tax number from a table row.

        A line containing the name label yields *name_key*; an 18-character
        alphanumeric line yields *tax_key*.  When *cipher_key* is given,
        27-character lines are collected into a list under that key.
        """
        for cell in cells:
            if not cell:
                continue
            for line in cell.splitlines():
                if '名称:' in line:
                    info[name_key] = self._after_colon(line)
                elif len(line) == 18 and line.isalnum():
                    info[tax_key] = line
                elif cipher_key is not None and len(line) == 27:
                    info.setdefault(cipher_key, []).append(line)

    def _extrace_from_table(self, table):
        """Extract buyer, item, total and seller fields from the main table.

        *table* is a list of rows, each a list of cell strings (or ``None``).
        Exactly four rows are expected; any other shape yields an empty dict
        so callers can always ``update()`` with the result.
        """
        info = {}
        if not table or len(table) != 4:
            return info

        # Row 0: buyer (plus the cipher area).
        self._parse_party(table[0], '购买方名称', '购买方税号', info,
                          cipher_key='密码区')

        # Row 1: item details; each cell is identified by its header text.
        for cell in table[1]:
            if not cell:
                continue
            lines = cell.splitlines()
            for line in lines:
                if '货物或应税劳务、服务名称' in line:
                    # Drop the header line and the trailing summary line.
                    info['商品'] = lines[1:-1]
                    break
                if '金额' in line:
                    info['总金额'] = lines[-1][1:]
                    break
                if '税额' in line:
                    info['总税额'] = lines[-1][1:]
                    break

        # Row 2: grand total -- keep whatever follows the currency sign.
        for cell in table[2]:
            if not cell:
                continue
            for line in cell.splitlines():
                if '¥' in line:
                    info['总计'] = line.split('¥')[-1].strip()

        # Row 3: seller.
        self._parse_party(table[3], '销售方名称', '销售方税号', info)

        return info


if __name__ == '__main__':
    try:
        FapiaoShell().cmdloop()
    except KeyboardInterrupt:
        # NOTE(review): farewell text reconstructed; only '!' survived.
        print('\n\n再见!')

転載元:https://www.cnblogs.com/wuyaSama/p/10768002.html