電子請求書情報抽出ツール(Python)を共有
7308 ワード
電子領収書の枚数が多すぎて合計金額を集計するのが非常に大変で、ネット上のツールも使いにくかったため、2時間ほどかけて自作しました。中石油と京東が発行した電子領収書でテスト済みで、問題なく動作します。一部の領収書では名称の抽出に失敗することがありますが、合計金額の集計には影響しません。必要な方は自由に持ち帰り、適宜修正してお使いください。
転載元:https://www.cnblogs.com/wuyaSama/p/10768002.html
import cmd
import sys
import json
import pdfplumber
import os
from pprint import pprint
class FapiaoShell(cmd.Cmd):
    """Interactive shell that extracts fields from invoice PDFs and totals them.

    The single command, ``load <directory>``, walks a directory tree, parses
    the first page of every ``.pdf`` file with pdfplumber and writes the
    collected fields plus a running total to a JSON file in the working
    directory.

    NOTE(review): almost every string literal in this class (prompts,
    dictionary keys, the markers searched for in the PDF text) contains only
    whitespace and ASCII punctuation. They look like they lost their non-ASCII
    characters when the file was extracted. They are reproduced here exactly
    as found; until the original text is restored, many keys collide and
    several branches below cannot be reached.
    """

    intro = ' , ?(help) ,CTRL+C 。\n'
    prompt = '\n: '
    doc_header = " ( help ):"
    misc_header = " :"
    undoc_header = " :"
    nohelp = "*** (%s) "

    def do_load(self, arg):
        r"""Scan a directory for PDF invoices and save a JSON summary, e.g. ``load D:\``."""
        if not os.path.isdir(arg):
            print(' !')
            return

        # Resolve the scan root before changing directory: a relative ``arg``
        # would otherwise be looked up against the *new* working directory.
        # ``dirname`` is empty for a bare relative name, and ``os.chdir('')``
        # raises, so in that case the working directory is left alone.
        parent = os.path.dirname(arg)
        scan_root = os.path.abspath(arg)
        if parent:
            os.chdir(parent)

        pdfs = []
        for root, _, files in os.walk(scan_root):
            for fn in files:
                if os.path.splitext(fn)[1].lower() != '.pdf':
                    continue
                # Paths are stored relative to the working directory.
                fpth = os.path.relpath(os.path.join(root, fn))
                print(f' pdf : {fpth}')
                pdfs.append(fpth)

        pdf_ctxs = self._parse_pdfs(pdfs)
        # NOTE(review): the three keys below are identical as found, so only
        # the last entry survives; see the class docstring.
        total = {
            ' ': pdf_ctxs,
            ' ': len(pdf_ctxs),
            ' ': 0,
        }
        for fpth, info in pdf_ctxs:
            # A file that was not recognised or only partly parsed must not
            # abort the whole run; it is reported and left out of the sum.
            try:
                total[' '] += float(info[' '])
            except (KeyError, TypeError, ValueError):
                print(f'skipped (no usable amount): {fpth}')

        print('\n.json...')
        with open(" .json", 'w', encoding='utf-8') as json_file:
            json.dump(total,
                      json_file,
                      ensure_ascii=False,
                      sort_keys=True,
                      indent=4,
                      separators=(', ', ': '))

        print(' !')

    def _parse_pdfs(self, pdfs):
        """Parse the first page of each PDF path.

        Args:
            pdfs: iterable of file paths accepted by ``pdfplumber.open``.

        Returns:
            A list of ``(path, info)`` tuples, one per input path. ``info`` is
            an empty dict when the page text lacks the expected marker or the
            file has no pages.
        """
        result = []
        for fpth in pdfs:
            info = {}
            with pdfplumber.open(fpth) as pdf:
                page = pdf.pages[0] if pdf.pages else None
                # extract_text() may yield None for a page without text.
                text = (page.extract_text() or '') if page is not None else ''
                if page is not None and ' ' in text:
                    info.update(self._extrace_from_words(page.extract_words()))
                    tables = page.extract_tables()
                    if tables:
                        info.update(self._extrace_from_table(tables[0]))
            # Exactly one entry per file. The original fell through after
            # recording an unrecognised file and appended it a second time.
            result.append((fpth, info))
        return result

    def _extrace_from_words(self, words):
        """Collect header fields from the words found on a page.

        Args:
            words: iterable of dicts with ``'top'``, ``'bottom'`` and
                ``'text'`` keys (the shape ``page.extract_words()`` is
                expected to return).

        Returns:
            A dict of the fields that could be recognised; possibly empty.
        """
        info = {}

        # Bucket word texts by the integer vertical midpoint of their box.
        lines = {}
        for word in words:
            pos = (int(word['top']) + int(word['bottom'])) // 2
            lines.setdefault(pos, []).append(word['text'])

        # Merge buckets lying within 10 units of the bucket that opened the
        # current group. ``last_pos`` is deliberately not advanced on a merge.
        lines_pack = []
        last_pos = None
        for pos in sorted(lines):
            if lines_pack and pos - last_pos <= 10:
                lines_pack[-1] += lines[pos]
                continue
            lines_pack.append(lines[pos])
            last_pos = pos

        for pack in lines_pack:
            for idx, line in enumerate(pack):
                if ' ' in line:
                    info[' '] = line
                    continue

                if ' :' in line:
                    info[' '] = line.split(':')[1]
                    continue

                if ' :' in line:
                    info[' '] = line.split(':')[1]
                    continue

                if ' :' in line:
                    # Date assembled from three neighbouring words; skipped
                    # when a part is missing instead of raising IndexError.
                    year = line.split(':')[1]
                    month = next((ln for ln in pack if ln.isdigit()), None)
                    day = next((ln[:2] for ln in pack if ' ' in ln), None)
                    if month is not None and day is not None:
                        info[' '] = f'{year}-{month}-{day}'
                    continue

                if ' :' in line:
                    # First all-digit word longer than 10 characters, if any.
                    code = next((ln for ln in pack
                                 if ln.isdigit() and len(ln) > 10), None)
                    if code is not None:
                        info[' '] = code
                    continue

                if ' :' in line:
                    # Value spread over this word and the next three.
                    parts = pack[idx:idx + 4]
                    if len(parts) == 4:
                        parts[0] = parts[0].split(':')[1]
                        info[' '] = ' '.join(parts)
                    continue

                if ' :' in line:
                    info[' '] = line.split(':')[1]
                    continue

                if ' :' in line:
                    info[' '] = line.split(':')[1]
                    continue

        return info

    def _extrace_from_table(self, table):
        """Collect fields from the first table on a page.

        Args:
            table: list of rows, each a list of cell strings or ``None``.
                Exactly four rows are expected.

        Returns:
            A dict of recognised fields. An empty dict is returned when the
            table does not have four rows. The original returned ``None``
            here, which made the caller's ``dict.update`` raise ``TypeError``.
        """
        info = {}
        if not table or len(table) != 4:
            return info

        # Row 0.
        for cell in table[0]:
            if not cell:
                continue
            for line in cell.splitlines():
                if ' :' in line:
                    info[' '] = line.split(':')[1]
                    continue

                # NOTE(review): an 18-character alphanumeric line is
                # presumably an identification number; confirm.
                if len(line) == 18 and line.isalnum():
                    info[' '] = line
                    continue

                if len(line) == 27:
                    if ' ' not in info:
                        info[' '] = []
                    info[' '].append(line)
                    continue

        # Row 1.
        for cell in table[1]:
            if not cell:
                continue
            lines = cell.splitlines()
            for line in lines:
                if ' 、 ' in line:
                    info[' '] = lines[1:-1]
                    break

                if ' ' in line:
                    info[' '] = lines[-1][1:]
                    break

                if ' ' in line:
                    info[' '] = lines[-1][1:]
                    break

        # Row 2: the amount is whatever follows the currency sign. Splitting
        # on the sign, rather than dropping the first character, still works
        # when other text precedes it.
        for cell in table[2]:
            if not cell:
                continue
            for line in cell.splitlines():
                if '¥' in line:
                    info[' '] = line.partition('¥')[2].strip()

        # Row 3.
        for cell in table[3]:
            if not cell:
                continue
            for line in cell.splitlines():
                if ' :' in line:
                    info[' '] = line.split(':')[1]
                    continue

                if len(line) == 18 and line.isalnum():
                    info[' '] = line
                    continue

        return info
if __name__ == '__main__':
    try:
        FapiaoShell().cmdloop()
    except KeyboardInterrupt:
        # Ctrl+C ends the session with a short message instead of a traceback.
        # The literal was split across two physical lines (a syntax error);
        # the line break is now written as an escape.
        print('\n!')
転載元:https://www.cnblogs.com/wuyaSama/p/10768002.html