Python发票信息自动提取:日常办公中经常需要统计发票报销金额等信息,手动统计十分繁琐。本项目基于Python实现发票图片的批量识别和统计,自动生成Excel统计表格,同时兼容PDF格式的发票。
系统运行环境
Python 3.7+、百度发票识别API、Tkinter简易界面
生成的统计信息包括:
开票日期、纳税人识别号、购买方名称、卖方名称、购买金额、发票号码
部分源码如下:
import os, sys
import xlwt
import tkinter as tk
from tkinter import *
import tkinter.filedialog as tkFileDialog
import tkinter.messagebox
# Directory into which save() writes the generated workbook. The value below
# is a placeholder and has to be replaced with a real path before running.
BASEPATH = "你的工作路径"
'''
增值税发票识别
'''
# Fetch the recognised fields of a single invoice image.
def get_context(pic):
    """Send one invoice image to the Baidu VAT-invoice OCR endpoint.

    The image is read from disk, base64-encoded and POSTed as form data.
    Selected entries of the response's ``words_result`` object are copied
    into the returned dict.

    Args:
        pic: Path of the image file to recognise.

    Returns:
        dict: On success, contains the keys ``SellerRegisterNum``,
        ``InvoiceDate``, ``PurchasserName`` (spelling kept as-is because
        ``save`` reads the same key), ``SellerName``, ``AmountInFiguers``
        and ``InvoiceNum``. If any step fails, the error is printed and
        whatever was collected so far (possibly an empty dict) is returned;
        no exception propagates to the caller.
    """
    # Function-scope import: the module's visible import block does not
    # bring base64 into scope.
    import base64

    data = {}
    try:
        request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"
        # Open the image in binary mode; the context manager closes the
        # handle even when reading fails.
        with open(pic, 'rb') as f:
            img = base64.b64encode(f.read())
        params = {"image": img}
        # Replace this with your own access_token.
        access_token = '你的Token'
        request_url = request_url + "?access_token=" + access_token
        headers = {'content-type': 'application/x-www-form-urlencoded'}
        # NOTE(review): `requests` is not imported in the visible part of the
        # file; it must be available at module level -- confirm.
        # The timeout keeps one stalled connection from blocking the batch.
        response = requests.post(
            request_url, data=params, headers=headers, timeout=30
        )
        # A requests Response is truthy only for non-error HTTP status codes.
        if response:
            words = response.json()['words_result']
            data['SellerRegisterNum'] = words['SellerRegisterNum']
            data['InvoiceDate'] = words['InvoiceDate']
            data['PurchasserName'] = words['PurchaserName']
            data['SellerName'] = words['SellerName']
            data['AmountInFiguers'] = words['AmountInFiguers']
            data['InvoiceNum'] = words['InvoiceNum']
            print(data['AmountInFiguers'])
        return data
    except Exception as e:
        # Best-effort boundary: one unreadable or unrecognised file must not
        # abort the batch, so report the problem and return what we have.
        print(e)
        return data
# Build the list of image paths found in a folder.
def pics(path):
    """Return the paths of the JPG and PNG files directly inside *path*.

    Matching is done on the file extension and ignores case, so files such
    as ``SCAN.JPG`` or ``photo.Png`` are included. Sub-directories are not
    descended into.

    Args:
        path: Directory to scan.

    Returns:
        list[str]: ``path`` joined with each matching file name, sorted by
        file name so that the result order is reproducible.

    Raises:
        OSError: If *path* cannot be listed (propagated from ``os.listdir``).
    """
    image_suffixes = ('.jpg', '.png')
    return [
        os.path.join(path, filename)
        # os.listdir returns names in arbitrary order; sort for stable output.
        for filename in sorted(os.listdir(path))
        if filename.lower().endswith(image_suffixes)
    ]
# Recognise every image and gather the per-image result dicts in one list.
def get_datas(pics):
    """Run ``get_context`` on each image path and collect the results.

    Args:
        pics: Iterable of image file paths.

    Returns:
        list: One ``get_context`` result per input path, in input order.
    """
    return [get_context(p) for p in pics]
# Write the collected invoice records to an Excel workbook.
def save(datas):
    """Write invoice records to ``增值税发票.xls`` inside ``BASEPATH``.

    Row 0 holds the six column headers; each record in *datas* fills one
    following row. A key that is missing from a record is written as an
    empty cell, so an incomplete or empty record does not abort the export.

    Args:
        datas: Sequence of dicts keyed by ``InvoiceDate``,
            ``SellerRegisterNum``, ``PurchasserName``, ``SellerName``,
            ``AmountInFiguers`` and ``InvoiceNum``.
    """
    print('正在写入数据!')
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('增值税发票内容登记', cell_overwrite_ok=True)
    # Header text paired with the record key that fills the column; keeping
    # them together keeps the six headers and the six fields in step.
    columns = [
        ('开票日期', 'InvoiceDate'),
        ('纳税人识别号', 'SellerRegisterNum'),
        ('购买方名称', 'PurchasserName'),
        ('卖方名称', 'SellerName'),
        ('购买金额', 'AmountInFiguers'),
        ('发票号码', 'InvoiceNum'),
    ]
    for col, (header, _) in enumerate(columns):
        sheet.write(0, col, header)
    # Each cell is written exactly once; data rows start below the header.
    for row, record in enumerate(datas, start=1):
        for col, (_, key) in enumerate(columns):
            sheet.write(row, col, record.get(key, ''))
    book.save(os.path.join(BASEPATH, '增值税发票.xls'))


