Python发票信息自动提取:日常办公中经常需要统计发票报销金额等信息,手动统计十分繁琐。本项目基于Python实现发票图片的批量识别和统计,自动生成Excel统计表格,同时兼容pdf格式的发票。
系统运行环境
Python 3.7+、百度发票识别API、Tkinter简易界面
生成的统计信息有
开票日期、纳税人识别号、购买方名称、卖方名称、购买金额、发票号码
部分源码如下:
import base64
import os
import sys
import tkinter as tk
from tkinter import *
import tkinter.filedialog as tkFileDialog
import tkinter.messagebox

import requests
import xlwt

BASEPATH = "你的工作路径"

# --- VAT invoice recognition ---

# Baidu OCR endpoint used for VAT invoice recognition.
_VAT_INVOICE_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"

# Seconds to wait on the OCR service before giving up on one image, so a
# stalled connection cannot block the whole batch indefinitely.
_REQUEST_TIMEOUT = 30

# (key stored in the returned dict, key read from the response's
# 'words_result').  'PurchasserName' is misspelled, but save() reads the
# record under that spelling, so it is kept unchanged.
_INVOICE_FIELDS = (
    ('SellerRegisterNum', 'SellerRegisterNum'),
    ('InvoiceDate', 'InvoiceDate'),
    ('PurchasserName', 'PurchaserName'),
    ('SellerName', 'SellerName'),
    ('AmountInFiguers', 'AmountInFiguers'),
    ('InvoiceNum', 'InvoiceNum'),
)

# File suffixes (lower-case, including the dot) treated as invoice images.
_IMAGE_SUFFIXES = ('.jpg', '.png')

# Spreadsheet layout: (column header, key in the record dict), in column order.
_COLUMNS = (
    ('开票日期', 'InvoiceDate'),
    ('纳税人识别号', 'SellerRegisterNum'),
    ('购买方名称', 'PurchasserName'),
    ('卖方名称', 'SellerName'),
    ('购买金额', 'AmountInFiguers'),
    ('发票号码', 'InvoiceNum'),
)


def get_context(pic):
    """Recognise one invoice image and return its extracted fields.

    The image is read in binary mode, base64-encoded and posted to the
    VAT invoice OCR endpoint.

    Args:
        pic: Path of the image file to recognise.

    Returns:
        A dict keyed by 'SellerRegisterNum', 'InvoiceDate',
        'PurchasserName', 'SellerName', 'AmountInFiguers' and
        'InvoiceNum'.  This function never raises: on any failure
        (unreadable file, network error, timeout, unexpected response)
        the exception is printed and the fields collected so far,
        possibly none, are returned.
    """
    data = {}
    try:
        # The context manager closes the file even if reading fails.
        with open(pic, 'rb') as image_file:
            encoded_image = base64.b64encode(image_file.read())

        # Replace with your own access_token.
        access_token = '你的Token'
        request_url = _VAT_INVOICE_URL + "?access_token=" + access_token
        headers = {'content-type': 'application/x-www-form-urlencoded'}
        response = requests.post(
            request_url,
            data={"image": encoded_image},
            headers=headers,
            timeout=_REQUEST_TIMEOUT,
        )

        # A Response is falsy for 4xx/5xx status codes.
        if response:
            words = response.json()['words_result']
            for out_key, api_key in _INVOICE_FIELDS:
                data[out_key] = words[api_key]
            print(data['AmountInFiguers'])
    except Exception as e:
        # Best effort: one bad image must not abort a batch run.
        print(e)
    return data


def pics(path):
    """Return the paths of the .jpg/.png files directly inside *path*.

    Suffix matching is case-insensitive and requires the dot, and the
    result is sorted by file name so the spreadsheet row order is
    reproducible between runs.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list of '<path>/<filename>' strings.
    """
    return [
        path + '/' + filename
        for filename in sorted(os.listdir(path))
        if filename.lower().endswith(_IMAGE_SUFFIXES)
    ]


def get_datas(pics):
    """Run get_context() on every image path and collect the results.

    Args:
        pics: Iterable of image file paths.

    Returns:
        A list with one result dict per input path, in input order.
    """
    return [get_context(p) for p in pics]


def save(datas):
    """Write the recognised invoice records to an .xls workbook.

    The workbook is saved as '增值税发票.xls' under BASEPATH, with one
    header row followed by one row per record.

    Args:
        datas: List of record dicts as returned by get_datas().  A record
            that lacks a field (for example the empty dict produced by a
            failed recognition) yields an empty cell instead of aborting
            the whole export.
    """
    print('正在写入数据!')
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('增值税发票内容登记', cell_overwrite_ok=True)

    # Header row: six columns; adjust _COLUMNS to change the layout.
    for col, (header, _key) in enumerate(_COLUMNS):
        sheet.write(0, col, header)

    # Data rows start at row 1, directly below the header.
    for row, record in enumerate(datas, start=1):
        for col, (_header, key) in enumerate(_COLUMNS):
            sheet.write(row, col, record.get(key, ''))

    book.save(BASEPATH + '/增值税发票.xls')