Python读取PDF发票信息

读取方法：采用PDF转DOCX，解压DOCX使用xml读取word/document.xml文字，获取出所有文字类w:t节点nodeValue值，re过滤出发票内容。
GitHub 地址：https://github.com/hefaxing/fapiao_read
仔细看config.ini，修改相应信息，测试运行几次就懂了。
各个方法代码展示（请勿直接复制运行，没有主体调用，请移步Github下载完整代码）：
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: CY
# Date: 2021-08-19
# QQ: 77061066
# Version: 1.0.210820.1700
#
# pip install pdf2docx==0.5.2
# pip install pyzbar==0.1.8
# pip install pandas==1.3.0
# pip install Pillow==8.3.1
# pip install frontend==0.0.3
# pip install openpyxl==3.0.7
#
# 获取PDF发票信息
# 相应配置查看config.ini，保证config.ini跟脚本在同一目录
# 读取方法：采用PDF转DOCX，解压DOCX使用xml读取word/document.xml文字，获取出所有文字类w:t节点nodeValue值，re过滤出发票内容。

import os
import ast
import re
import configparser
import fitz
import shutil
import pandas
from pdf2docx import Converter
from zipfile import ZipFile
from xml.dom.minidom import parseString
from PIL import Image
from pyzbar.pyzbar import decode

def Get_files(path, suffix):
    """Collect every file below ``path`` whose name ends with ``.<suffix>``.

    The directory tree is walked recursively with ``os.walk``; the suffix
    comparison is case-sensitive.

    Args:
        path: Root directory to search.
        suffix: File extension without the leading dot, e.g. ``'pdf'``.

    Returns:
        A list of paths, each formed by joining the directory that holds the
        file with the file name. Empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith('.%s' % suffix)
    ]

def Filter_str(filter_str):
    """Strip noise characters from a text fragment and normalise brackets.

    Removes ASCII and full-width spaces, ASCII and full-width colons and both
    yen/yuan signs, and converts full-width parentheses to ASCII ones.

    Args:
        filter_str: The text to clean.

    Returns:
        The cleaned string.
    """
    # One translation table: the first two arguments map full-width brackets
    # to ASCII brackets, the third lists the characters to delete outright.
    table = str.maketrans('（）', '()', ' 　:：￥¥')
    return filter_str.translate(table)

def Read_QR_code(img_file):
    """Read the QR code in an image and return the invoice fields it carries.

    The payload of the first decoded symbol is split on commas; items 2 to 6
    are stored as invoice code, number, total, date and check code.

    Args:
        img_file: Path of the image file to scan.

    Returns:
        A dict with the keys ``qr_code_code``, ``qr_code_number``,
        ``qr_code_total``, ``qr_code_date`` and ``qr_code_check_code``.
        Every value is an empty string when no symbol is found or the
        payload has fewer than seven comma-separated items.
    """
    keys = ('qr_code_code', 'qr_code_number', 'qr_code_total',
            'qr_code_date', 'qr_code_check_code')
    qr_code = dict.fromkeys(keys, '')

    # Close the image handle as soon as decoding is done.
    with Image.open(img_file) as img:
        barcodes = decode(img)

    # decode() yields a list that is empty when nothing was recognised, so a
    # truthiness test (not an "is not None" test) is what guards barcodes[0].
    if not barcodes:
        return qr_code

    fields = barcodes[0].data.decode("utf-8").split(',')
    # A payload that is too short cannot be an invoice code; keep the defaults.
    if len(fields) < 7:
        return qr_code

    qr_code.update(zip(keys, fields[2:7]))
    return qr_code

def From_pdf_to_png(pdf_file, temp_png_file):
    """Render the first page of a PDF to a PNG file.

    Args:
        pdf_file: Path of the source PDF.
        temp_png_file: Path the PNG is written to.

    Returns:
        ``temp_png_file``, unchanged.
    """
    doc = fitz.open(pdf_file)
    try:
        page = doc.loadPage(0)
        # Matrix(2, 2) scales both axes by 2 to give the QR decoder more pixels.
        trans = fitz.Matrix(2, 2)
        pix = page.getPixmap(matrix=trans, alpha=False)
        pix.writePNG(temp_png_file)
        # NOTE(review): loadPage/getPixmap/writePNG are the legacy camelCase
        # PyMuPDF names; newer releases appear to expose load_page/get_pixmap/
        # save instead - confirm against the installed version before upgrading.
    finally:
        # Release the document even when rendering or saving fails.
        doc.close()
    return temp_png_file

def From_pdf_to_docx(pdf_file, docx_file):
    """Convert a PDF file to a DOCX file with pdf2docx.

    Args:
        pdf_file: Path of the source PDF.
        docx_file: Path the DOCX is written to.

    Returns:
        ``docx_file``, unchanged.
    """
    cv = Converter(pdf_file)
    try:
        # No start/end arguments are passed, so the library defaults apply
        # (per the original note: start=0, end=None, i.e. every page).
        cv.convert(docx_file)
    finally:
        # Close the converter even when the conversion raises.
        cv.close()
    return docx_file

def Read_docx(docx_file):
    """Extract the text runs of a DOCX file.

    The DOCX is opened as a zip archive, ``word/document.xml`` is parsed and
    the text of every ``w:t`` element is passed through ``Filter_str``.

    Args:
        docx_file: Path of the DOCX file.

    Returns:
        A list of cleaned strings, one per text node, in document order.
    """
    # The context manager closes the archive even if the member is missing.
    with ZipFile(docx_file) as zf:
        xml_str = zf.read('word/document.xml')

    collection = parseString(xml_str).documentElement

    node_text = []
    for w_t_node in collection.getElementsByTagName('w:t'):
        for node in w_t_node.childNodes:
            # Only nodes that carry a value hold text; others have None,
            # which Filter_str cannot process.
            if node.nodeValue is not None:
                node_text.append(Filter_str(node.nodeValue))
    return node_text

def _first_match(field, pattern, text):
    """Return the first regex match for ``field`` or raise IndexError naming it."""
    found = re.findall(pattern, text)
    if not found:
        # IndexError is kept as the exception type so existing callers that
        # catch it still work; the message now says which field failed.
        raise IndexError(
            "fapiao field %r not found (pattern %r)" % (field, pattern))
    return found[0]


def Get_fapiao_info(text_info):
    """Extract the invoice fields from the text fragments of one invoice.

    The fragments are concatenated and each field is pulled out with a
    regular expression. Seller fields are searched only in the text that
    follows the last occurrence of '密码区', so that the buyer's identically
    labelled fields cannot match.

    Args:
        text_info: Iterable of strings (already cleaned by ``Filter_str``),
            e.g. ``['发票代码', '888888888888', '发票号码', '88888888', ...]``.

    Returns:
        A dict keyed by ``fapiao_code``, ``fapiao_number``, ``fapiao_date``,
        ``fapiao_check_code``, ``fapiao_buyer_name``,
        ``fapiao_buyer_tax_number``, ``fapiao_goods``, ``fapiao_s_tax_total``,
        ``fapiao_tax_total``, ``fapiao_seller_name``,
        ``fapiao_seller_tax_number``, ``fapiao_address_phone`` and
        ``fapiao_bank_name``.

    Raises:
        IndexError: If any field's pattern does not match. Adjust the
            pattern, or remove the rule, for invoice layouts that lack it.
    """
    full_text = ''.join(text_info)
    seller_text = full_text.split('密码区')[-1]

    # Raw strings keep the regex escapes intact, and the decimal point of the
    # amount is escaped so it matches a literal '.' rather than any character.
    rules = (
        ('fapiao_code', r"发票代码(\d+)", full_text),
        ('fapiao_number', r"发票号码(\d+)", full_text),
        ('fapiao_date', r"开票日期(.*?)校验码", full_text),
        ('fapiao_check_code', r"校验码(\d+)", full_text),
        ('fapiao_buyer_name', r"购买方名称(.*?)纳税人识别号", full_text),
        ('fapiao_buyer_tax_number', r"购买方名称.*?纳税人识别号(.*?)地址", full_text),
        ('fapiao_goods', r"服务名称(.*?)合计规格型号", full_text),
        ('fapiao_s_tax_total', r"\(大写\)(.*?)\(小写\)", full_text),
        ('fapiao_tax_total', r"\(小写\)(\d+\.\d+)", full_text),
        ('fapiao_seller_name', r"销售方名称(.*?)纳税人识别号", seller_text),
        ('fapiao_seller_tax_number', r"销售方名称.*?纳税人识别号(.*?)地址", seller_text),
        ('fapiao_address_phone', r"电话(.*?)开户行", seller_text),
        ('fapiao_bank_name', r"开户行及账号(.*?)备注", seller_text),
    )
    return {
        field: _first_match(field, pattern, text)
        for field, pattern, text in rules
    }

def Save_txt(text_info, out_file):
    """Append extracted text to a file as one UTF-8 encoded line.

    Args:
        text_info: Either a string or a list of strings; a list is
            concatenated without separators before writing.
        out_file: Path of the text file. It is opened in append mode, so
            existing content is kept.

    Returns:
        ``out_file``, unchanged.

    Raises:
        TypeError: If ``text_info`` is neither a ``list`` nor a ``str``.
    """
    if isinstance(text_info, list):
        out_text = ''.join(text_info)
    elif isinstance(text_info, str):
        out_text = text_info
    else:
        # Fail with a clear message before the file is touched, instead of an
        # unbound-variable error after it has been opened.
        raise TypeError(
            "text_info must be a list or str, not %s" % type(text_info).__name__)

    # Binary mode keeps the output encoding independent of the locale.
    with open(out_file, 'ab') as file_object:
        file_object.write(out_text.encode('utf-8') + b"\n")
    return out_file

def Save_xlsx(text_info, out_file):
    """Write the collected invoice records to an Excel workbook.

    Args:
        text_info: List of dicts, one per invoice, keyed by the Chinese
            column titles listed in ``order`` below.
        out_file: Path of the ``.xlsx`` file to create.

    Returns:
        ``out_file``, unchanged.

    Raises:
        KeyError: If a column named in ``order`` is missing from the data.
    """
    # Fixed column order of the output sheet.
    order = ["发票代码", "发票号码", "开票日期", "校验码", "购买方名称", '购买方纳税人识别号', "价税合计", "服务名称", "销售方名称", "销售方纳税人识别号", "销售方地址、电话", "销售方开户行及账号"]

    # Select the columns, then fill the gaps on the resulting frame instead
    # of mutating a slice in place.
    frame = pandas.DataFrame(text_info)[order].fillna(' ')

    # The context manager saves and closes the workbook, also on error. It
    # replaces the explicit save() call, and the encoding argument is dropped
    # because newer pandas no longer accepts either of them.
    with pandas.ExcelWriter(out_file) as writer:
        frame.to_excel(writer, index=False, sheet_name="sheet1")
    return out_file

def Filter_name(f_text):
    """Shorten a company name by stripping region prefixes and legal suffixes.

    The name is cut after the first '公司' and wrapped in START/END markers,
    so that prefixes are only removed at the very beginning and suffixes only
    at the very end. The word lists come from the module-level ``cn_region``,
    ``company_name_filter`` and ``region_filter`` (defined outside this
    function; presumably loaded from config.ini - verify against the caller).

    Args:
        f_text: Full company name, e.g. 'XXXX有限公司'.

    Returns:
        The shortened name with the markers removed.
    """
    try:
        filter_name = "START%sEND" % re.findall('.*?公司', f_text)[0]
    except (IndexError, TypeError):
        # No '公司' in the name, or the value is not text: use it as-is.
        # Only these two errors are expected; anything else should surface.
        filter_name = "START%sEND" % f_text

    # Pass 1 removes "<region>市"; pass 2 removes the bare region. They must
    # stay separate: once a prefix is removed the START marker is gone and
    # no later prefix rule can match.
    for filter_s in cn_region:
        filter_name = filter_name.replace("START%s市" % filter_s, '')
    for filter_s in cn_region:
        filter_name = filter_name.replace("START%s" % filter_s, '')
    for filter_s in company_name_filter:
        filter_name = filter_name.replace("%sEND" % filter_s, '')
    for filter_s in region_filter:
        filter_name = filter_name.replace("START%s" % filter_s, '')

    # Drop whichever markers are still present.
    filter_name = filter_name.replace("START", '')
    filter_name = filter_name.replace("END", '')
    return filter_name

def Filter_goods(f_text):
    """Shorten the goods/service description of an invoice.

    For a '*'-separated value such as '*category*item', the leading category
    is dropped and the remaining parts are concatenated. All non-word
    characters and underscores are then removed and the result is cut to
    ten characters.

    Args:
        f_text: Raw goods/service text.

    Returns:
        The cleaned description, at most ten characters long.
    """
    if '*' in f_text:
        pieces = f_text.split('*')
        # Keep what follows the second '*'; if nothing is there, fall back to
        # what follows the first '*'.
        kept = pieces[2:] or pieces[1:]
        filter_name = ''.join(kept)
    else:
        filter_name = f_text

    # Raw string: '\W' in a normal string literal is an invalid escape.
    filter_name = re.sub(r'\W+', '', filter_name).replace("_", '')
    return filter_name[:10]

def Clear_temp_file(temp_file):
    """Delete a temporary file.

    Args:
        temp_file: Path of the file to delete.

    Raises:
        OSError: Propagated from the operating system, e.g. when the file
            does not exist.
    """
    os.unlink(temp_file)

def Is_exists(is_path, is_type):
    """Check that a path exists, optionally creating it as a directory.

    Args:
        is_path: Path to check.
        is_type: What to do when the path is missing: ``0`` raises,
            ``1`` creates the directory (including parents). Any other
            value leaves the file system untouched.

    Returns:
        ``is_path``, unchanged.

    Raises:
        UserWarning: If the path is missing and ``is_type`` is ``0``.
    """
    if not os.path.exists(is_path):
        if is_type == 0:
            raise UserWarning(u"config: %s 不存在此目录 " % is_path)
        if is_type == 1:
            os.makedirs(is_path)
    return is_path

def New_file_name(text_info, file_format, file_join):
    """Build a new file name from selected invoice fields.

    Each title in ``file_format`` is translated to an invoice field key via
    the module-level ``fapiao_order`` mapping (defined outside this
    function). Buyer and seller names are shortened with ``Filter_name``,
    the goods text with ``Filter_goods``; other fields are used unchanged.
    A shortened buyer name that is a key of the module-level
    ``company_name`` mapping is replaced by the mapped value.

    Args:
        text_info: Dict of invoice fields for one invoice, indexed by keys
            such as ``'fapiao_buyer_name'``.
        file_format: List of column titles selecting the fields and their
            order, e.g. ``['购买方名称', '发票号码', '开票日期']``.
        file_join: Separator placed between the parts, e.g. ``'-'``.

    Returns:
        The parts joined with ``file_join``.
    """
    file_format_list = []
    for s_name in file_format:
        order_name = fapiao_order[s_name]
        if order_name == 'fapiao_buyer_name':
            filter_info = Filter_name(text_info[order_name])
            # Look the alias up with the name that was just computed; the
            # previous code indexed with an undefined name (buyer_name).
            filter_info = company_name.get(filter_info, filter_info)
        elif order_name == 'fapiao_seller_name':
            filter_info = Filter_name(text_info[order_name])
        elif order_name == 'fapiao_goods':
            filter_info = Filter_goods(text_info[order_name])
        else:
            filter_info = text_info[order_name]
        file_format_list.append(filter_info)

    return file_join.join(file_format_list)
原创文章，作者：cy，如若转载，请注明出处：https://www.cygzs.net/