提取第三张报告的各种信息。使用正则和xpath方法。



# coding=utf8

import re,json,os
from lxml import etree
from collections import OrderedDict
from common import html,LoggerUntil,handle_parse_exception
from html_processor import  HtmlProcessor

# Module-level logger shared by everything in this file, built with the
# project's LoggerUntil helper and writing to 'crcc_paser.log'.
# NOTE(review): the meaning of loglevel=2 and add_StreamHandler=1 is defined
# in `common` -- confirm there before changing them.
logger = LoggerUntil(name="crcc_paser").getlog(logfilename='crcc_paser.log',loglevel=2,add_StreamHandler=1)
class HtmlFileMixin(object):
    """Mixin that dumps ``self.html`` to ``htmldir/<self.name>.html``.

    The host class must provide two attributes: ``html`` (the payload, written
    in binary mode) and ``name`` (used as the file's base name).
    """

    def save_to_file(self):
        """Write ``self.html`` to the path returned by :meth:`create_file`."""
        target_path = self.create_file()
        with open(target_path, 'wb') as out_file:
            out_file.write(self.html)

    def create_file(self):
        """Return the output path, creating the ``htmldir`` folder if needed.

        Despite its name this method does not create the file itself; it only
        makes sure the directory exists (relative to the current working
        directory) and builds the path string.
        """
        directory = 'htmldir'
        if not os.path.exists(directory):
            os.mkdir(directory)
        return directory + '/' + self.name + '.html'


class CrccPaser(HtmlFileMixin):
    """Parse a credit-report HTML page into an ordered mapping of fields.

    The raw page is decoded once, kept as ``self.text`` for the regex based
    extractors and parsed into ``self.selector`` for XPath queries.  Results
    are accumulated in ``self.data`` (an ``OrderedDict`` whose first key is
    ``'name'``); call :meth:`extract_all` to run every extraction step.
    """

    # Sentences the report prints when a section has no entries.
    _NO_PUBLIC_RECORDS = u'系统中没有您最近5年内的欠税记录、民事判决记录、强制执行记录、行政处罚记录及电信欠费记录。'
    _NO_QUERY_RECORDS = u'系统中没有您的信用报告最近2年被查询的记录。'

    # (output key, label) pairs of the header fields.  Each value sits between
    # "<label>:" and the closing </strong> tag.
    _USER_INFO_FIELDS = (
        ('report_no', u'报告编号'),
        ('query_time', u'查询时间'),
        ('report_time', u'报告时间'),
        ('crcc_name', u'姓名'),
        ('id_type', u'证件类型'),
        ('id_no', u'证件号码'),
    )

    # (output key, row label) pairs of the summary table.  The leading spaces
    # in each label are part of the pattern and are matched literally.
    _SUMMARY_ROWS = (
        ('account_num', u' 账户数'),
        ('uncleared_num', u'   未结清/未销户账户数'),
        ('overdue_num', u' 发生过逾期的账户数'),
        ('overdue90_num', u'   发生过90天以上逾期的账户数'),
        ('assure_num', u' 为他人担保笔数'),
    )

    # Backslashes are doubled so the literals are valid on both Python 2 and
    # Python 3 (a bare "\s" in a non-raw string is an invalid escape sequence).
    _SUMMARY_CELL = u'\\s*?<td align="center" class="p">\\s*(.*?)\\s*?</td>'
    _SUMMARY_ROW_TEMPLATE = (
        u'<tr>\\s*?<td align="left" class="p">\\s*?%s\\s*?</td>'
        + _SUMMARY_CELL * 3
        + u'\\s*?</tr>'
    )

    _LOAN_ITEM_PATTERN = (
        u'<li\\s*?style="list-style-type: decimal; list-style-position: outside">'
        u'\\s*?(\\S*?)\\s*?</li>'
    )

    # One query-record row: number, date (ends with "日"), querier, reason.
    _QUERY_ROW_PATTERN = (
        u'<tr align="center">'
        u'[\\s\\S]*?td class="p">\\s*(.*?)\\s*?</td>'
        u'[\\s\\S]*?<td class="p">\\s*(.*?日)\\s*?</td>'
        u'[\\s\\S]*?<td class="p">\\s*(.*?)\\s*?</td>'
        u'[\\s\\S]*?<td class="p">\\s*(.*?)\\s*?</td>'
        u'[\\s\\S]*?</tr>'
    )

    def __init__(self, html, name):
        """Store the page and prepare the text and XPath views of it.

        :param html: UTF-8 encoded page source (it is ``.decode('utf8')``-ed).
        :param name: report name; stored as ``self.data['name']`` and used by
            the mixin to build the output file name.
        """
        self.html = html
        self.name = name
        self.data = OrderedDict()
        self.data['name'] = name
        self.selector = None
        self.text = self._get_text()
        self.get_selector()

    def _get_text(self):
        """Return the page source decoded from UTF-8."""
        return self.html.decode('utf8')

    def get_selector(self):
        """Parse ``self.text`` with lxml and keep the tree in ``self.selector``."""
        self.selector = etree.HTML(self.text)

    def extract_user_info(self):
        """Fill the header fields (report number, times, name, ID) into ``self.data``.

        :raises AttributeError: if a field is not found, because ``re.search``
            then returns ``None``.
        """
        for key, label in self._USER_INFO_FIELDS:
            found = re.search(label + u':(.*?)</strong>', self.text)
            self.data[key] = found.group(1).strip()

    def extract_summary_information(self):
        """Fill the five summary-table rows into ``self.data``.

        Each row becomes a dict produced by :meth:`_init_num_dict`.

        :raises AttributeError: if a row is not found in the page.
        """
        for key, label in self._SUMMARY_ROWS:
            row_pattern = self._SUMMARY_ROW_TEMPLATE % label
            counts = re.search(row_pattern, self.text).groups()
            self.data[key] = self._init_num_dict(counts)

    @staticmethod
    def _init_num_dict(num_tuple):
        """Map a 3-tuple of cell values onto the summary column names.

        :param num_tuple: values in column order (credit card, home loans,
            other loans).
        :raises ValueError: if ``num_tuple`` does not hold exactly three items.
        """
        credit_card, home_loans, other_loans = num_tuple
        return {
            'credit_card': credit_card,
            # NOTE(review): the leading space in this key looks like a typo,
            # but it is part of the emitted JSON, so it is kept for backward
            # compatibility with existing consumers.
            ' home_loans': home_loans,
            'other_loans': other_loans,
        }

    def extract_all_loan_information(self):
        """Store the text of every numbered loan list item (may be empty)."""
        self.data['all_loan_information'] = re.findall(
            self._LOAN_ITEM_PATTERN, self.text)

    def extract_public_records(self):
        """Store the public-records section as a list of strings.

        When the page carries the "no records" sentence that sentence is the
        only list item.  Otherwise the text of the first matching table cell
        is used, or an empty list when the XPath matches nothing.
        """
        if self._NO_PUBLIC_RECORDS in self.text:
            public_records = [self._NO_PUBLIC_RECORDS]
        else:
            # TODO: the exact layout of a report that does contain public
            # records has not been confirmed yet.
            # NOTE(review): the path contains "tbody"; presumably it was taken
            # from a browser DOM -- verify it matches the lxml tree as well.
            cells = self.selector.xpath(
                '//table[@align="center"]//table[5]/tbody/tr[3]/td')
            if cells:
                # An lxml element has no .strip(); collect its text first.
                public_records = [u''.join(cells[0].itertext()).strip()]
            else:
                public_records = []
        self.data['public_records'] = public_records

    def extract_query_records(self):
        """Store the query-records table as a list.

        The list holds one ``OrderedDict`` per table row, or the single
        "no records" sentence when the page contains it.
        """
        if self._NO_QUERY_RECORDS in self.text:
            query_records = [self._NO_QUERY_RECORDS]
        else:
            rows = re.findall(self._QUERY_ROW_PATTERN, self.text)
            query_records = self._init_query_records(rows)
        self.data['query_records'] = query_records

    @staticmethod
    def _init_query_records(query_records):
        """Turn 4-tuples of cell values into ordered record dicts.

        :type query_records: list
        :raises ValueError: if an item does not hold exactly four values.
        """
        records = []
        for number, query_date, query_person, query_reason in query_records:
            records.append(OrderedDict([
                ('no', number),
                ('query_date', query_date),
                ('query_person', query_person),
                ('query_reason', query_reason),
            ]))
        return records

    @handle_parse_exception
    def extract_all(self):
        """Run every extraction step in report order.

        Error handling is delegated to the ``handle_parse_exception``
        decorator imported from ``common``.
        """
        self.extract_user_info()
        self.extract_summary_information()
        self.extract_all_loan_information()
        self.extract_public_records()
        self.extract_query_records()



def extract_crcc(html_str,name):
    """Save the raw page, parse it and return the extracted fields as JSON.

    :param html_str: page source handed to both ``HtmlProcessor`` and
        ``CrccPaser``.
    :param name: report name, used for the saved file and the ``name`` field.
    :return: ``CrccPaser.data`` serialised with ``ensure_ascii=False``.
    """
    # Keep a copy of the HTML on disk before parsing.
    HtmlProcessor(html_str, name).save_to_file()

    parser = CrccPaser(html_str, name)
    parser.extract_all()

    # Serialise once; the same string is logged and returned.
    result_json = json.dumps(parser.data, ensure_ascii=False)
    logger.info(result_json)  # TODO
    return result_json

# Script entry point: run the extraction on the page source `html` that is
# imported from `common`, using a fixed report name.
if __name__ == '__main__':
    extract_crcc(html,'小明5')



其中html是第三张报告的页面源码字符串。

结果是



{"name": "小明5", "report_no": "2017122200004891965680", "query_time": "2017.12.22 11:12:32", "report_time": "2017.12.22 18:38:18", "crcc_name": "小明5", "id_type": "身份证", "id_no": "**************4337", "account_num": {" home_loans": "0", "other_loans": "2", "credit_card": "0"}, "uncleared_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "overdue_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "overdue90_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "assure_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "all_loan_information": ["2012年8月23日国家开发银行湖北省分行发放的6,000元(人民币)个人助学贷款,2014年10月已结清。", "2011年11月19日国家开发银行湖北省分行发放的6,000元(人民币)个人助学贷款,2014年10月已结清。"], "public_records": ["系统中没有您最近5年内的欠税记录、民事判决记录、强制执行记录、行政处罚记录及电信欠费记录。"], "query_records": [{"no": "1", "query_date": "2017年12月4日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "2", "query_date": "2017年11月20日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "3", "query_date": "2017年11月6日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "4", "query_date": "2017年10月20日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "5", "query_date": "2017年10月10日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "6", "query_date": "2017年9月27日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}, {"no": "7", "query_date": "2017年9月18日", "query_person": "本人", "query_reason": "本人查询(互联网个人信用信息服务平台)"}]}



可以把页面源码发送到后端,用py或java提取;也可以在webview客户端直接提取,客户端提取的js代码如下。



/**
 * Created by wj49476 on 201/3/20.
 */

/**
 * Extract the credit-report fields from the page currently loaded in the
 * webview and return them as a JSON string.
 *
 * The page markup (document.body.outerHTML) is scanned mostly with regular
 * expressions. The result also carries the Base64-encoded markup under the
 * key "htmlStr". All helpers are local: the function does not modify any
 * built-in prototype and does not create global variables.
 *
 * @returns {string} JSON.stringify() of the collected data object.
 */
function extractReport() {

    // Return capture group 1 of a single-group pattern, or '' when the
    // pattern does not match (so a missing field never throws).
    function matchGroup(str, regObj) {
        var matchArray = str.match(regObj);
        if (matchArray && matchArray.length == 2) {
            return matchArray[1];
        }
        return '';
    }

    // Safe index access: returns '' instead of undefined, also when the
    // array itself is null/undefined (e.g. a failed String.match()).
    function itemAt(array, n) {
        var value = array ? array[n] : undefined;
        if (value === undefined) {
            console.debug("取下标错误");
            return '';
        }
        return value;
    }

    // innerText of the first element matching a CSS selector, or "".
    function getInnerText(cssselector) {
        var element = document.querySelector(cssselector);
        if (element) {
            return element.innerText;
        }
        console.warn("没有找到 " + cssselector + " 的元素");
        return "";
    }

    // Collect every match of a global pattern; empty array when none match.
    function matchAll(pattern) {
        var matches = [];
        var match;
        pattern.lastIndex = 0;
        while ((match = pattern.exec(htmlStr)) !== null) {
            matches.push(match);
        }
        return matches;
    }

    var data = {};
    data['SummaryInformation'] = {};
    var htmlStr = document.body.outerHTML;

    // [output key, row label] pairs of the summary table. The leading
    // spaces of each label are matched literally.
    var SUMMARY_ROWS = [
        ['accountNum', ' 账户数'],
        ['unclearedNum', '   未结清/未销户账户数'],
        ['overdueNum', ' 发生过逾期的账户数'],
        ['overdue90Num', '   发生过90天以上逾期的账户数'],
        ['assureNum', ' 为他人担保笔数']
    ];
    var SUMMARY_CELL = '\\s*?<td align="center" class="p">\\s*(.*?)\\s*?</td>';

    var LOAN_ITEM_PATTERN = /<li\s*?style="list-style-type: decimal; list-style-position: outside">\s*?(\S*?)\s*?<\/li>/g;
    var QUERY_ROW_PATTERN = /<tr align="center">[\s\S]*?td class="p">\s*(.*?)\s*?<\/td>[\s\S]*?<td class="p">\s*(.*?日)\s*?<\/td>[\s\S]*?<td class="p">\s*(.*?)\s*?<\/td>[\s\S]*?<td class="p">\s*(.*?)\s*?<\/td>[\s\S]*?<\/tr>/g;

    var NO_PUBLIC_RECORDS = '系统中没有您最近5年内的欠税记录、民事判决记录、强制执行记录、行政处罚记录及电信欠费记录。';
    var NO_QUERY_RECORDS = '系统中没有您的信用报告最近2年被查询的记录。';

    // Header fields: each value sits between "<label>:" and </strong>.
    function extractUserInfo() {
        data["reportNo"] = matchGroup(htmlStr, /报告编号:(.*?)<\/strong>/);
        data["queryTime"] = matchGroup(htmlStr, /查询时间:(.*?)<\/strong>/);
        data['reportTime'] = matchGroup(htmlStr, /报告时间:(.*?)<\/strong>/);
        data['crccName'] = matchGroup(htmlStr, /姓名:(.*?)<\/strong>/);
        data['idType'] = matchGroup(htmlStr, /证件类型:(.*?)<\/strong>/);
        data['idNo'] = matchGroup(htmlStr, /证件号码:(.*?)<\/strong>/);
    }

    // Summary table: one label cell followed by three value cells per row.
    function extractSummaryInformation() {
        for (var i = 0; i < SUMMARY_ROWS.length; i++) {
            var rowPattern = new RegExp(
                '<tr>\\s*?<td align="left" class="p">\\s*?' + SUMMARY_ROWS[i][1] + '\\s*?</td>' +
                SUMMARY_CELL + SUMMARY_CELL + SUMMARY_CELL +
                '\\s*?</tr>');
            data['SummaryInformation'][SUMMARY_ROWS[i][0]] = _initNumDict(htmlStr.match(rowPattern));
        }
    }

    // Map the three captured cells onto the column names. A null match
    // yields empty strings for every column.
    function _initNumDict(array) {
        var numDict = {};
        numDict["creditCard"] = itemAt(array, 1);
        numDict["homeLoans"] = itemAt(array, 2);
        // NOTE(review): "othreLoans" looks like a typo of "otherLoans"; the
        // key is kept because it is part of the emitted JSON.
        numDict["othreLoans"] = itemAt(array, 3);
        return numDict;
    }

    // Text of every numbered loan list item (empty array when there is none).
    function extractAllLoanInformation() {
        var loanMatches = matchAll(LOAN_ITEM_PATTERN);
        var allLoanInformation = [];
        for (var i = 0; i < loanMatches.length; i++) {
            allLoanInformation.push(itemAt(loanMatches[i], 1));
        }
        data['allLoanInformation'] = allLoanInformation;
    }

    function extractPublicRecords() {
        var publicRecords;
        if (htmlStr.indexOf(NO_PUBLIC_RECORDS) === -1) {
            // No sample with real public records was available, so this
            // selector is unverified.
            publicRecords = getInnerText('body > div > div > table > tbody > tr:nth-child(2) > td > table:nth-child(11)');
        } else {
            publicRecords = NO_PUBLIC_RECORDS;
        }
        data['publicRecords'] = publicRecords;
    }

    function extractQueryRecords() {
        if (htmlStr.indexOf(NO_QUERY_RECORDS) === -1) {
            data['queryRecords'] = _initQueryRecords(matchAll(QUERY_ROW_PATTERN));
        } else {
            data['queryRecords'] = [NO_QUERY_RECORDS];
        }
    }

    // Turn each row match (groups 1-4) into a record object.
    function _initQueryRecords(queryRecords) {
        var queryRecordsArray = [];
        for (var i = 0; i < queryRecords.length; i++) {
            var queryRecordDict = {};
            queryRecordDict['no'] = itemAt(queryRecords[i], 1);
            queryRecordDict['queryDate'] = itemAt(queryRecords[i], 2);
            queryRecordDict['queryPerson'] = itemAt(queryRecords[i], 3);
            queryRecordDict['queryReason'] = itemAt(queryRecords[i], 4);
            queryRecordsArray.push(queryRecordDict);
        }
        return queryRecordsArray;
    }

    function extractReportInner() {
        extractUserInfo();
        extractSummaryInformation();
        extractAllLoanInformation();
        extractPublicRecords();
        extractQueryRecords();
    }

    extractReportInner();
    data['htmlStr'] = Base64.encode(htmlStr);
    return JSON.stringify(data);
}


// Base64 encoder/decoder for JavaScript strings. Text is converted to UTF-8
// bytes (held as a string of char codes 0-255) before encoding and converted
// back after decoding.
var Base64 = {

    // Base64 alphabet; index 64 ("=") is the padding character.
    _keyStr: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",

    // Encode a string as Base64 of its UTF-8 bytes.
    // Note: "\r\n" is normalised to "\n" first (see _utf8_encode).
    encode: function(input) {
        var output = "";
        var chr1, chr2, chr3, enc1, enc2, enc3, enc4;
        var i = 0;

        input = Base64._utf8_encode(input);

        while (i < input.length) {

            // Reading past the end yields NaN, which marks missing bytes.
            chr1 = input.charCodeAt(i++);
            chr2 = input.charCodeAt(i++);
            chr3 = input.charCodeAt(i++);

            enc1 = chr1 >> 2;
            enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);
            enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);
            enc4 = chr3 & 63;

            if (isNaN(chr2)) {
                enc3 = enc4 = 64;
            } else if (isNaN(chr3)) {
                enc4 = 64;
            }

            output = output + this._keyStr.charAt(enc1) + this._keyStr.charAt(enc2) + this._keyStr.charAt(enc3) + this._keyStr.charAt(enc4);

        }

        return output;
    },

    // Decode Base64 text back into a string. Characters outside the Base64
    // alphabet are dropped before decoding.
    decode: function(input) {
        var output = "";
        var chr1, chr2, chr3;
        var enc1, enc2, enc3, enc4;
        var i = 0;

        input = input.replace(/[^A-Za-z0-9\+\/\=]/g, "");

        while (i < input.length) {

            enc1 = this._keyStr.indexOf(input.charAt(i++));
            enc2 = this._keyStr.indexOf(input.charAt(i++));
            enc3 = this._keyStr.indexOf(input.charAt(i++));
            enc4 = this._keyStr.indexOf(input.charAt(i++));

            chr1 = (enc1 << 2) | (enc2 >> 4);
            chr2 = ((enc2 & 15) << 4) | (enc3 >> 2);
            chr3 = ((enc3 & 3) << 6) | enc4;

            output = output + String.fromCharCode(chr1);

            // 64 is the padding index: the corresponding byte is absent.
            if (enc3 != 64) {
                output = output + String.fromCharCode(chr2);
            }
            if (enc4 != 64) {
                output = output + String.fromCharCode(chr3);
            }

        }

        output = Base64._utf8_decode(output);

        return output;

    },

    // Convert a JavaScript (UTF-16) string into a string of UTF-8 byte
    // values. A valid surrogate pair is combined into one code point and
    // written as a 4-byte sequence; an unpaired surrogate falls through to
    // the 3-byte form.
    _utf8_encode: function(string) {
        string = string.replace(/\r\n/g, "\n");
        var utftext = "";

        for (var n = 0; n < string.length; n++) {

            var c = string.charCodeAt(n);

            if (c >= 0xD800 && c <= 0xDBFF && n + 1 < string.length) {
                var low = string.charCodeAt(n + 1);
                if (low >= 0xDC00 && low <= 0xDFFF) {
                    c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                    n++; // the low surrogate has been consumed
                }
            }

            if (c < 128) {
                utftext += String.fromCharCode(c);
            } else if (c < 2048) {
                utftext += String.fromCharCode((c >> 6) | 192);
                utftext += String.fromCharCode((c & 63) | 128);
            } else if (c < 65536) {
                utftext += String.fromCharCode((c >> 12) | 224);
                utftext += String.fromCharCode(((c >> 6) & 63) | 128);
                utftext += String.fromCharCode((c & 63) | 128);
            } else {
                utftext += String.fromCharCode((c >> 18) | 240);
                utftext += String.fromCharCode(((c >> 12) & 63) | 128);
                utftext += String.fromCharCode(((c >> 6) & 63) | 128);
                utftext += String.fromCharCode((c & 63) | 128);
            }

        }

        return utftext;
    },

    // Convert a string of UTF-8 byte values back into a JavaScript string.
    // 4-byte sequences become surrogate pairs; surrogates stored as two
    // separate 3-byte sequences still decode through the 3-byte branch.
    _utf8_decode: function(utftext) {
        var string = "";
        var i = 0;
        var c = 0, c2 = 0, c3 = 0, c4 = 0;

        while (i < utftext.length) {

            c = utftext.charCodeAt(i);

            if (c < 128) {
                string += String.fromCharCode(c);
                i++;
            } else if ((c > 191) && (c < 224)) {
                c2 = utftext.charCodeAt(i + 1);
                string += String.fromCharCode(((c & 31) << 6) | (c2 & 63));
                i += 2;
            } else if (c > 239) {
                c2 = utftext.charCodeAt(i + 1);
                c3 = utftext.charCodeAt(i + 2);
                c4 = utftext.charCodeAt(i + 3);
                var codePoint = (((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63)) - 0x10000;
                string += String.fromCharCode(0xD800 + (codePoint >> 10), 0xDC00 + (codePoint & 0x3FF));
                i += 4;
            } else {
                c2 = utftext.charCodeAt(i + 1);
                c3 = utftext.charCodeAt(i + 2);
                string += String.fromCharCode(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
                i += 3;
            }

        }

        return string;
    }

};



这个是js版本。由于央行征信报告页面是拼接的,css取值不能一步到位,还必须再用正则细取;再者页面元素没有很好的标记,所以js版也是以正则为主。js的match对应py的search,js的match加g模式对应py的findall。js的search返回的是一个数字(匹配位置的下标),所以要先弄清楚py和js的正则api的区别。

2、另外,js版本的用法是配合app的webview使用。在f12的console栏里面可以调试测试js,但那不是自动化的;尤其在多个页面跳转的情况下,使用webview注入js非常方便。也可以直接在app端用httpclient对淘宝网发请求,但是征信类的项目一般需要先登录。不依赖webview、直接使用httpclient请求淘宝登录接口,大批量登录任意明文的账号密码,而不是复制ua、cookie什么的(复制没什么鸟用,用户根本不知道cookie是什么,更不用说ua参数是什么了),搞定它是天方夜谭,我没见过任何人搞定过,难度指数是五颗星。有兴趣的可以试试,不要只是嘴炮说抓包模拟就完了这么简单。