BeautifulSoup 返回网页标签的名称、父标签名称、行号、完整路径

应用 BeautifulSoup 获取网页标签的名称、父标签名称、行号和完整路径:先用 find_all(True) 遍历指定节点下的所有标签,再用 reversed 将各级父标签的顺序反转,拼接出从根节点到该标签的完整路径。

示例html文件:

<html xmlns="http://www.w3.org/1999/xhtml"><head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="webkit" name="renderer"/>
<title>交易网</title>
</head>
<body>
<div id="menutab_8_2" style="">
<link href="../Template/Default/ztbzx/css/style.css" rel="stylesheet" type="text/css"/>
<script src="../template/default/js/jquery-1.11.0.min.js" type="text/javascript"></script>
<script>
function iFrameHeight() {
var total = document.getElementById("ZtbggxxDetail_LblCount").innerHTML;
for(var i=0;i<parseInt(total);i++)
{
var ifm = document.getElementById("iframejjgz"+i.toString());
var subWeb = document.frames ? document.frames["iframejjgz"+i.toString()].document : ifm.contentDocument;
if (ifm != null && subWeb != null) {
ifm.height = subWeb.body.scrollHeight;
}
}}

</script>
<div id="ZtbbgggDetail_jsgc1_text"><table border="0" cellpadding="0" cellspacing="0" width="100%"><tbody><tr><td align="center"><h2 class="article-title">海运区2020年扶贫道路业绩公示</h2></td></tr></tbody></table><table border="0" cellpadding="0" cellspacing="0" width="100%"><tbody><tr> <td align="center" style="line-height: 25px; color: #4e4e4e;">【信息日期:2020/01/22】</td></tr></tbody></table><table border="0" cellpadding="0" cellspacing="0" width="100%"><tbody><tr><td style="text-align:left;"><div><p>
</p><p align="center" style="margin-top:0.0000pt;margin-right:0.0000pt;margin-bottom:0.0000pt;margin-left:0.0000pt;padding:0pt 0pt 0pt 0pt ;text-align:left;font-family:Calibri;font-size:12.0000pt;margin-top:0.0000pt;margin-right:0.0000pt;margin-bottom:0.0000pt;
margin-left:0.0000pt;padding:0pt 0pt 0pt 0pt ;
text-align:center;vertical-align:baseline;"><b><span style="font-family:宋体;color:rgb(51,51,51);
letter-spacing:0.0000pt;font-weight:bold;text-transform:none;
font-style:normal;font-size:14.0000pt;"><font face="宋体">海运区</font>2020年扶贫道路建设项(公平路)施工中标候选人补充业绩公示</span></b></p>
<div align="center">
<table border="0" cellspacing="0" style="font-family:'Times New Roman';font-size:10.0000pt;border-collapse:collapse;width:435.8000pt;border:none;
">
<tbody>
<tr style="height:16.2000pt;">
<td style="width:72.2500pt;padding:0.0000pt 5.4000pt 0.0000pt 5.4000pt ;border-left:1.0000pt solid windowtext;
border-right:1.0000pt solid windowtext;
border-top:1.0000pt solid windowtext;border-bottom:1.0000pt solid windowtext;
background:rgb(255,255,255);" valign="center" width="96">
<p align="center" style="margin-top:0.0000pt;margin-right:0.0000pt;margin-bottom:0.0000pt;margin-left:0.0000pt;padding:0pt 0pt 0pt 0pt ;text-align:left;font-family:Calibri;font-size:12.0000pt;margin-top:0.0000pt;margin-right:0.0000pt;margin-bottom:0.0000pt;
margin-left:0.0000pt;text-align:center;
vertical-align:baseline;"><b><span style="font-family:宋体;color:rgb(51,51,51);
letter-spacing:0.0000pt;font-weight:bold;text-transform:none;
font-style:normal;font-size:12.0000pt;">招生人</span></b></p>
</td>
<td style="width:363.5500pt;padding:0.0000pt 5.4000pt 0.0000pt 5.4000pt ;border-left:none;
border-right:1.0000pt solid windowtext;
border-top:1.0000pt solid windowtext;border-bottom:1.0000pt solid windowtext;
background:rgb(255,255,255);" valign="center" width="484">
<p align="center" style="margin-top:0.0000pt;margin-right:0.0000pt;margin-bottom:0.0000pt;margin-left:0.0000pt;padding:0pt 0pt 0pt 0pt ;text-align:left;font-family:Calibri;font-size:12.0000pt;margin-top:0.0000pt;margin-right:0.0000pt;margin-bottom:0.0000pt;
margin-left:0.0000pt;text-align:center;
"><span style="font-family:宋体;color:rgb(51,51,51);
font-size:12.0000pt;">东临分局</span></p>
</td>
</tr>

</tbody>
</table>
</div>
<p align="justify" style="margin-top:0.0000pt;margin-right:0.0000pt;margin-bottom:0.0000pt;margin-left:0.0000pt;padding:0pt 0pt 0pt 0pt ;text-align:left;font-family:Calibri;font-size:12.0000pt;margin-top:0.0000pt;margin-right:0.0000pt;margin-bottom:0.0000pt;
margin-left:0.0000pt;text-indent:28.1000pt;padding:0pt 0pt 0pt 0pt ;
text-align:justify;text-justify:inter-ideograph;
"><b><span style="font-family:宋体;color:rgb(51,51,51);
font-weight:bold;font-size:12.0000pt;"><font face="宋体">若招生人对上述结果有质疑,请在公示期内,首先以书面形式向本项目招标人提出质疑,联系电话:</font>3258-23387677;也可以向招标代理公司反映,联系电话:5658-</span></b><b><span style="font-family:宋体;color:rgb(51,51,51);
font-weight:bold;font-size:12.0000pt;">58956709</span></b><b><span style="font-family:宋体;color:rgb(51,51,51);
font-weight:bold;font-size:12.0000pt;"><font face="宋体">;接受投诉单位及电话:交易监督管理局电话:</font>3258-23387677。</span></b></p>
<br/>
<p></p></div></td></tr></tbody></table><table border="0" cellpadding="0" cellspacing="0" width="100%"><tbody><tr><td><div></div></td></tr></tbody></table></div>
</div>
</body></html>

代码部分:

from bs4 import BeautifulSoup
import os
import re
import pandas as pd
from bs4 import NavigableString,Comment


# Print the name, parent name, source line and full dotted path of each tag.
def getelmentpath(inFile, container_id='menutab_8_2'):
    """Print one tab-separated line per tag nested inside a container ``<div>``.

    Each printed line holds four fields: the tag name, the name of its direct
    parent, the tag's ``sourceline`` as reported by the parser, and the dotted
    path built from all ancestor names (outermost first) followed by the tag
    name itself.

    Args:
        inFile: Path of the HTML file to parse; it is read as UTF-8.
        container_id: ``id`` attribute of the ``<div>`` whose descendants are
            listed. Defaults to ``'menutab_8_2'``, the value that used to be
            hard-coded.

    Returns:
        None. The result is written to standard output. Nothing is printed
        when no ``<div>`` with the requested id exists in the document.
    """
    # Read inside a context manager so the handle is released even if
    # reading or parsing raises.
    with open(inFile, encoding="utf-8") as f:
        bs = BeautifulSoup(f.read(), 'html.parser')

    container = bs.find('div', {'id': container_id})
    if container is None:
        # No matching container: there is nothing to list, so return instead
        # of failing with an AttributeError on None.
        return

    # find_all(True) matches every tag below the container, at any depth.
    for elem in container.find_all(True):
        # ``parents`` yields ancestors from the direct parent outwards;
        # reversing that order gives the path from the outermost node inwards.
        root_childs = '.'.join(reversed([p.name for p in elem.parents]))
        print("\t".join([
            elem.name,
            elem.parent.name,
            str(elem.sourceline),
            root_childs + "." + elem.name,
        ]))

# Script entry point: list the tags of the demo page when run directly.
if __name__ == "__main__":
    demo_html = r"D:/demo.html"
    getelmentpath(demo_html)

执行结果:

link    div     9       [document].html.body.div.link
script div 10 [document].html.body.div.script
script div 11 [document].html.body.div.script
div div 24 [document].html.body.div.div
table div 24 [document].html.body.div.div.table
tbody table 24 [document].html.body.div.div.table.tbody
tr tbody 24 [document].html.body.div.div.table.tbody.tr
td tr 24 [document].html.body.div.div.table.tbody.tr.td
h2 td 24 [document].html.body.div.div.table.tbody.tr.td.h2
table div 24 [document].html.body.div.div.table
tbody table 24 [document].html.body.div.div.table.tbody
tr tbody 24 [document].html.body.div.div.table.tbody.tr
td tr 24 [document].html.body.div.div.table.tbody.tr.td
table div 24 [document].html.body.div.div.table
tbody table 24 [document].html.body.div.div.table.tbody
tr tbody 24 [document].html.body.div.div.table.tbody.tr
td tr 24 [document].html.body.div.div.table.tbody.tr.td
div td 24 [document].html.body.div.div.table.tbody.tr.td.div
p div 24 [document].html.body.div.div.table.tbody.tr.td.div.p
p div 25 [document].html.body.div.div.table.tbody.tr.td.div.p
b p 27 [document].html.body.div.div.table.tbody.tr.td.div.p.b
span b 27 [document].html.body.div.div.table.tbody.tr.td.div.p.b.span
font span 29 [document].html.body.div.div.table.tbody.tr.td.div.p.b.span.font
div div 30 [document].html.body.div.div.table.tbody.tr.td.div.div
table div 31 [document].html.body.div.div.table.tbody.tr.td.div.div.table
tbody table 33 [document].html.body.div.div.table.tbody.tr.td.div.div.table.tbody
tr tbody 34 [document].html.body.div.div.table.tbody.tr.td.div.div.table.tbody.tr
td tr 35 [document].html.body.div.div.table.tbody.tr.td.div.div.table.tbody.tr.td
p td 39 [document].html.body.div.div.table.tbody.tr.td.div.div.table.tbody.tr.td.p
b p 41 [document].html.body.div.div.table.tbody.tr.td.div.div.table.tbody.tr.td.p.b
span b 41 [document].html.body.div.div.table.tbody.tr.td.div.div.table.tbody.tr.td.p.b.span
td tr 45 [document].html.body.div.div.table.tbody.tr.td.div.div.table.tbody.tr.td
p td 49 [document].html.body.div.div.table.tbody.tr.td.div.div.table.tbody.tr.td.p
span p 51 [document].html.body.div.div.table.tbody.tr.td.div.div.table.tbody.tr.td.p.span
p div 59 [document].html.body.div.div.table.tbody.tr.td.div.p
b p 62 [document].html.body.div.div.table.tbody.tr.td.div.p.b
span b 62 [document].html.body.div.div.table.tbody.tr.td.div.p.b.span
font span 63 [document].html.body.div.div.table.tbody.tr.td.div.p.b.span.font
b p 63 [document].html.body.div.div.table.tbody.tr.td.div.p.b
span b 63 [document].html.body.div.div.table.tbody.tr.td.div.p.b.span
b p 64 [document].html.body.div.div.table.tbody.tr.td.div.p.b
span b 64 [document].html.body.div.div.table.tbody.tr.td.div.p.b.span
font span 65 [document].html.body.div.div.table.tbody.tr.td.div.p.b.span.font
br div 66 [document].html.body.div.div.table.tbody.tr.td.div.br
p div 67 [document].html.body.div.div.table.tbody.tr.td.div.p
table div 67 [document].html.body.div.div.table
tbody table 67 [document].html.body.div.div.table.tbody
tr tbody 67 [document].html.body.div.div.table.tbody.tr
td tr 67 [document].html.body.div.div.table.tbody.tr.td
div td 67 [document].html.body.div.div.table.tbody.tr.td.div