XML格式文档解析

背景

在大数据平台项目中,需要为中国省级行政区名称定制一条清洗规则:如果数据中出现不规范的省份名称,就将其修正为标准名称,例如 香港 -> 香港特别行政区,宁夏 -> 宁夏回族自治区。为此,将所有标准省份名称保存在 XML 文件中,再对该 XML 文件进行解析并读取到内存中。特此记录一下解析过程。

code


import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;


import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import parquet.org.slf4j.Logger;
import parquet.org.slf4j.LoggerFactory;

/**
 * Reads province definitions from an XML classpath resource using the DOM API.
 *
 * <p>Every {@code <Province>} element in the document is turned into one
 * {@link Province}: the {@code ID} attribute supplies the numeric id and the
 * value of the element's first child node supplies the province name.
 *
 * @author wangyg
 */
public class ReadxmlByDom {
    private static final Logger LOG = LoggerFactory.getLogger(ReadxmlByDom.class);

    /**
     * Parses the named classpath resource and returns one {@link Province} per
     * {@code <Province>} element, in document order.
     *
     * @param fileName resource name, resolved through this class's class loader
     * @return the parsed provinces, or {@code null} when the resource cannot be found
     * @throws IllegalArgumentException if a {@code <Province>} element has no
     *         {@code ID} attribute or its value is not an integer
     * @throws Exception if the parser cannot be created or the document cannot be
     *         read or parsed
     */
    public static List<Province> getBooks(String fileName) throws Exception {
        // try-with-resources closes the stream on every exit path; a null
        // resource is permitted and is simply not closed.
        try (InputStream in = ReadxmlByDom.class.getClassLoader().getResourceAsStream(fileName)) {
            if (in == null) {
                LOG.error("in is null ...");
                return null;
            }

            // A fresh builder per call: no shared mutable parser state, and a
            // configuration failure surfaces here instead of being swallowed
            // at class-initialization time.
            DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            Document document = builder.parse(in);

            // All elements named "Province", in document order.
            NodeList provinceNodes = document.getElementsByTagName("Province");
            List<Province> provinces = new ArrayList<>(provinceNodes.getLength());
            for (int i = 0; i < provinceNodes.getLength(); i++) {
                provinces.add(toProvince(provinceNodes.item(i)));
            }
            return provinces;
        }
    }

    /** Converts a single {@code <Province>} node into a {@link Province} bean. */
    private static Province toProvince(Node node) {
        NamedNodeMap attributes = node.getAttributes();
        Node idAttribute = attributes.getNamedItem("ID");
        if (idAttribute == null) {
            throw new IllegalArgumentException("<Province> element is missing the ID attribute");
        }

        Province province = new Province();
        province.setID(Integer.parseInt(idAttribute.getTextContent()));

        // The name is the value of the first child node; an empty element has
        // no children, in which case the name is left unset instead of failing.
        Node firstChild = node.getFirstChild();
        if (firstChild == null) {
            LOG.warn("Province element with ID " + province.getID() + " has no content");
        } else {
            province.setProvinceName(firstChild.getNodeValue());
        }
        return province;
    }

    /**
     * Manual smoke test: loads {@code Provinces.xml} from the classpath and
     * prints every parsed province.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String fileName = "Provinces.xml";
        try {
            List<Province> list = ReadxmlByDom.getBooks(fileName);
            if (list == null) {
                // getBooks signals a missing resource with null.
                System.err.println("Resource not found on the classpath: " + fileName);
                return;
            }
            for (Province pro : list) {
                System.out.println(pro);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

bean类

package com.qjzh.etown.bdp.util;

/**
 * Simple bean describing one province entry: a numeric id and the province name.
 */
public class Province {
    /** Numeric identifier of the province. */
    private int ID;
    /** Name of the province; {@code null} until set. */
    private String ProvinceName;

    /** @return the numeric identifier */
    public int getID() {
        return this.ID;
    }

    /** @param ID the numeric identifier to store */
    public void setID(int ID) {
        this.ID = ID;
    }

    /** @return the province name, or {@code null} if none has been set */
    public String getProvinceName() {
        return this.ProvinceName;
    }

    /** @param provinceName the province name to store */
    public void setProvinceName(String provinceName) {
        this.ProvinceName = provinceName;
    }

    /**
     * Renders the bean as {@code Province{ID=<id>, ProvinceName='<name>'}}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Province{");
        text.append("ID=").append(this.ID);
        text.append(", ProvinceName='").append(this.ProvinceName).append('\'');
        text.append('}');
        return text.toString();
    }
}

遇到的问题

xml文件找不到

将 XML 文件放在 resources 目录下后,程序运行时找不到该文件,无法访问。

解决:查看 DocumentBuilder 源码中的接口,发现 parse 方法可以接收 InputStream,所以改为通过类加载器以输入流的方式读取该文件,即下面这种方式:

// fileName is the name of the XML file; it is looked up as a resource through the class loader,
// and a null stream (resource not found) is logged and reported to the caller as null
InputStream in = ReadxmlByDom.class.getClassLoader().getResourceAsStream(fileName);
if(in == null){
LOG.error("in is null ...");
return null;
}
document = db.parse(in);