Microsoft的XML大师Chris Lovett发布了一个新的SGML解析器(应该是2008年的版本),叫做SgmlReader(早期的SgmlReader在2006年以前就出了),它可以解析HTML文件,甚至将它们转换成一个格式规范的结构。SgmlReader派生于XmlReader,这就是说,你可以像运用诸如XmlTextReader这样的类来解析XML文件那样来解析HTML文件。
/// <summary>
/// Reads an HTML file, passes it through <c>SgmlReader</c> to obtain XML markup, and
/// optionally extracts the values of the nodes selected by an XPath expression.
/// </summary>
/// <param name="filePath">Path of the HTML file to read.</param>
/// <param name="xpath">
/// XPath expression evaluated against the converted markup, or <c>null</c> to return
/// the converted markup itself.
/// </param>
/// <returns>
/// The indented markup produced by the writer when <paramref name="xpath"/> is
/// <c>null</c>; otherwise the value of every selected node, each followed by a
/// carriage return (char 13). If any exception is thrown, its <c>Message</c> is
/// returned instead of propagating the exception.
/// </returns>
private string GetWellFormedHTMLFile(string filePath, string xpath)
{
    try
    {
        string wellFormed;

        // using/finally guarantee that the file handle, reader and writers are
        // released on the success path as well as on failure.
        using (StreamReader sReader = new StreamReader(filePath))
        using (StringWriter sw = new StringWriter())
        {
            SgmlReader reader = new SgmlReader();
            XmlTextWriter writer = new XmlTextWriter(sw);
            try
            {
                reader.DocType = "HTML";
                reader.InputStream = new StringReader(sReader.ReadToEnd());
                writer.Formatting = Formatting.Indented;
                //writer.WriteStartElement("Test");

                // WriteNode leaves the reader positioned on the node that follows
                // the one it copied, so the reader is advanced manually only when
                // a whitespace node is skipped. Calling Read() after every
                // WriteNode would silently drop the following node.
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType == XmlNodeType.Whitespace)
                    {
                        reader.Read();
                    }
                    else
                    {
                        writer.WriteNode(reader, true);
                    }
                }
                //writer.WriteEndElement();

                // Capture the text before the writer is closed below.
                wellFormed = sw.ToString();
            }
            finally
            {
                writer.Close();
                reader.Close();
            }
        }

        if (xpath == null)
        {
            return wellFormed;
        }

        // Filter out nodes from HTML
        StringBuilder sb = new StringBuilder();
        using (StringReader xmlText = new StringReader(wellFormed))
        {
            XPathDocument doc = new XPathDocument(xmlText);
            XPathNavigator nav = doc.CreateNavigator();
            XPathNodeIterator nodes = nav.Select(xpath);
            while (nodes.MoveNext())
            {
                sb.Append(nodes.Current.Value);
                sb.Append((char)13);
            }
        }
        return sb.ToString();
    }
    catch (Exception exp)
    {
        // Callers receive the error text as the result; cleanup has already
        // happened in the using/finally blocks above.
        return exp.Message;
    }
}