有这样一段HTML(注意标签没有闭合,不是格式良好的XML,因此不能直接用XML解析器处理):
<div><table><td id='1234 foo 5678'>Hello</td>
希望通过这个XPath提取出Hello:
//div//td[contains(@id, 'foo')]/text()
先导入Maven依赖(HtmlCleaner):
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlcleaner/htmlcleaner -->
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.21</version>
</dependency>
完整的示例代码(含main函数):
package com.my.demo;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
/**
 * Demo: evaluate an XPath expression against an HTML snippet by first running
 * the HTML through HtmlCleaner and converting the result to a W3C DOM.
 */
public class HtmlXpathJava {

    /**
     * Runs the sample XPath against the sample HTML and prints the extracted value.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String html = "<div><table><td id='1234 foo 5678'>Hello</td>";
        final String query = "//div//td[contains(@id, 'foo')]/text()";
        String extracted = getValueByXpath(query, html);
        System.out.println(extracted);
    }

    /**
     * Evaluates an XPath expression against HTML and returns the result as a string.
     *
     * @param xPath the XPath expression to evaluate
     * @param html  the HTML source to evaluate it against
     * @return the XPath result requested as {@code XPathConstants.STRING}, or
     *         {@code null} if DOM creation or XPath evaluation throws
     */
    private static String getValueByXpath(String xPath, String html) {
        // Cleaning happens outside the try block, so a failure here propagates to the caller.
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode root = cleaner.clean(html);
        try {
            CleanerProperties props = new CleanerProperties();
            DomSerializer serializer = new DomSerializer(props);
            Document dom = serializer.createDOM(root);

            XPath evaluator = XPathFactory.newInstance().newXPath();
            Object result = evaluator.evaluate(xPath, dom, XPathConstants.STRING);
            return (String) result;
        } catch (Exception e) {
            // Report the failure and fall back to null instead of propagating.
            System.out.println("Extract value error. " + e.getMessage());
            e.printStackTrace();
            return null;
        }
    }
}
输出:
Hello
参考:
https://stackoverflow.com/questions/9022140/using-xpath-contains-against-html-in-java