java实现爬虫常用的第三方包:
- httpclient,for http
- jsoup,for dom
- rhino,for js
- jackson,for json
pom.xml摘录
<dependencies>
<!-- simulate web browser -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.7</version>
</dependency>
<!-- parse DOM -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- jackson -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.9.8</version>
</dependency>
<!-- parse javascript -->
<dependency>
<groupId>org.mozilla</groupId>
<artifactId>rhino</artifactId>
<version>1.7.10</version>
</dependency>
<!-- simulate client action -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.33</version>
</dependency>
<!-- upgrade junit to junit4 -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12<!-- default is v3.8.1 --></version>
<scope>test</scope>
</dependency>
<!-- log -->
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-api -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.25</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
<!-- <scope>test</scope> -->
</dependency>
</dependencies>
启用log4j基本配置,在main方法中加入语句:
public static void main(String[] args) {
//启用log4j基本配置
//不想去写配置文件,可以用Java基本配置
BasicConfigurator.configure();
//...
}