SpringBoot 集成爬虫框架WebMagic
你猜
阅读:640
2021-03-31 16:56:00
评论:0
任务要求:爬取百度新闻相关信息页面。
核心pom.xml 文件配置如下:
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.1.RELEASE</version>
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
<mybatis-spring-boot-starter.version>1.3.2</mybatis-spring-boot-starter.version>
<mysql-connector-java.version>8.0.11</mysql-connector-java.version>
<com.alibaba.druid.version>1.1.9</com.alibaba.druid.version>
<commons-lang.version>2.6</commons-lang.version>
<commons-codec.version>1.10</commons-codec.version>
<commons-lang3.version>3.8.1</commons-lang3.version>
<commons-net.version>3.6</commons-net.version>
<commons-io.version>2.6</commons-io.version>
<commons-collections.version>3.2.1</commons-collections.version>
<common-fileupload.version>1.3.1</common-fileupload.version>
<fastjson.version>1.2.48</fastjson.version>
<jasperreports.version>6.10.0</jasperreports.version>
</properties>
<dependencies>
<!-- SpringWeb模块 -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<!-- 移除springboot 自带日志框架log-back -->
<!--
<exclusions>
<exclusion>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-logging</artifactId>
</exclusion>
</exclusions> -->
</dependency>
<!--springboot 集成测试框架 -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!--lombok插件 -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
<scope>provided</scope>
</dependency>
<!-- mysql 连接 -->
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>${mybatis-spring-boot-starter.version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql-connector-java.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid-spring-boot-starter</artifactId>
<version>${com.alibaba.druid.version}</version>
</dependency>
<!-- 分页控件 -->
<dependency>
<groupId>com.github.pagehelper</groupId>
<artifactId>pagehelper</artifactId>
<version>4.1.6</version>
</dependency>
<!--common-lang 常用工具包 -->
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>${commons-lang.version}</version>
</dependency>
<!--commons-lang3 工具包 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>${commons-lang3.version}</version>
</dependency>
<!--commons-codec 加密工具包 -->
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>${commons-codec.version}</version>
</dependency>
<!--commons-net 网络工具包 -->
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
<version>${commons-net.version}</version>
</dependency>
<!--common-io 工具包 -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons-io.version}</version>
</dependency>
<!--common-collection 工具包 -->
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>${commons-collections.version}</version>
</dependency>
<!--common-fileupload 工具包 -->
<dependency>
<groupId>commons-fileupload</groupId>
<artifactId>commons-fileupload</artifactId>
<version>${common-fileupload.version}</version>
</dependency>
<!-- Swagger2 -->
<dependency>
<groupId>io.springfox</groupId>
<artifactId>springfox-swagger2</artifactId>
<version>2.7.0</version>
</dependency>
<dependency>
<groupId>io.springfox</groupId>
<artifactId>springfox-swagger-ui</artifactId>
<version>2.7.0</version>
</dependency>
<!-- fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>${fastjson.version}</version>
</dependency>
<!-- 爬虫框架集成 -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.2</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.2</version>
</dependency>
配置实例化对象:
package com.zzg.reptile.config;
import java.util.HashMap;
import java.util.Map;
import org.springframework.boot.web.servlet.FilterRegistrationBean;
import org.springframework.boot.web.servlet.ServletRegistrationBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import com.alibaba.druid.support.http.StatViewServlet;
import com.alibaba.druid.support.http.WebStatFilter;
/**
* druid 监控配置
* @author zzg
*
*/
@Configuration
public class DruidConfig {
@Bean
public ServletRegistrationBean druidServletRegistrationBean() {
ServletRegistrationBean servletRegistrationBean = new ServletRegistrationBean();
servletRegistrationBean.setServlet(new StatViewServlet());
servletRegistrationBean.addUrlMappings("/druid/*");
servletRegistrationBean.addInitParameter("allow", "");
servletRegistrationBean.addInitParameter("deny", "");
servletRegistrationBean.addInitParameter("loginUsername", "admin");
servletRegistrationBean.addInitParameter("loginPassword", "admin");
return servletRegistrationBean;
}
/**
* 注册DruidFilter拦截
*
* @return
*/
@Bean
public FilterRegistrationBean duridFilterRegistrationBean() {
FilterRegistrationBean filterRegistrationBean = new FilterRegistrationBean();
filterRegistrationBean.setFilter(new WebStatFilter());
Map<String, String> initParams = new HashMap<String, String>();
//设置忽略请求
initParams.put("exclusions", "*.js,*.gif,*.jpg,*.bmp,*.png,*.css,*.ico,/druid/*");
filterRegistrationBean.setInitParameters(initParams);
filterRegistrationBean.addUrlPatterns("/*");
return filterRegistrationBean;
}
}
package com.zzg.reptile.config;
import java.util.Properties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import com.github.pagehelper.PageHelper;
/**
* mybatis 配置对象
* @author zzg
*
*/
@Configuration
public class MyBatisConfig {
/**
* 分页对象实列化
* @return
*/
@Bean
public PageHelper pageHelper() {
PageHelper pageHelper = new PageHelper();
Properties p = new Properties();
p.setProperty("offsetAsPageNum", "true");
p.setProperty("rowBoundsWithCount", "true");
p.setProperty("reasonable", "true");
p.setProperty("dialect", "mysql");
pageHelper.setProperties(p);
return pageHelper;
}
}
package com.zzg.reptile.config;
import java.util.ArrayList;
import java.util.List;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import io.swagger.annotations.ApiOperation;
import springfox.documentation.builders.ApiInfoBuilder;
import springfox.documentation.builders.ParameterBuilder;
import springfox.documentation.builders.PathSelectors;
import springfox.documentation.builders.RequestHandlerSelectors;
import springfox.documentation.schema.ModelRef;
import springfox.documentation.service.ApiInfo;
import springfox.documentation.service.Contact;
import springfox.documentation.service.Parameter;
import springfox.documentation.spi.DocumentationType;
import springfox.documentation.spring.web.plugins.Docket;
import springfox.documentation.swagger2.annotations.EnableSwagger2;
@Configuration
@EnableSwagger2
public class SwaggerConfig {
@Bean
public Docket buildDocket() {
ParameterBuilder tokenPar = new ParameterBuilder();
List<Parameter> pars = new ArrayList<Parameter>();
tokenPar.name("X-CSRF-TOKEN").description("令牌").modelRef(new ModelRef("string")).parameterType("header")
.required(false).build();
pars.add(tokenPar.build());
return new Docket(DocumentationType.SWAGGER_2).select()
.apis(RequestHandlerSelectors.withMethodAnnotation(ApiOperation.class)).paths(PathSelectors.any())
.build().globalOperationParameters(pars).apiInfo(buildApiInf());
}
private ApiInfo buildApiInf() {
return new ApiInfoBuilder().title("****").termsOfServiceUrl("http://www.baidu.cn/")
.description("API接口")
.contact(new Contact("baidu", "http://www.baidu.cn/", "zhouzhiwengang@163.com"))
.version("2.0").build();
}
}
业务逻辑实体对象(model、Mapper、service、serviceImpl)省略。
application.properties
# æå®æå¡ç«¯å£
server.port=7090
# æå®æå¡ å称
# server.context-path=/jreport
#mybatis xml æ件éç½®
mybatis.mapper-locations=classpath*:mapper/reptile/*Mapper.xml
mybatis.type-aliases-package=com.zzg.reptile.domain
# MyBatis mysql8 éç½®
spring.datasource.url=jdbc:mysql://192.168.1.73:3306/boot-security?serverTimezone=UTC&useSSL=false&allowPublicKeyRetrieval=true&allowMultiQueries=true&nullCatalogMeansCurrent=true
spring.datasource.username=root
spring.datasource.password=digipower
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# Druid éç½®
# åå§åæ¶å»ºç«ç©çè¿æ¥ç个æ°
spring.datasource.druid.initial-size=5
# æ大è¿æ¥æ± æ°é
spring.datasource.druid.max-active=30
# æå°è¿æ¥æ± æ°é
spring.datasource.druid.min-idle=5
# è·åè¿æ¥æ¶æ大çå¾æ¶é´ï¼åä½æ¯«ç§
spring.datasource.druid.max-wait=60000
# éç½®é´éå¤ä¹æè¿è¡ä¸æ¬¡æ£æµï¼æ£æµéè¦å³éç空é²è¿æ¥ï¼åä½æ¯æ¯«ç§
spring.datasource.druid.time-between-eviction-runs-millis=60000
# è¿æ¥ä¿æ空é²èä¸è¢«é©±éçæå°æ¶é´
spring.datasource.druid.min-evictable-idle-time-millis=300000
# ç¨æ¥æ£æµè¿æ¥æ¯å¦ææçsqlï¼è¦æ±æ¯ä¸ä¸ªæ¥è¯¢è¯å¥
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
# 建议é置为trueï¼ä¸å½±åæ§è½ï¼å¹¶ä¸ä¿è¯å®å¨æ§ãç³è¯·è¿æ¥çæ¶åæ£æµï¼å¦æ空é²æ¶é´å¤§äºtimeBetweenEvictionRunsMillisï¼æ§è¡validationQueryæ£æµè¿æ¥æ¯å¦ææã
spring.datasource.druid.test-while-idle=true
# ç³è¯·è¿æ¥æ¶æ§è¡validationQueryæ£æµè¿æ¥æ¯å¦ææï¼åäºè¿ä¸ªéç½®ä¼éä½æ§è½ã
spring.datasource.druid.test-on-borrow=false
# å½è¿è¿æ¥æ¶æ§è¡validationQueryæ£æµè¿æ¥æ¯å¦ææï¼åäºè¿ä¸ªéç½®ä¼éä½æ§è½ã
spring.datasource.druid.test-on-return=false
# æ¯å¦ç¼åpreparedStatementï¼ä¹å°±æ¯PSCacheãPSCache对æ¯æ游æ çæ°æ®åºæ§è½æå巨大ï¼æ¯å¦è¯´oracleãå¨mysqlä¸å»ºè®®å³éã
spring.datasource.druid.pool-prepared-statements=true
# è¦å¯ç¨PSCacheï¼å¿é¡»é置大äº0ï¼å½å¤§äº0æ¶ï¼poolPreparedStatementsèªå¨è§¦åä¿®æ¹ä¸ºtrueã
spring.datasource.druid.max-pool-prepared-statement-per-connection-size=50
# éç½®çæ§ç»è®¡æ¦æªçfiltersï¼å»æåçæ§çé¢sqlæ æ³ç»è®¡
#spring.datasource.druid.filters=stat,wall
# éè¿connectPropertieså±æ§æ¥æå¼mergeSqlåè½ï¼æ¢SQLè®°å½
spring.datasource.druid.connection-properties=druid.stat.mergeSql=true;druid.stat.slowSqlMillis=500
# å并å¤ä¸ªDruidDataSourceççæ§æ°æ®
spring.datasource.druid.use-global-data-source-stat=true
# éç½®sql 注å¥æ¹å¼
spring.datasource.druid.filters=stat
# æ¥å¿æ件éç½®
logging.config=classpath:logback.xml
爬虫核心业务组件编写:
package com.zzg.reptile.component;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.springframework.stereotype.Component;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
/**
* 百度新闻#国内焦点
* @author zzg
*
*/
@Component
public class BaiduNewPageProcessor implements PageProcessor {
//解析数据
private JSONArray array = new JSONArray();
public JSONArray getArray() {
return array;
}
public void setArray(JSONArray array) {
this.array = array;
}
// 站点信息
String domain ="news.baidu.com";
Integer sleepTime = 1000;
Integer retryTime = 30;
String charset ="utf-8";
Integer timeOut = 30000;
String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36";
//
// 初始化站点
private Site site = Site.me().setDomain(domain).setSleepTime(sleepTime)
.setRetryTimes(retryTime).setCharset(charset).setTimeOut(timeOut).setUserAgent(userAgent);
@Override
public void process(Page page) {
// TODO Auto-generated method stub
Html html = page.getHtml();
if(html != null){
List<Selectable> selectables = html.xpath("//div[@class='hotnews']/ul/li/strong/").nodes();
for(int i = 0; i < selectables.size(); i ++){
Selectable selectable = selectables.get(i);
String content = selectable.toString();
Pattern pattern = Pattern.compile("^<i");
Matcher matcher = pattern.matcher(content);
if(matcher.find()){
selectables.remove(i);
}
}
for(int i =0; i < selectables.size(); i ++){
// 解析数据接收
JSONObject jsonObject = new JSONObject();
Selectable selectable = selectables.get(i);
String url = selectable.links().toString();
System.out.println("url is:" + url);
// 设置访问url
jsonObject.put("url", url);
String context = selectable.toString();
Pattern pattern = Pattern.compile(".*(<b>).*");
Matcher matcher = pattern.matcher(context);
if(matcher.find()){
String title = selectable.xpath("/a/b/text()").get();
// 设置访问标题
jsonObject.put("title", title);
} else {
String title = selectable.xpath("a/text()").get();
// 设置访问标题
jsonObject.put("title", title);
}
array.add(jsonObject);
}
}
}
@Override
public Site getSite() {
// TODO Auto-generated method stub
return site;
}
}
爬虫功能测试:
package com.zzg.reptile.controller;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;
import com.alibaba.fastjson.JSONArray;
import com.zzg.jreport.response.JreportResponse;
import com.zzg.reptile.component.BaiduNewPageProcessor;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import us.codecraft.webmagic.Spider;
@Controller
@RequestMapping("/api/webmagic")
@Api(value = "爬虫Controlle", tags = "爬虫操作服务")
public class WebmagicController {
@Autowired
private BaiduNewPageProcessor processor;
@ApiOperation(httpMethod = "POST", value = "新闻信息爬取")
@RequestMapping(value = "/news", method = { RequestMethod.POST }, produces = "application/json;charset=UTF-8")
@ResponseBody
public JreportResponse news() {
String url = "https://news.baidu.com/?cmd=1&class=civilnews&tn=rss&sub=0";
Spider.create(processor).addUrl(url).run();
JSONArray array = processor.getArray();
return JreportResponse.ok(array);
}
}
项目结构:
声明
1.本站遵循行业规范,任何转载的稿件都会明确标注作者和来源;2.本站的原创文章,请转载时务必注明文章作者和来源,不尊重原创的行为我们将追究责任;3.作者投稿可能会经我们编辑修改或补充。