SpringBoot 集成爬虫框架WebMagic

你猜 阅读:640 2021-03-31 16:56:00 评论:0

任务要求:爬取百度新闻相关信息页面。

核心pom.xml 文件配置如下:

    <!-- Inherit plugin and dependency management from Spring Boot 2.1.1. -->
    <parent>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-parent</artifactId>
		<version>2.1.1.RELEASE</version>
	</parent>

	<!--
		Versions for dependencies that the Spring Boot parent does not manage.
		lombok.version is intentionally absent: it is inherited from the parent.
	-->
	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
		<java.version>1.8</java.version>
		<mybatis-spring-boot-starter.version>1.3.2</mybatis-spring-boot-starter.version>
		<mysql-connector-java.version>8.0.11</mysql-connector-java.version>
		<com.alibaba.druid.version>1.1.9</com.alibaba.druid.version>
		<commons-lang.version>2.6</commons-lang.version>
		<commons-codec.version>1.10</commons-codec.version>
		<commons-lang3.version>3.8.1</commons-lang3.version>
		<commons-net.version>3.6</commons-net.version>
		<commons-io.version>2.6</commons-io.version>
		<!-- 3.2.2 is the patch release that disables the unsafe deserialization functors of 3.2.1. -->
		<commons-collections.version>3.2.2</commons-collections.version>
		<!-- 1.3.3 is the patch release that fixes the DiskFileItem deserialization issue of 1.3.1/1.3.2. -->
		<common-fileupload.version>1.3.3</common-fileupload.version>
		<!-- 1.2.83 is the 1.x release that closes the known autoType deserialization bypasses. -->
		<fastjson.version>1.2.83</fastjson.version>
		<jasperreports.version>6.10.0</jasperreports.version>
	</properties>
 
 
	<dependencies> 
		<!-- Spring Web module --> 
		<dependency> 
			<groupId>org.springframework.boot</groupId> 
			<artifactId>spring-boot-starter-web</artifactId> 
			<!-- Uncomment the exclusion below to drop Spring Boot's default Logback logging starter --> 
			<!-- 
			 <exclusions> 
                <exclusion> 
                    <groupId>org.springframework.boot</groupId> 
                    <artifactId>spring-boot-starter-logging</artifactId> 
                </exclusion> 
            </exclusions>  --> 
		</dependency> 
 
		<!-- Spring Boot test support (test scope only) --> 
		<dependency> 
			<groupId>org.springframework.boot</groupId> 
			<artifactId>spring-boot-starter-test</artifactId> 
			<scope>test</scope> 
		</dependency> 
 
 
		<!-- Lombok annotation processor; lombok.version is not declared in this file's properties --> 
		<dependency> 
			<groupId>org.projectlombok</groupId> 
			<artifactId>lombok</artifactId> 
			<version>${lombok.version}</version> 
			<scope>provided</scope> 
		</dependency> 
 
 
		<!-- MySQL access: MyBatis starter, JDBC driver and Druid connection pool --> 
		<dependency> 
			<groupId>org.mybatis.spring.boot</groupId> 
			<artifactId>mybatis-spring-boot-starter</artifactId> 
			<version>${mybatis-spring-boot-starter.version}</version> 
		</dependency> 
		<dependency> 
			<groupId>mysql</groupId> 
			<artifactId>mysql-connector-java</artifactId> 
			<version>${mysql-connector-java.version}</version> 
			<scope>runtime</scope> 
		</dependency> 
		<dependency> 
			<groupId>com.alibaba</groupId> 
			<artifactId>druid-spring-boot-starter</artifactId> 
			<version>${com.alibaba.druid.version}</version> 
		</dependency> 
		<!-- Pagination plugin --> 
		<dependency> 
			<groupId>com.github.pagehelper</groupId> 
			<artifactId>pagehelper</artifactId> 
			<version>4.1.6</version> 
		</dependency> 
 
		<!-- commons-lang: general-purpose utilities --> 
		<dependency> 
			<groupId>commons-lang</groupId> 
			<artifactId>commons-lang</artifactId> 
			<version>${commons-lang.version}</version> 
		</dependency> 
		<!-- commons-lang3: general-purpose utilities --> 
		<dependency> 
			<groupId>org.apache.commons</groupId> 
			<artifactId>commons-lang3</artifactId> 
			<version>${commons-lang3.version}</version> 
		</dependency> 
 
		<!-- commons-codec: encoding and digest utilities --> 
		<dependency> 
			<groupId>commons-codec</groupId> 
			<artifactId>commons-codec</artifactId> 
			<version>${commons-codec.version}</version> 
		</dependency> 
		<!-- commons-net: network protocol utilities --> 
		<dependency> 
			<groupId>commons-net</groupId> 
			<artifactId>commons-net</artifactId> 
			<version>${commons-net.version}</version> 
		</dependency> 
		<!-- commons-io: I/O utilities --> 
		<dependency> 
			<groupId>commons-io</groupId> 
			<artifactId>commons-io</artifactId> 
			<version>${commons-io.version}</version> 
		</dependency> 
		<!-- commons-collections: collection utilities --> 
		<dependency> 
			<groupId>commons-collections</groupId> 
			<artifactId>commons-collections</artifactId> 
			<version>${commons-collections.version}</version> 
		</dependency> 
		<!-- commons-fileupload: multipart file upload utilities --> 
		<dependency> 
			<groupId>commons-fileupload</groupId> 
			<artifactId>commons-fileupload</artifactId> 
			<version>${common-fileupload.version}</version> 
		</dependency> 
 
		<!-- Swagger 2 API documentation and its web UI --> 
		<dependency> 
			<groupId>io.springfox</groupId> 
			<artifactId>springfox-swagger2</artifactId> 
			<version>2.7.0</version> 
		</dependency> 
		<dependency> 
			<groupId>io.springfox</groupId> 
			<artifactId>springfox-swagger-ui</artifactId> 
			<version>2.7.0</version> 
		</dependency> 
 
		<!-- fastjson JSON library --> 
		<dependency> 
			<groupId>com.alibaba</groupId> 
			<artifactId>fastjson</artifactId> 
			<version>${fastjson.version}</version> 
		</dependency> 
 
        <!--
			WebMagic crawler framework.
			WebMagic ships slf4j-log4j12 as its SLF4J binding. This project logs through
			Spring Boot's Logback (see logging.config in application.properties), so the
			log4j binding is excluded to avoid two competing SLF4J bindings on the classpath.
		-->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-core</artifactId>
			<version>0.7.2</version>
			<exclusions>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-log4j12</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-extension</artifactId>
			<version>0.7.2</version>
			<exclusions>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-log4j12</artifactId>
				</exclusion>
			</exclusions>
		</dependency>

配置实例化对象:

package com.zzg.reptile.config; 
 
import java.util.HashMap; 
import java.util.Map; 
 
import org.springframework.boot.web.servlet.FilterRegistrationBean; 
import org.springframework.boot.web.servlet.ServletRegistrationBean; 
import org.springframework.context.annotation.Bean; 
import org.springframework.context.annotation.Configuration; 
 
import com.alibaba.druid.support.http.StatViewServlet; 
import com.alibaba.druid.support.http.WebStatFilter; 
/**
 * Registers the Druid monitoring console servlet and the Druid web-statistics filter.
 *
 * @author zzg
 */
@Configuration
public class DruidConfig {

	/**
	 * Exposes Druid's {@link StatViewServlet} under {@code /druid/*}.
	 *
	 * @return the servlet registration for the Druid console
	 */
	@Bean
	public ServletRegistrationBean<StatViewServlet> druidServletRegistrationBean() {
		ServletRegistrationBean<StatViewServlet> registration =
				new ServletRegistrationBean<>(new StatViewServlet(), "/druid/*");
		// Empty allow/deny lists: presumably no IP restriction is applied - confirm against the Druid docs.
		registration.addInitParameter("allow", "");
		registration.addInitParameter("deny", "");
		// NOTE(review): demo credentials are hard-coded; move them to external configuration
		// before exposing the console outside a development environment.
		registration.addInitParameter("loginUsername", "admin");
		registration.addInitParameter("loginPassword", "admin");
		return registration;
	}

	/**
	 * Registers Druid's {@link WebStatFilter} on every request path, skipping static
	 * resources and the Druid console itself.
	 *
	 * @return the filter registration for Druid web statistics
	 */
	@Bean
	public FilterRegistrationBean<WebStatFilter> duridFilterRegistrationBean() {
		FilterRegistrationBean<WebStatFilter> registration = new FilterRegistrationBean<>();
		registration.setFilter(new WebStatFilter());
		// Request patterns the filter should ignore.
		Map<String, String> initParams = new HashMap<>();
		initParams.put("exclusions", "*.js,*.gif,*.jpg,*.bmp,*.png,*.css,*.ico,/druid/*");
		registration.setInitParameters(initParams);
		registration.addUrlPatterns("/*");
		return registration;
	}
}
package com.zzg.reptile.config; 
 
import java.util.Properties; 
 
import org.springframework.context.annotation.Bean; 
import org.springframework.context.annotation.Configuration; 
 
import com.github.pagehelper.PageHelper; 
 
/**
 * MyBatis-related bean definitions.
 *
 * @author zzg
 */
@Configuration
public class MyBatisConfig {

	/**
	 * Builds the PageHelper pagination plugin configured for MySQL, with the
	 * {@code offsetAsPageNum}, {@code rowBoundsWithCount} and {@code reasonable}
	 * options all switched on.
	 *
	 * @return the configured {@link PageHelper} instance
	 */
	@Bean
	public PageHelper pageHelper() {
		Properties settings = new Properties();
		settings.setProperty("dialect", "mysql");
		settings.setProperty("offsetAsPageNum", "true");
		settings.setProperty("rowBoundsWithCount", "true");
		settings.setProperty("reasonable", "true");

		PageHelper plugin = new PageHelper();
		plugin.setProperties(settings);
		return plugin;
	}
}
package com.zzg.reptile.config; 
 
import java.util.ArrayList; 
import java.util.List; 
 
import org.springframework.context.annotation.Bean; 
import org.springframework.context.annotation.Configuration; 
 
import io.swagger.annotations.ApiOperation; 
import springfox.documentation.builders.ApiInfoBuilder; 
import springfox.documentation.builders.ParameterBuilder; 
import springfox.documentation.builders.PathSelectors; 
import springfox.documentation.builders.RequestHandlerSelectors; 
import springfox.documentation.schema.ModelRef; 
import springfox.documentation.service.ApiInfo; 
import springfox.documentation.service.Contact; 
import springfox.documentation.service.Parameter; 
import springfox.documentation.spi.DocumentationType; 
import springfox.documentation.spring.web.plugins.Docket; 
import springfox.documentation.swagger2.annotations.EnableSwagger2; 
 
/**
 * Swagger 2 (springfox) configuration: documents every handler method annotated with
 * {@link ApiOperation} and adds an optional {@code X-CSRF-TOKEN} header parameter to
 * all operations.
 */
@Configuration
@EnableSwagger2
public class SwaggerConfig {

	/**
	 * Builds the springfox {@link Docket} that describes this application's API.
	 *
	 * @return the configured docket
	 */
	@Bean
	public Docket buildDocket() {
		// Global, optional header parameter. The builder is built exactly once; the
		// original chain built it twice and discarded the first result.
		Parameter csrfToken = new ParameterBuilder()
				.name("X-CSRF-TOKEN")
				.description("令牌")
				.modelRef(new ModelRef("string"))
				.parameterType("header")
				.required(false)
				.build();
		List<Parameter> pars = new ArrayList<Parameter>();
		pars.add(csrfToken);

		return new Docket(DocumentationType.SWAGGER_2)
				.select()
				.apis(RequestHandlerSelectors.withMethodAnnotation(ApiOperation.class))
				.paths(PathSelectors.any())
				.build()
				.globalOperationParameters(pars)
				.apiInfo(buildApiInf());
	}

	/**
	 * Builds the title, description, contact and version metadata shown in the Swagger UI.
	 *
	 * @return the API metadata
	 */
	private ApiInfo buildApiInf() {
		return new ApiInfoBuilder()
				.title("****")
				.termsOfServiceUrl("http://www.baidu.cn/")
				.description("API接口")
				.contact(new Contact("baidu", "http://www.baidu.cn/", "zhouzhiwengang@163.com"))
				.version("2.0")
				.build();
	}
}

业务逻辑实体对象(model、Mapper、service、serviceImpl)省略。

application.properties 

# HTTP port the server listens on 
server.port=7090 
# Context path (service name) of the application; currently disabled 
# server.context-path=/jreport 
# MyBatis XML mapper file locations and type-alias package 
mybatis.mapper-locations=classpath*:mapper/reptile/*Mapper.xml 
mybatis.type-aliases-package=com.zzg.reptile.domain 
# MySQL 8 datasource used by MyBatis 
# NOTE(review): database host and credentials are stored in plain text here - consider externalizing them 
spring.datasource.url=jdbc:mysql://192.168.1.73:3306/boot-security?serverTimezone=UTC&useSSL=false&allowPublicKeyRetrieval=true&allowMultiQueries=true&nullCatalogMeansCurrent=true 
spring.datasource.username=root 
spring.datasource.password=digipower 
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver 
# Druid connection pool settings 
# Number of physical connections created at start-up 
spring.datasource.druid.initial-size=5 
# Maximum number of connections in the pool 
spring.datasource.druid.max-active=30 
# Minimum number of idle connections kept in the pool 
spring.datasource.druid.min-idle=5 
# Maximum time to wait when acquiring a connection, in milliseconds 
spring.datasource.druid.max-wait=60000 
# Interval between runs of the check that closes idle connections, in milliseconds 
spring.datasource.druid.time-between-eviction-runs-millis=60000 
# Minimum time a connection may stay idle before it becomes eligible for eviction 
spring.datasource.druid.min-evictable-idle-time-millis=300000 
# SQL used to test whether a connection is still valid; must be a query statement 
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL 
# Recommended true: no performance impact and keeps connections safe. On borrow, if the connection has been idle longer than timeBetweenEvictionRunsMillis, validationQuery is run to check it. 
spring.datasource.druid.test-while-idle=true 
# Run validationQuery every time a connection is borrowed; enabling this lowers performance. 
spring.datasource.druid.test-on-borrow=false 
# Run validationQuery every time a connection is returned; enabling this lowers performance. 
spring.datasource.druid.test-on-return=false 
# Whether to cache PreparedStatements (PSCache). PSCache greatly helps databases with cursor support such as Oracle; the original note recommends turning it off for MySQL. 
spring.datasource.druid.pool-prepared-statements=true 
# Must be greater than 0 to enable PSCache; when greater than 0, poolPreparedStatements is switched to true automatically. 
spring.datasource.druid.max-pool-prepared-statement-per-connection-size=50 
# Filters used for monitoring statistics; without them the monitoring console cannot collect SQL statistics 
#spring.datasource.druid.filters=stat,wall 
# connectProperties: enable the mergeSql feature and slow-SQL recording 
spring.datasource.druid.connection-properties=druid.stat.mergeSql=true;druid.stat.slowSqlMillis=500 
# Merge the monitoring data of multiple DruidDataSource instances 
spring.datasource.druid.use-global-data-source-stat=true 
# Active Druid filter list (the original comment reads "SQL injection mode"); only the stat filter is enabled 
spring.datasource.druid.filters=stat 
# Logging configuration file 
logging.config=classpath:logback.xml

爬虫核心业务组件编写:

package com.zzg.reptile.component; 
 
import java.util.List; 
import java.util.regex.Matcher; 
import java.util.regex.Pattern; 
 
import org.springframework.stereotype.Component; 
 
import com.alibaba.fastjson.JSONArray; 
import com.alibaba.fastjson.JSONObject; 
 
import us.codecraft.webmagic.Page; 
import us.codecraft.webmagic.Site; 
import us.codecraft.webmagic.processor.PageProcessor; 
import us.codecraft.webmagic.selector.Html; 
import us.codecraft.webmagic.selector.Selectable; 
 
/**
 * WebMagic page processor for the Baidu News "domestic focus" hot-news list.
 *
 * <p>Each call to {@link #process(Page)} appends one JSON object per headline (keys
 * {@code url} and {@code title}) to the array returned by {@link #getArray()}. The array
 * is never cleared by this class, so a caller that reuses one instance for several crawls
 * must install a fresh array through {@link #setArray(JSONArray)} first.
 *
 * @author zzg
 */
@Component
public class BaiduNewPageProcessor implements PageProcessor {

	/** Matches node markup that starts with an {@code <i} tag; such nodes are not headlines. */
	private static final Pattern ICON_NODE = Pattern.compile("^<i");

	/** Matches node markup containing a {@code <b>} tag, i.e. a headline wrapped in bold. */
	private static final Pattern BOLD_TITLE = Pattern.compile(".*(<b>).*");

	// Parsed headlines collected so far.
	private JSONArray array = new JSONArray();

	/**
	 * @return the array holding every headline parsed so far
	 */
	public JSONArray getArray() {
		return array;
	}

	/**
	 * Replaces the result array, e.g. with an empty one before starting a new crawl.
	 *
	 * @param array the array that subsequent {@link #process(Page)} calls append to
	 */
	public void setArray(JSONArray array) {
		this.array = array;
	}

	// Site settings
	String domain ="news.baidu.com";
	Integer sleepTime = 1000;
	Integer retryTime = 30;
	String charset ="utf-8";
	Integer timeOut = 30000;
	String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36";

	// Site descriptor built from the settings above (must be declared after them).
	private Site site = Site.me().setDomain(domain).setSleepTime(sleepTime)
            .setRetryTimes(retryTime).setCharset(charset).setTimeOut(timeOut).setUserAgent(userAgent);

	/**
	 * Extracts every hot-news headline from the page and appends it to the result array.
	 *
	 * <p>Nodes whose markup starts with {@code <i} are skipped. The original code removed
	 * them from the list inside an ascending index loop, which skipped the element that
	 * followed each removal; filtering while iterating avoids that and leaves the node
	 * list untouched.
	 *
	 * @param page the downloaded page handed over by WebMagic
	 */
	@Override
	public void process(Page page) {
		Html html = page.getHtml();
		if (html == null) {
			return;
		}

		List<Selectable> nodes = html.xpath("//div[@class='hotnews']/ul/li/strong/").nodes();
		for (Selectable node : nodes) {
			String markup = node.toString();
			if (ICON_NODE.matcher(markup).find()) {
				continue;
			}

			JSONObject headline = new JSONObject();

			String url = node.links().toString();
			System.out.println("url is:" + url);
			headline.put("url", url);

			// Bold headlines keep their text inside <a><b>; plain ones directly inside <a>.
			String title = BOLD_TITLE.matcher(markup).find()
					? node.xpath("/a/b/text()").get()
					: node.xpath("a/text()").get();
			headline.put("title", title);

			array.add(headline);
		}
	}

	/**
	 * @return the site settings WebMagic uses when downloading pages for this processor
	 */
	@Override
	public Site getSite() {
		return site;
	}

}

爬虫功能测试:

package com.zzg.reptile.controller; 
 
import org.springframework.beans.factory.annotation.Autowired; 
import org.springframework.stereotype.Controller; 
import org.springframework.web.bind.annotation.RequestMapping; 
import org.springframework.web.bind.annotation.RequestMethod; 
import org.springframework.web.bind.annotation.ResponseBody; 
 
import com.alibaba.fastjson.JSONArray; 
import com.zzg.jreport.response.JreportResponse; 
import com.zzg.reptile.component.BaiduNewPageProcessor; 
 
import io.swagger.annotations.Api; 
import io.swagger.annotations.ApiOperation; 
import us.codecraft.webmagic.Spider; 
 
/**
 * REST endpoint that triggers the Baidu News crawl and returns the parsed headlines.
 */
@Controller
@RequestMapping("/api/webmagic")
@Api(value = "爬虫Controlle", tags = "爬虫操作服务")
public class WebmagicController {
	@Autowired
	private BaiduNewPageProcessor processor;

	/**
	 * Crawls the Baidu News page synchronously and returns the headlines found by this crawl.
	 *
	 * <p>The injected processor is shared between requests and only ever appends to its
	 * result array, so a fresh array is installed before each crawl; otherwise every call
	 * would also return the headlines of all earlier calls.
	 *
	 * @return a response wrapping the JSON array of {@code url}/{@code title} objects
	 */
	@ApiOperation(httpMethod = "POST", value = "新闻信息爬取")
	@RequestMapping(value = "/news", method = { RequestMethod.POST }, produces = "application/json;charset=UTF-8")
	@ResponseBody
	public JreportResponse news() {
		String url = "https://news.baidu.com/?cmd=1&class=civilnews&tn=rss&sub=0";

		// NOTE(review): the processor is shared, so overlapping requests can still write
		// into each other's array; serialize calls if concurrent use is expected.
		JSONArray results = new JSONArray();
		processor.setArray(results);

		Spider.create(processor).addUrl(url).run();

		return JreportResponse.ok(results);
	}

}

项目结构:

标签:爬虫
声明

1.本站遵循行业规范,任何转载的稿件都会明确标注作者和来源;2.本站的原创文章,请转载时务必注明文章作者和来源,不尊重原创的行为我们将追究责任;3.作者投稿可能会经我们编辑修改或补充。

关注我们

一个IT知识分享的公众号