2020-12-23

spring boot+vue实现爬取各大平台每日热榜数据功能

爬取各大热门APP

案例功能效果图
爬取数据的平台页面

这个案例能爬取的平台太多了,我没有全部截图出来,想看的你们自己下载源码自己跑起来!
爬取的热榜数据效果图

环境介绍
前端:vue+h5
后端:springboot+webMagic
jdk:1.8及以上
数据库:mysql

完整源码获取方式
源码获取方式:点击这里,暗号博客园!

核心代码介绍
pom.xml

<!-- WebMagic crawler core: https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
<dependency>
  <groupId>us.codecraft</groupId>
  <artifactId>webmagic-core</artifactId>
  <version>0.7.3</version>
  <exclusions>
    <!-- Drops the slf4j-log4j12 binding that webmagic-core brings in
         (presumably to avoid clashing with the application's own logging binding; confirm). -->
    <exclusion>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
    </exclusion>
  </exclusions>
</dependency>
<!-- WebMagic extension module: https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
<dependency>
  <groupId>us.codecraft</groupId>
  <artifactId>webmagic-extension</artifactId>
  <version>0.7.3</version>
</dependency>
<!-- Guava: https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
  <groupId>com.google.guava</groupId>
  <artifactId>guava</artifactId>
  <version>18.0</version>
</dependency>
<!-- Apache Commons Lang: https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
  <groupId>org.apache.commons</groupId>
  <artifactId>commons-lang3</artifactId>
  <version>3.4</version>
</dependency>
<!-- Apache Commons IO: https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
  <groupId>commons-io</groupId>
  <artifactId>commons-io</artifactId>
  <version>2.4</version>
</dependency>
<!-- Lombok, boilerplate-reduction tool (compile-time only, hence "provided"):
     https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
  <groupId>org.projectlombok</groupId>
  <artifactId>lombok</artifactId>
  <version>1.18.8</version>
  <scope>provided</scope>
</dependency>
<!-- JUnit 4, test scope only: https://mvnrepository.com/artifact/junit/junit -->
<dependency>
  <groupId>junit</groupId>
  <artifactId>junit</artifactId>
  <version>4.12</version>
  <scope>test</scope>
</dependency>
<!-- Swagger 2 (Springfox): API metadata generation and the browser UI; keep both versions in sync. -->
<dependency>
  <groupId>io.springfox</groupId>
  <artifactId>springfox-swagger2</artifactId>
  <version>2.9.1</version>
</dependency>
<dependency>
  <groupId>io.springfox</groupId>
  <artifactId>springfox-swagger-ui</artifactId>
  <version>2.9.1</version>
</dependency>

application.yml

# HTTP port the Spring Boot application listens on.
server:
  port: 9004

spring:
  jackson:
    serialization:
      # Dates are serialized as numeric timestamps; the Vue pages divide createDate by 1000.
      write-dates-as-timestamps: true
  datasource:
    driverClassName: com.mysql.cj.jdbc.Driver
    # Host, schema, username and password below are placeholders; replace with real values.
    url: jdbc:mysql://feimeidehuoji:3306/feimeidehuoji?autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&useSSL=false&useLegacyDatetimeCode=false&serverTimezone=UTC
    username: feimeidehuoji
    password: feimeidehuoji
  jpa:
    database: MySQL
    show-sql: true
    hibernate:
      # Hibernate creates/alters tables to match the entities on startup.
      ddl-auto: update
    database-platform: org.hibernate.dialect.MySQL5InnoDBDialect

# Custom properties injected into NodeController via @Value.
# Start URL handed to the spider.
spiderUrl: https://tophub.today
# Host and port of the HTTP proxy the spider downloads through.
proxyUrl: 61.160.210.234
proxyPort: 808

NodeController.java

package cn.cesi.webMagic.webMagic;import cn.cesi.webMagic.pieline.SpringPieline;import cn.cesi.webMagic.pojo.Node;import cn.cesi.webMagic.service.NodeService;import cn.cesi.webMagic.util.Result;import cn.cesi.webMagic.util.StatusCode;import io.swagger.annotations.Api;import io.swagger.annotations.ApiOperation;import io.swagger.annotations.ApiParam;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.beans.factory.annotation.Value;import org.springframework.data.domain.Page;import org.springframework.scheduling.annotation.Scheduled;import org.springframework.web.bind.annotation.CrossOrigin;import org.springframework.web.bind.annotation.RequestMapping;import org.springframework.web.bind.annotation.RequestParam;import org.springframework.web.bind.annotation.RestController;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.downloader.HttpClientDownloader;import us.codecraft.webmagic.proxy.Proxy;import us.codecraft.webmagic.proxy.SimpleProxyProvider;import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;import us.codecraft.webmagic.scheduler.QueueScheduler;import javax.annotation.Resource;import java.util.List;import java.util.Map;@RestController@CrossOrigin@RequestMapping("/node")@Api(value = "获取数据接口",tags={"用户登录接口"})public class NodeController { @Value("${spiderUrl}") private String url; @Value("${proxyUrl}") private String proxyUrl; @Value("${proxyPort}") private Integer proxyPort; @Resource NodeService nodeService; @Autowired SpringPieline springPieline; @RequestMapping("") @ApiOperation(value = "查询数据接口") public Result getData(   @ApiParam(value = "分类名称", required = false) String typeName   ,@ApiParam(value = "分类名称", required = false) String secondTitle   ,@ApiParam(value = "当前页", required = false)Integer page   ,@ApiParam(value = "每页数据条数", required = false)Integer size){  Page<Node> nodes = nodeService.searchData(typeName, secondTitle,page, size);  Result result = new Result();  result.setFlag(true);  
result.setCode(StatusCode.OK);  result.setMsg("查询成功!");  result.setData(nodes);  return result; } @RequestMapping("/getType") @ApiOperation(value = "查询全部分类列表") public Result getData(){  List<Map<String,String>> list = nodeService.findType();  Result result = new Result();  result.setFlag(true);  result.setCode(StatusCode.OK);  result.setMsg("查询成功!");  result.setData(list);  return result; } @Scheduled(fixedDelay = 480000) //1000*60*8 任务执行完成后10分钟继续执行 public void tasks(){  System.out.println("定时任务开始——————————————————————————————————");  //设置代理服务器  HttpClientDownloader httpClientDownloader = new HttpClientDownloader();  httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(proxyUrl,proxyPort)));  Spider.create(new WebProcess())    .addUrl(url)    .setDownloader(httpClientDownloader)    .thread(2) //线程(程序爬取速度)    .addPipeline(springPieline) //指定pieline接口    .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000*10)))    .run();  System.out.println("定时任务结束——————————————————————————————————"); }}

WebProcess.java

package cn.cesi.webMagic.webMagic;import cn.cesi.webMagic.pieline.SpringPieline;import cn.cesi.webMagic.util.NodeEntity;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.beans.factory.annotation.Value;import org.springframework.scheduling.annotation.Scheduled;import org.springframework.stereotype.Component;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.downloader.HttpClientDownloader;import us.codecraft.webmagic.processor.PageProcessor;import us.codecraft.webmagic.proxy.Proxy;import us.codecraft.webmagic.proxy.SimpleProxyProvider;import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;import us.codecraft.webmagic.scheduler.QueueScheduler;import us.codecraft.webmagic.selector.Selectable;import org.jsoup.select.Elements;import java.util.*;@Componentpublic class WebProcess implements PageProcessor { @Override public void process(Page page) {  System.out.println(page.getHtml());  //page页面对象,getHtml()获取页面的html ,css()选择器 div#Sortable 获取id为Sortable的div元素 nodes()转为集合  List<Selectable> list = page.getHtml().css("div.bc div#Sortable div.cc-cd div").nodes();  List<NodeEntity> nodes = new ArrayList<>();  for(Selectable selectable : list){   //regex 正则表达式//   String name = Jsoup.parse(selectable.css("div.cc-cd-ih div a div span").regex(".*微博.*").all().toString()).text(); //标题   //Jsoup.parse解析html为dom元素(对象)语法同js语法 text()为js语法不多解释   //获取title大标题   String s = selectable.css("div.cc-cd-ih div a div span").toString();   String title = "";   if(s != null){    title = Jsoup.parse(s).text();   }   //获取logo   String logo = selectable.css("div.cc-cd-ih div a div img").toString();   String logoSrc = "";   if(logo != null){    Document document = Jsoup.parse(logo);    Elements imgTags = document.select("img[src]");    logoSrc = imgTags.attr("src");   }   //获取第二层小标题的集合   List<Selectable> list2 = 
selectable.css("div.cc-cd-cb div a").nodes();   List<Map<String,String>> maps = new ArrayList<>();   for(Selectable selectable2 :list2){    Map<String,String> map = new HashMap<>();    //获取二级标题的链接    String url = selectable2.links().toString();    //获取二级标题    String secondTitle = Jsoup.parse(selectable2.css("div span.t").toString()).text();    //获取文章热度    String hot = "";    if(selectable2.css("div span.e") != null){     hot = Jsoup.parse(selectable2.css("div span.e").toString()).text();    }    map.put("url",url);    map.put("secondTitle",secondTitle);    map.put("hot",hot);    maps.add(map);    //将连接添加入任务中    //page.addTargetRequest(url);   }   NodeEntity node = new NodeEntity();   node.setTitle(title);   node.setLogo(logoSrc);   node.setMaps(maps);   nodes.add(node);  }  //给page对象绑定对象  page.putField("nodes",nodes); } private Site site = Site.me()   .setSleepTime(2)//抓取间隔时间,可以解决一些反爬限制   .setRetryTimes(3) //重试次数   .setRetrySleepTime(10000) //重试时间   .setTimeOut(60000) //超时时间 1000*60 1分钟   .setCharset("utf8"); @Override public Site getSite() {  return site; }}

SpringPieline.java

package cn.cesi.webMagic.pieline;import cn.cesi.webMagic.pojo.Node;import cn.cesi.webMagic.service.NodeService;import cn.cesi.webMagic.util.IdWorker;import cn.cesi.webMagic.util.NodeEntity;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.stereotype.Component;import us.codecraft.webmagic.ResultItems;import us.codecraft.webmagic.Task;import us.codecraft.webmagic.pipeline.Pipeline;import java.util.*;//存入数据库@Componentpublic class SpringPieline implements Pipeline { @Autowired NodeService nodeService; @Autowired IdWorker idWorker; @Override public void process(ResultItems resultItems, Task task) {  List<NodeEntity> nodes = resultItems.get("nodes");  try{   for(NodeEntity entity : nodes){    Node node = new Node();    String title = entity.getTitle();    node.setTitle(title);    String logo = entity.getLogo();    node.setLogo(logo);    List<Map<String,String>> list = entity.getMaps();    for(Map<String,String> map : list){     node.setId(idWorker.nextId()+"");     String secondTitle = map.get("secondTitle").trim();     node.setSecondTitle(secondTitle);     node.setUrl(map.get("url"));     node.setCreateDate(new Date());     node.setHot(map.get("hot"));     System.out.println(secondTitle);     if(!secondTitle.equals("") && !title.equals("")){      List<Node> byTitleAndSecondTitle = nodeService.findByTitleAndSecondTitle(title, secondTitle);      if(byTitleAndSecondTitle.size() <= 0){       nodeService.save(node);      }     }    }   }  }catch (Exception e){   System.out.println(e);  } }}

index.vue

<template>
  <div>
    <h1>摸鱼热榜</h1>
    <van-search
      v-model="value"
      placeholder="请输入搜索关键词"
      @search="onSearch"
      @clear="onClear"
    />
    <!-- Category list: shown until a search has returned -->
    <div v-if="!searched">
      <div>
        仿今日热榜!,关注java项目开发,学习更多案例!
      </div>
      <div>
        <div>
          <div>全部热榜</div>
          <div>
            <div v-for="(item, index) in typeList" :key="index">
              <div @click="goDateils(item)">
                <div>
                  <img
                    :src="item.logo"
                    :alt="item.title"
                    @error="imgError(item)"
                  />
                </div>
                <div>{{ item.title }}</div>
                <div>
                  <svg-icon icon className="icon_search"></svg-icon>
                </div>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
    <!-- Search results: the list, or the empty state when nothing matched -->
    <div v-else>
      <search-list v-if="listData.length" :list="listData" />
      <van-empty v-else description="暂无相关内容!" />
    </div>
  </div>
</template>
<script>
import SvgIcon from '@/components/icon/SvgIcon';
import searchList from '@/components/searchList/list';

export default {
  components: { SvgIcon, searchList },
  data() {
    return {
      value: '', // search box value
      listData: [], // search results
      searched: false, // true once a search has returned, so an empty result can be shown
      typeList: [], // all hot-list categories
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category image
    };
  },
  computed: {},
  created() {
    this.getAllType();
  },
  mounted() {},
  methods: {
    // Load all hot-list categories.
    getAllType() {
      this.$api.getAllType().then(res => {
        if (res.code === 0) {
          this.typeList = res.data;
        }
      });
    },
    // Open the details page of a category; the category travels as a JSON query parameter.
    goDateils(item) {
      this.$router.push({
        name: 'details',
        query: {
          item: JSON.stringify(item)
        }
      });
    },
    // Search all categories for entries whose second-level title matches the keyword.
    onSearch(e) {
      const params = {
        typeName: '全部',
        size: 10000,
        secondTitle: e
      };
      this.$api.getAllInfoGzip(params).then(res => {
        if (res.code == 0) {
          const rows = res.data.content;
          // Decorate the rows before publishing them, so the derived fields
          // exist when the list is first rendered.
          this.handleData(rows);
          this.listData = rows;
          this.searched = true;
          console.log(res);
        }
      });
    },
    // Search box cleared: drop the results and show the category list again.
    onClear() {
      this.listData = [];
      this.searched = false;
    },
    // Add the derived display fields (`new` flag and relative time) to each row.
    handleData(data) {
      data.forEach(item => {
        // createDate is divided by 1000 before being handed to getDateDiff.
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image failed to load: fall back to the default image.
    imgError(item) {
      item.logo = this.defaultUrl;
    }
  }
};
</script>

details.vue

<template>
  <div>
    <div>
      <img :src="details.logo" @error="imgError" alt="" />
      <div>
        <div>
          <p @click="$router.push('/')">摸鱼热榜</p>
        </div>
        <img :src="details.logo" @error="imgError" alt="" />
        <h1>{{ details.title }}</h1>
      </div>
    </div>
    <div>
      <van-pull-refresh v-model="refreshing" @refresh="onRefresh">
        <van-list
          v-model="loading"
          :finished="finished"
          @load="onLoad"
          :immediate-check="false"
        >
          <div>
            <a
              v-for="(item, index) in listData"
              :key="item.id"
              :href="item.url"
            >
              <div>
                <h4>
                  {{ index + 1 }}、{{ item.secondTitle }}
                </h4>
                <div>
                  <span v-if="item.hot">
                    <span>{{ item.hot }}</span>
                  </span>
                  <span>
                    <span>{{ item.CreateTime }}</span>
                  </span>
                  <span v-if="item.new">新</span>
                </div>
              </div>
            </a>
          </div>
        </van-list>
      </van-pull-refresh>
    </div>
    <div v-if="finished">
      <p>我是有底线的</p>
    </div>
  </div>
</template>
<script>
export default {
  data() {
    return {
      page: 1, // current page number
      refreshing: false, // pull-to-refresh in progress
      loading: false, // load-more in progress
      finished: false, // no more pages to load
      listData: [], // rows shown in the list
      details: {}, // the category being displayed
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category image
    };
  },
  computed: {},
  created() {},
  mounted() {
    // The category is passed from the index page as a JSON string in the query.
    this.details = JSON.parse(this.$route.query.item);
    this.getList(this.details, this.page);
  },
  methods: {
    // Fetch one page of the category and append it to the rows already shown.
    getList(item, page, loading = true) {
      const shown = this.listData;
      const params = {
        typeName: item.title,
        size: 50,
        page
      };
      this.$api.getAllInfoGzip(params, loading).then(res => {
        console.log(res);
        if (res.code == 0) {
          // Decorate before publishing so the derived fields exist at first render.
          const merged = shown.concat(res.data.content);
          this.handleData(merged);
          this.listData = merged;
          // End of load-more state.
          if (this.loading) {
            this.loading = false;
          }
          // End of pull-to-refresh state.
          if (this.refreshing) {
            this.refreshing = false;
          }
          // Last page reached.
          if (this.page >= res.data.totalPages) {
            this.finished = true;
          }
        }
      });
    },
    // Load-more: request the next page.
    onLoad() {
      this.loading = true;
      this.getList(this.details, ++this.page, false);
    },
    // Pull-to-refresh: reset the list and start again from page 1.
    onRefresh() {
      this.finished = false;
      this.loading = true;
      this.listData = [];
      this.page = 1;
      this.getList(this.details, 1, false);
    },
    // Add the derived display fields (`new` flag and relative time) to each row.
    handleData(data) {
      data.forEach(item => {
        // createDate is divided by 1000 before being handed to getDateDiff.
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image failed to load: fall back to the default image.
    imgError() {
      // The template binds details.logo, so that is the field that must be replaced.
      this.details.logo = this.defaultUrl;
    }
  }
};
</script>

xxx.sql

SET NAMES utf8mb4;
-- Foreign-key checks are switched off while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for node
-- One row per crawled hot-list entry.
-- ----------------------------
DROP TABLE IF EXISTS `node`;
CREATE TABLE `node` (
  `id` varchar(255) NOT NULL,              -- id generated by the application
  `create_date` datetime DEFAULT NULL,     -- time the row was stored
  `hot` varchar(1024) DEFAULT NULL,        -- hotness text of the entry
  `second_title` longtext,                 -- title of the entry
  `title` varchar(1024) DEFAULT NULL,      -- title of the board the entry belongs to
  `url` longtext,                          -- link of the entry
  `logo` varchar(1024) DEFAULT NULL,       -- logo URL of the board
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

-- Restore foreign-key checks for the rest of the session.
SET FOREIGN_KEY_CHECKS = 1;

作者:Java开发项目
链接:https://mp.weixin.qq.com/s/z9J1gL7orSL90ngSQeRRhg
欢迎大家关注:有故事的程序员,每天更新Java技术知识点,还可以领取Java进阶学习资料哦~
资料包含的模块分为19个模块,分别是: Java 基础、容器、多线程、反射、对象拷贝、Java Web 、异常、网络、设计模式、Spring/Spring MVC、Spring Boot/Spring Cloud、Hibernate、MyBatis、RabbitMQ、Kafka、Zookeeper、MySQL、Redis、JVM 。









原文转载:http://www.shaoqun.com/a/502819.html

跨境电商:https://www.ikjzd.com/

出口易:https://www.ikjzd.com/w/1317

友家快递:https://www.ikjzd.com/w/1341


爬取各大热门APP案例功能效果图爬去数据的平台页面这个案例能爬取的平台太多了,我没有全部截图出来,想看的你们自己下载源码自己跑起来!爬取的热榜数据效果图环境介绍前端:vue+h5后端:springboot+webMagicjdk:1.8及以上数据库:mysql完整源码获取方式源码获取方式:点击这里,暗号博客园!核心代码介绍pom.<!--https://mvnrepository.com/a
汇通天下:汇通天下
weebly:weebly
适合去稻城亚丁的季节 稻城亚丁最佳旅游季节:适合去稻城亚丁的季节 稻城亚丁最佳旅游季节
青城山特产:长生宴 - :青城山特产:长生宴 -
2020五一期间去黄腾峡漂流注意哪些事项呢?:2020五一期间去黄腾峡漂流注意哪些事项呢?

No comments:

Post a Comment