案例功能效果图
爬取数据的平台页面
这个案例能爬取的平台太多了,我没有全部截图出来,想看的你们自己下载源码自己跑起来!
爬取的热榜数据效果图
环境介绍
前端:vue+h5
后端:springboot+webMagic
jdk:1.8及以上
数据库:mysql
完整源码获取方式
源码获取方式:点击这里,暗号博客园!
核心代码介绍
pom.xml
<!-- WebMagic crawler core; its transitive slf4j-log4j12 binding is excluded.
     https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>

<!-- WebMagic extension module.
     https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

<!-- Guava.
     https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>18.0</version>
</dependency>

<!-- Apache Commons Lang.
     https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.4</version>
</dependency>

<!-- Apache Commons IO.
     https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>

<!-- Lombok boilerplate generator; compile-time only (provided scope).
     https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.8</version>
    <scope>provided</scope>
</dependency>

<!-- JUnit 4; test scope only.
     https://mvnrepository.com/artifact/junit/junit -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>

<!-- Swagger 2 API description and its browser UI -->
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger2</artifactId>
    <version>2.9.1</version>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger-ui</artifactId>
    <version>2.9.1</version>
</dependency>
application.yml
# Embedded web server
server:
  port: 9004

spring:
  jackson:
    serialization:
      # Dates are serialized as epoch timestamps; the Vue pages divide createDate by 1000.
      write-dates-as-timestamps: true
  datasource:
    driverClassName: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://feimeidehuoji:3306/feimeidehuoji?autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&useSSL=false&useLegacyDatetimeCode=false&serverTimezone=UTC
    # NOTE(review): host, schema and credentials look like placeholders; replace them
    # for your environment and keep real passwords out of version control.
    username: feimeidehuoji
    password: feimeidehuoji
  jpa:
    database: MySQL
    show-sql: true
    hibernate:
      ddl-auto: update
    database-platform: org.hibernate.dialect.MySQL5InnoDBDialect

# Custom keys injected into NodeController through @Value
spiderUrl: https://tophub.today   # page the scheduled crawl starts from
proxyUrl: 61.160.210.234          # HTTP proxy host used by the downloader
proxyPort: 808                    # HTTP proxy port
NodeController.java
package cn.cesi.webMagic.webMagic;

import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.Result;
import cn.cesi.webMagic.util.StatusCode;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import io.swagger.annotations.ApiParam;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.domain.Page;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

import javax.annotation.Resource;
import java.util.List;
import java.util.Map;

/**
 * REST controller mapped to {@code /node}.
 *
 * <p>Exposes two read endpoints backed by {@link NodeService} and owns the scheduled job
 * that crawls {@code spiderUrl} through an HTTP proxy and hands the extracted items to
 * {@link SpringPieline}.
 */
@RestController
@CrossOrigin
@RequestMapping("/node")
@Api(value = "获取数据接口",tags={"用户登录接口"})
public class NodeController {

    /** Start URL of the crawl, from the {@code spiderUrl} property. */
    @Value("${spiderUrl}")
    private String url;

    /** Proxy host, from the {@code proxyUrl} property. */
    @Value("${proxyUrl}")
    private String proxyUrl;

    /** Proxy port, from the {@code proxyPort} property. */
    @Value("${proxyPort}")
    private Integer proxyPort;

    @Resource
    NodeService nodeService;

    @Autowired
    SpringPieline springPieline;

    /**
     * Queries stored entries through {@code nodeService.searchData}.
     *
     * @param typeName    first filter value, passed through unchanged
     * @param secondTitle second filter value, passed through unchanged
     * @param page        requested page, passed through unchanged
     * @param size        page size, passed through unchanged
     * @return a success {@link Result} whose data is the page returned by the service
     */
    @RequestMapping("")
    @ApiOperation(value = "查询数据接口")
    public Result getData(
            @ApiParam(value = "分类名称", required = false) String typeName,
            @ApiParam(value = "分类名称", required = false) String secondTitle,
            @ApiParam(value = "当前页", required = false) Integer page,
            @ApiParam(value = "每页数据条数", required = false) Integer size) {
        Page<Node> matches = nodeService.searchData(typeName, secondTitle, page, size);
        return success(matches);
    }

    /**
     * Lists all categories as returned by {@code nodeService.findType}.
     *
     * @return a success {@link Result} whose data is the category list
     */
    @RequestMapping("/getType")
    @ApiOperation(value = "查询全部分类列表")
    public Result getData() {
        List<Map<String, String>> categories = nodeService.findType();
        return success(categories);
    }

    /**
     * Runs one crawl. {@code fixedDelay = 480000} ms (1000 * 60 * 8) schedules the next
     * run 8 minutes after the previous one has finished.
     */
    @Scheduled(fixedDelay = 480000)
    public void tasks() {
        System.out.println("定时任务开始——————————————————————————————————");

        // Route every download through the configured proxy server.
        Proxy proxy = new Proxy(proxyUrl, proxyPort);
        HttpClientDownloader proxiedDownloader = new HttpClientDownloader();
        proxiedDownloader.setProxyProvider(SimpleProxyProvider.from(proxy));

        Spider.create(new WebProcess())
                .addUrl(url)
                .setDownloader(proxiedDownloader)
                .thread(2) // two crawler threads
                .addPipeline(springPieline) // extracted items are handed to this pipeline
                .setScheduler(new QueueScheduler()
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(10000 * 10)))
                .run();

        System.out.println("定时任务结束——————————————————————————————————");
    }

    /** Builds the success envelope shared by both query endpoints. */
    private Result success(Object data) {
        Result result = new Result();
        result.setFlag(true);
        result.setCode(StatusCode.OK);
        result.setMsg("查询成功!");
        result.setData(data);
        return result;
    }
}
WebProcess.java
package cn.cesi.webMagic.webMagic;

import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.util.NodeEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import org.jsoup.select.Elements;

import java.util.*;

/**
 * WebMagic page processor that extracts hot-list boards from the crawled page.
 *
 * <p>For every element matched by the board selector it collects the board title, the
 * logo image URL and a list of entries (link, entry title, hot score), and publishes the
 * result under the page field {@code "nodes"} as a {@code List<NodeEntity>}.
 */
@Component
public class WebProcess implements PageProcessor {

    /** Crawl settings returned by {@link #getSite()}. */
    private final Site site = Site.me()
            // NOTE(review): WebMagic appears to treat this value as milliseconds, which
            // makes 2 effectively no delay. Confirm whether 2000 was intended.
            .setSleepTime(2)
            .setRetryTimes(3) // number of retries
            .setRetrySleepTime(10000) // pause between retries
            .setTimeOut(60000) // timeout: 1000 * 60
            .setCharset("utf8");

    /**
     * Extracts all boards from {@code page} and stores them in the {@code "nodes"} field.
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        List<Selectable> boards =
                page.getHtml().css("div.bc div#Sortable div.cc-cd div").nodes();
        List<NodeEntity> nodes = new ArrayList<>();

        for (Selectable board : boards) {
            // Board title
            String title = textOf(board.css("div.cc-cd-ih div a div span").toString());

            // Board logo: read the src attribute of the matched img element
            String logoSrc = "";
            String logoHtml = board.css("div.cc-cd-ih div a div img").toString();
            if (logoHtml != null) {
                Document document = Jsoup.parse(logoHtml);
                Elements imgTags = document.select("img[src]");
                logoSrc = imgTags.attr("src");
            }

            // Second-level entries of the board
            List<Map<String, String>> maps = new ArrayList<>();
            for (Selectable entry : board.css("div.cc-cd-cb div a").nodes()) {
                Map<String, String> map = new HashMap<>();
                map.put("url", entry.links().toString());
                // A missing title or hot-score span yields an empty string rather than
                // an exception from parsing a null fragment.
                map.put("secondTitle", textOf(entry.css("div span.t").toString()));
                map.put("hot", textOf(entry.css("div span.e").toString()));
                maps.add(map);
            }

            NodeEntity node = new NodeEntity();
            node.setTitle(title);
            node.setLogo(logoSrc);
            node.setMaps(maps);
            nodes.add(node);
        }

        // Publish the extracted boards on the page result
        page.putField("nodes", nodes);
    }

    /**
     * @return the crawl settings used for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Returns the visible text of an HTML fragment.
     *
     * @param html fragment to parse; may be {@code null} when a selector matched nothing
     * @return the text content, or an empty string when {@code html} is {@code null}
     */
    private static String textOf(String html) {
        return html == null ? "" : Jsoup.parse(html).text();
    }
}
SpringPieline.java
package cn.cesi.webMagic.pieline;

import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.IdWorker;
import cn.cesi.webMagic.util.NodeEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.*;

/**
 * WebMagic pipeline that stores crawled entries in the database.
 *
 * <p>Reads the {@code "nodes"} result field and saves one {@link Node} for every entry
 * whose (title, secondTitle) pair is not yet returned by
 * {@code nodeService.findByTitleAndSecondTitle}.
 */
@Component
public class SpringPieline implements Pipeline {

    @Autowired
    NodeService nodeService;

    @Autowired
    IdWorker idWorker;

    /**
     * Persists every new entry found in {@code resultItems}.
     *
     * @param resultItems crawl result; the {@code "nodes"} field is read as a list of
     *                    {@link NodeEntity}
     * @param task        the crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<NodeEntity> nodes = resultItems.get("nodes");
        if (nodes == null) {
            // The processor published nothing for this page.
            return;
        }
        for (NodeEntity entity : nodes) {
            List<Map<String, String>> entries = entity.getMaps();
            if (entries == null) {
                continue;
            }
            for (Map<String, String> entry : entries) {
                try {
                    saveIfNew(entity.getTitle(), entity.getLogo(), entry);
                } catch (Exception e) {
                    // Best effort: a failing row is reported and the rest are still saved.
                    System.out.println(e);
                }
            }
        }
    }

    /** Saves one entry as a fresh {@link Node} unless it is blank or already stored. */
    private void saveIfNew(String title, String logo, Map<String, String> entry) {
        String rawSecondTitle = entry.get("secondTitle");
        String secondTitle = rawSecondTitle == null ? "" : rawSecondTitle.trim();
        System.out.println(secondTitle);

        if (title == null || title.isEmpty() || secondTitle.isEmpty()) {
            return;
        }
        List<Node> existing = nodeService.findByTitleAndSecondTitle(title, secondTitle);
        if (!existing.isEmpty()) {
            return;
        }

        // A new instance per row, so a saved entity is never mutated afterwards.
        Node node = new Node();
        node.setId(idWorker.nextId() + "");
        node.setTitle(title);
        node.setLogo(logo);
        node.setSecondTitle(secondTitle);
        node.setUrl(entry.get("url"));
        node.setHot(entry.get("hot"));
        node.setCreateDate(new Date());
        nodeService.save(node);
    }
}
index.vue
<template>
  <div>
    <h1>摸鱼热榜</h1>
    <van-search
      v-model="value"
      placeholder="请输入搜索关键词"
      @search="onSearch"
      @clear="onClear"
    />
    <!-- Category list: shown until a search has returned -->
    <div v-if="!searched">
      <div>仿今日热榜!,关注java项目开发,学习更多案例!</div>
      <div>
        <div>
          <div>全部热榜</div>
          <div>
            <div v-for="(item, index) in typeList" :key="index">
              <div @click="goDateils(item)">
                <div>
                  <img :src="item.logo" :alt="item.title" @error="imgError(item)" />
                </div>
                <div>{{ item.title }}</div>
                <div>
                  <svg-icon icon className="icon_search"></svg-icon>
                </div>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
    <!-- Search results, or the empty state when the search matched nothing -->
    <div v-else>
      <search-list v-if="listData.length" :list="listData" />
      <van-empty v-else description="暂无相关内容!" />
    </div>
  </div>
</template>

<script>
import SvgIcon from '@/components/icon/SvgIcon';
import searchList from '@/components/searchList/list';

export default {
  components: { SvgIcon, searchList },
  data() {
    return {
      value: '', // search keyword
      searched: false, // true once a search response has been applied
      listData: [], // search results
      typeList: [], // all hot-list categories
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category image
    };
  },
  computed: {},
  created() {
    this.getAllType();
  },
  mounted() {},
  methods: {
    // Load every hot-list category
    getAllType() {
      this.$api.getAllType().then(res => {
        if (res.code === 0) {
          this.typeList = res.data;
        }
      });
    },
    // Open the details page of a category; the item travels as a JSON query parameter
    goDateils(item) {
      this.$router.push({
        name: 'details',
        query: { item: JSON.stringify(item) }
      });
    },
    // Search all categories for the keyword
    onSearch(e) {
      const params = { typeName: '全部', size: 10000, secondTitle: e };
      this.$api.getAllInfoGzip(params).then(res => {
        if (res.code === 0) {
          const rows = res.data.content;
          // Add the derived fields before the rows become reactive,
          // so Vue tracks them like the other fields.
          this.handleData(rows);
          this.listData = rows;
          this.searched = true;
        }
      });
    },
    // Search box cleared: return to the category list
    onClear() {
      this.listData = [];
      this.searched = false;
    },
    // Attach the "new" flag and the display time to every row
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image failed to load: fall back to the default image
    imgError(item) {
      item.logo = this.defaultUrl;
    }
  }
};
</script>
details.vue
<template>
  <div>
    <div>
      <img :src="details.logo" @error="imgError" alt="" />
      <div>
        <div>
          <p @click="$router.push('/')">摸鱼热榜</p>
        </div>
        <img :src="details.logo" @error="imgError" alt="" />
        <h1>{{ details.title }}</h1>
      </div>
    </div>
    <div>
      <van-pull-refresh v-model="refreshing" @refresh="onRefresh">
        <van-list
          v-model="loading"
          :finished="finished"
          @load="onLoad"
          :immediate-check="false"
        >
          <div>
            <a v-for="(item, index) in listData" :key="item.id" :href="item.url">
              <div>
                <h4>{{ index + 1 }}、{{ item.secondTitle }}</h4>
                <div>
                  <span v-if="item.hot">
                    <span>{{ item.hot }}</span>
                  </span>
                  <span>
                    <span>{{ item.CreateTime }}</span>
                  </span>
                  <span v-if="item.new">新</span>
                </div>
              </div>
            </a>
          </div>
        </van-list>
      </van-pull-refresh>
    </div>
    <div v-if="finished">
      <p>我是有底线的</p>
    </div>
  </div>
</template>

<script>
export default {
  data() {
    return {
      page: 1, // current page number
      refreshing: false, // pull-to-refresh in progress
      loading: false, // load-more in progress
      finished: false, // no more pages to load
      listData: [], // entries shown in the list
      details: {}, // category being displayed
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category image
    };
  },
  computed: {},
  created() {},
  mounted() {
    // The category arrives as a JSON string in the route query
    this.details = JSON.parse(this.$route.query.item);
    this.getList(this.details, this.page);
  },
  methods: {
    // Fetch one page of the category and append it to the list
    getList(item, page, loading = true) {
      const previous = this.listData;
      const params = { typeName: item.title, size: 50, page };
      this.$api.getAllInfoGzip(params, loading).then(res => {
        if (res.code == 0) {
          const merged = previous.concat(res.data.content);
          // Add the derived fields before the new rows become reactive,
          // so Vue tracks them like the other fields.
          this.handleData(merged);
          this.listData = merged;
          // Load-more and pull-to-refresh are done
          this.loading = false;
          this.refreshing = false;
          // Last page reached
          if (this.page >= res.data.totalPages) {
            this.finished = true;
          }
        }
      });
    },
    // Scrolled to the bottom: load the next page
    onLoad() {
      this.loading = true;
      this.getList(this.details, ++this.page, false);
    },
    // Pulled down: reset the list and start again from page 1
    onRefresh() {
      this.finished = false;
      this.loading = true;
      this.listData = [];
      this.page = 1;
      this.getList(this.details, 1, false);
    },
    // Attach the "new" flag and the display time to every row
    handleData(data) {
      data.forEach(item => {
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image failed to load: fall back to the default image.
    // The template binds details.logo, so that is the property to replace; the guard
    // avoids an endless error loop if the default image itself cannot be loaded.
    imgError() {
      if (this.details.logo !== this.defaultUrl) {
        this.$set(this.details, 'logo', this.defaultUrl);
      }
    }
  }
};
</script>
xxx.sql
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for node
-- One row per crawled hot-list entry.
-- NOTE(review): the utf8mb4_0900_ai_ci collation needs MySQL 8.0 or later; confirm the
-- target server version or switch to a collation it supports.
-- ----------------------------
DROP TABLE IF EXISTS `node`;
CREATE TABLE `node` (
  `id` varchar(255) NOT NULL,
  `create_date` datetime DEFAULT NULL,
  `hot` varchar(1024) DEFAULT NULL,
  `second_title` longtext,
  `title` varchar(1024) DEFAULT NULL,
  `url` longtext,
  `logo` varchar(1024) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

-- Restore foreign key checking, which was switched off at the top of this script.
SET FOREIGN_KEY_CHECKS = 1;
作者:Java开发项目
链接:https://mp.weixin.qq.com/s/z9J1gL7orSL90ngSQeRRhg
欢迎大家关注:有故事的程序员,每天更新Java技术知识点,还可以领取Java进阶学习资料哦~
资料包含的模块分为19个模块,分别是: Java 基础、容器、多线程、反射、对象拷贝、Java Web 、异常、网络、设计模式、Spring/Spring MVC、Spring Boot/Spring Cloud、Hibernate、MyBatis、RabbitMQ、Kafka、Zookeeper、MySQL、Redis、JVM 。
原文转载:http://www.shaoqun.com/a/502819.html
出口易:https://www.ikjzd.com/w/1317
友家快递:https://www.ikjzd.com/w/1341
爬取各大热门APP案例功能效果图爬去数据的平台页面这个案例能爬取的平台太多了,我没有全部截图出来,想看的你们自己下载源码自己跑起来!爬取的热榜数据效果图环境介绍前端:vue+h5后端:springboot+webMagicjdk:1.8及以上数据库:mysql完整源码获取方式源码获取方式:点击这里,暗号博客园!核心代码介绍pom.<!--https://mvnrepository.com/a
汇通天下:汇通天下
weebly:weebly
适合去稻城亚丁的季节 稻城亚丁最佳旅游季节:适合去稻城亚丁的季节 稻城亚丁最佳旅游季节
青城山特产:长生宴 - :青城山特产:长生宴 -
2020五一期间去黄腾峡漂流注意哪些事项呢?:2020五一期间去黄腾峡漂流注意哪些事项呢?
No comments:
Post a Comment