In the previous post, HtmlAgilityPack was used to scrape wallpaper data and store the images in the database. This post continues with scraping hot news data from the major platforms across the web. As before, you can preview the finished result on my blog: https://meowv.com/hot. The approach is largely the same as the wallpaper scraper.

There are 18 sources to scrape this time: Cnblogs, V2EX, SegmentFault, Juejin, WeChat Hot, Douban Picks, ITHome, 36Kr, Baidu Tieba, Baidu Hot Search, Weibo Hot Search, Zhihu Hot List, Zhihu Daily, NetEase News, GitHub, Douyin Hot, Douyin Video, and Douyin Positive Energy.

The data again goes into the database, so step by step: first create the entity class and the custom repository. The entity is named HotNews. Here is the code:

//HotNews.cs
using System;
using Volo.Abp.Domain.Entities;

namespace Meowv.Blog.Domain.HotNews
{
    public class HotNews : Entity<Guid>
    {
        /// <summary>
        /// Title
        /// </summary>
        public string Title { get; set; }

        /// <summary>
        /// Link
        /// </summary>
        public string Url { get; set; }

        /// <summary>
        /// SourceId
        /// </summary>
        public int SourceId { get; set; }

        /// <summary>
        /// Creation time
        /// </summary>
        public DateTime CreateTime { get; set; }
    }
}
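The custom repository is left to the reader in this series; as a minimal sketch of its shape (my assumption, inferred from the DeleteAsync and BulkInsertAsync calls the job makes later), it might look like this:

//IHotNewsRepository.cs (sketch, not from the original post)
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using Volo.Abp.Domain.Repositories;

namespace Meowv.Blog.Domain.HotNews.Repositories
{
    public interface IHotNewsRepository : IRepository<HotNews, Guid>
    {
        // Custom bulk insert on top of ABP's standard repository methods.
        Task BulkInsertAsync(IEnumerable<HotNews> hotNews);
    }
}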
You can complete the rest yourself; in the end the database gets an empty table, meowv_hotnews. Then put the platforms into an enum class, HotNewsEnum.cs.

//HotNewsEnum.cs
using System.ComponentModel;

namespace Meowv.Blog.Domain.Shared.Enum
{
    public enum HotNewsEnum
    {
        [Description("博客园")] cnblogs = 1,
        [Description("V2EX")] v2ex = 2,
        [Description("SegmentFault")] segmentfault = 3,
        [Description("掘金")] juejin = 4,
        [Description("微信热门")] weixin = 5,
        [Description("豆瓣精选")] douban = 6,
        [Description("IT之家")] ithome = 7,
        [Description("36氪")] kr36 = 8,
        [Description("百度贴吧")] tieba = 9,
        [Description("百度热搜")] baidu = 10,
        [Description("微博热搜")] weibo = 11,
        [Description("知乎热榜")] zhihu = 12,
        [Description("知乎日报")] zhihudaily = 13,
        [Description("网易新闻")] news163 = 14,
        [Description("GitHub")] github = 15,
        [Description("抖音热点")] douyin_hot = 16,
        [Description("抖音视频")] douyin_video = 17,
        [Description("抖音正能量")] douyin_positive = 18
    }
}
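The Description attributes carry each platform's display name. To read them back at runtime (for example when returning the list to the blog frontend), a small reflection helper does the job; this one is my own sketch, not code from the original post:

using System.ComponentModel;
using System.Reflection;

public static class HotNewsEnumExtensions
{
    // Returns the [Description] text of an enum member, falling back to its name.
    public static string GetDescription(this HotNewsEnum source)
    {
        var field = source.GetType().GetField(source.ToString());
        var attribute = field?.GetCustomAttribute<DescriptionAttribute>();
        return attribute?.Description ?? source.ToString();
    }
}

For example, HotNewsEnum.cnblogs.GetDescription() returns "博客园".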
As with the wallpaper scraper, some preparation first. Add HotNewsJobItem<T> in the .Application.Contracts layer and HotNewsJob in the .BackgroundJobs layer to hold the crawler logic, injecting the repository IHotNewsRepository via the constructor.

//HotNewsJobItem.cs
using Meowv.Blog.Domain.Shared.Enum;

namespace Meowv.Blog.Application.Contracts.HotNews
{
    public class HotNewsJobItem<T>
    {
        /// <summary>
        /// <see cref="Result"/>
        /// </summary>
        public T Result { get; set; }

        /// <summary>
        /// Source
        /// </summary>
        public HotNewsEnum Source { get; set; }
    }
}
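The same wrapper is used in both directions: on the way in, Result carries the URL to fetch; on the way out it carries the fetched payload, which is why the job below works with HotNewsJobItem<object>. A tiny illustration with hypothetical values:

var input = new HotNewsJobItem<string>
{
    Result = "https://www.cnblogs.com",   // the URL to fetch
    Source = HotNewsEnum.cnblogs
};

// After fetching, the payload (an HtmlDocument or a JSON string) takes Result's place:
var output = new HotNewsJobItem<object>
{
    Result = "{ \"data\": [] }",
    Source = input.Source
};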
//HotNewsJob.cs
using Meowv.Blog.Domain.HotNews.Repositories;
using System;
using System.Net.Http;
using System.Threading.Tasks;

namespace Meowv.Blog.BackgroundJobs.Jobs.HotNews
{
    public class HotNewsJob : IBackgroundJob
    {
        private readonly IHttpClientFactory _httpClient;
        private readonly IHotNewsRepository _hotNewsRepository;

        public HotNewsJob(IHttpClientFactory httpClient,
                          IHotNewsRepository hotNewsRepository)
        {
            _httpClient = httpClient;
            _hotNewsRepository = hotNewsRepository;
        }

        public async Task ExecuteAsync()
        {
            throw new NotImplementedException();
        }
    }
}
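For IHttpClientFactory to be injectable here, the HTTP client services must be registered. The post doesn't show that step; assuming the standard Microsoft.Extensions.Http setup inside the ABP module, it would be:

public override void ConfigureServices(ServiceConfigurationContext context)
{
    // Registers IHttpClientFactory and the default HttpClient pipeline.
    context.Services.AddHttpClient();
}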
Next, pin down the data source URLs. Some of these sources return HTML while others return JSON directly; for convenience I also injected IHttpClientFactory. The assembled list of sources to scrape looks like this.

...
var hotnewsUrls = new List<HotNewsJobItem<string>>
{
    new HotNewsJobItem<string> { Result = "https://www.cnblogs.com", Source = HotNewsEnum.cnblogs },
    new HotNewsJobItem<string> { Result = "https://www.v2ex.com/?tab=hot", Source = HotNewsEnum.v2ex },
    new HotNewsJobItem<string> { Result = "https://segmentfault.com/hottest", Source = HotNewsEnum.segmentfault },
    new HotNewsJobItem<string> { Result = "https://web-api.juejin.im/query", Source = HotNewsEnum.juejin },
    new HotNewsJobItem<string> { Result = "https://weixin.sogou.com", Source = HotNewsEnum.weixin },
    new HotNewsJobItem<string> { Result = "https://www.douban.com/group/explore", Source = HotNewsEnum.douban },
    new HotNewsJobItem<string> { Result = "https://www.ithome.com", Source = HotNewsEnum.ithome },
    new HotNewsJobItem<string> { Result = "https://36kr.com/newsflashes", Source = HotNewsEnum.kr36 },
    new HotNewsJobItem<string> { Result = "http://tieba.baidu.com/hottopic/browse/topicList", Source = HotNewsEnum.tieba },
    new HotNewsJobItem<string> { Result = "http://top.baidu.com/buzz?b=341", Source = HotNewsEnum.baidu },
    new HotNewsJobItem<string> { Result = "https://s.weibo.com/top/summary/summary", Source = HotNewsEnum.weibo },
    new HotNewsJobItem<string> { Result = "https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total?limit=50&desktop=true", Source = HotNewsEnum.zhihu },
    new HotNewsJobItem<string> { Result = "https://daily.zhihu.com", Source = HotNewsEnum.zhihudaily },
    new HotNewsJobItem<string> { Result = "http://news.163.com/special/0001386F/rank_whole.html", Source = HotNewsEnum.news163 },
    new HotNewsJobItem<string> { Result = "https://github.com/trending", Source = HotNewsEnum.github },
    new HotNewsJobItem<string> { Result = "https://www.iesdouyin.com/web/api/v2/hotsearch/billboard/word", Source = HotNewsEnum.douyin_hot },
    new HotNewsJobItem<string> { Result = "https://www.iesdouyin.com/web/api/v2/hotsearch/billboard/aweme", Source = HotNewsEnum.douyin_video },
    new HotNewsJobItem<string> { Result = "https://www.iesdouyin.com/web/api/v2/hotsearch/billboard/aweme/?type=positive", Source = HotNewsEnum.douyin_positive },
};
...
A few of them are special: Juejin, Baidu Hot Search, and NetEase News. Juejin requires a POST request that returns JSON and needs specific request headers and a request body, so an HttpClient instance is created via IHttpClientFactory. Baidu Hot Search and NetEase News, the two old-timers, play tricks: their pages are GB2312-encoded, so the encoding has to be specified explicitly for them, otherwise everything scraped comes back garbled.

...
var web = new HtmlWeb();
var list_task = new List<Task<HotNewsJobItem<object>>>();

hotnewsUrls.ForEach(item =>
{
    var task = Task.Run(async () =>
    {
        var obj = new object();
        if (item.Source == HotNewsEnum.juejin)
        {
            using var client = _httpClient.CreateClient();
            client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.14 Safari/537.36 Edg/83.0.478.13");
            client.DefaultRequestHeaders.Add("X-Agent", "Juejin/Web");

            var data = "{\"extensions\":{\"query\":{\"id\":\"21207e9ddb1de777adeaca7a2fb38030\"}},\"operationName\":\"\",\"query\":\"\",\"variables\":{\"first\":20,\"after\":\"\",\"order\":\"THREE_DAYS_HOTTEST\"}}";
            var buffer = data.SerializeUtf8();
            var byteContent = new ByteArrayContent(buffer);
            byteContent.Headers.ContentType = new MediaTypeHeaderValue("application/json");

            var httpResponse = await client.PostAsync(item.Result, byteContent);
            obj = await httpResponse.Content.ReadAsStringAsync();
        }
        else
        {
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            obj = await web.LoadFromWebAsync(item.Result, (item.Source == HotNewsEnum.baidu || item.Source == HotNewsEnum.news163) ? Encoding.GetEncoding("GB2312") : Encoding.UTF8);
        }

        return new HotNewsJobItem<object>
        {
            Result = obj,
            Source = item.Source
        };
    });
    list_task.Add(task);
});
Task.WaitAll(list_task.ToArray());
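SerializeUtf8 is an extension method from the project's shared layer; a minimal equivalent (my sketch) just UTF-8-encodes the JSON payload:

public static class StringExtensions
{
    // Encode the request body as UTF-8 bytes for ByteArrayContent.
    public static byte[] SerializeUtf8(this string str) => System.Text.Encoding.UTF8.GetBytes(str);
}

Also, since ExecuteAsync is async anyway, var items = await Task.WhenAll(list_task); would be a non-blocking alternative to Task.WaitAll; the post blocks with WaitAll and awaits each task again in the loop below, which works as well.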
Looping over hotnewsUrls, note that HotNewsJobItem is returned as object, because the result is sometimes JSON and sometimes an HtmlDocument; object lets us receive both uniformly. Juejin gets special handling: an HttpClient POST request returning a JSON string. For Baidu Hot Search and NetEase News, Encoding.RegisterProvider(CodePagesEncodingProvider.Instance) registers the code-page encoding provider, and the page encoding is then passed to web.LoadFromWebAsync(...) when loading the page; I handle the choice with a ternary expression. With that step done, we can loop over list_task and pull out the data with XPath or by parsing the JSON.

...
var hotNews = new List<HotNews>();
foreach (var list in list_task)
{
    var item = await list;
    var sourceId = (int)item.Source;
    ...
}

if (hotNews.Any())
{
    await _hotNewsRepository.DeleteAsync(x => true);
    await _hotNewsRepository.BulkInsertAsync(hotNews);
}
This crawler is just as simple: we only need the title and the link, so the main task is finding the right list of a tags on each page. I don't think it's necessary to analyze every source one by one, so straight to the code.

// Cnblogs
if (item.Source == HotNewsEnum.cnblogs)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='post_item_body']/h3/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// V2EX
if (item.Source == HotNewsEnum.v2ex)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//span[@class='item_title']/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = $"https://www.v2ex.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}

// SegmentFault
if (item.Source == HotNewsEnum.segmentfault)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='news__item-info clearfix']/a")
                                           .Where(x => x.InnerText.IsNotNullOrEmpty())
                                           .ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.SelectSingleNode(".//h4").InnerText,
            Url = $"https://segmentfault.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
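// Note: IsNotNullOrEmpty above is an extension method from the project's shared layer,
// used to skip anchor nodes without text. A minimal equivalent (my sketch):
// public static bool IsNotNullOrEmpty(this string value) => !string.IsNullOrEmpty(value);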
// Juejin
if (item.Source == HotNewsEnum.juejin)
{
    var obj = JObject.Parse((string)item.Result);
    var nodes = obj["data"]["articleFeed"]["items"]["edges"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["node"]["title"].ToString(),
            Url = node["node"]["originalUrl"].ToString(),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
// WeChat Hot
if (item.Source == HotNewsEnum.weixin)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//ul[@class='news-list']/li/div[@class='txt-box']/h3/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// Douban Picks
if (item.Source == HotNewsEnum.douban)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='channel-item']/div[@class='bd']/h3/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// ITHome
if (item.Source == HotNewsEnum.ithome)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='lst lst-2 hot-list']/div[1]/ul/li/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// 36Kr
if (item.Source == HotNewsEnum.kr36)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='hotlist-main']/div[@class='hotlist-item-toptwo']/a[2]|//div[@class='hotlist-main']/div[@class='hotlist-item-other clearfloat']/div[@class='hotlist-item-other-info']/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = $"https://36kr.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// Baidu Tieba
if (item.Source == HotNewsEnum.tieba)
{
    var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText);
    var nodes = obj["data"]["bang_topic"]["topic_list"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["topic_name"].ToString(),
            Url = node["topic_url"].ToString().Replace("amp;", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
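// Note: Replace("amp;", "") strips the leftover entity fragment, turning "&amp;" into "&".
// A more general alternative (not what the post uses) decodes all HTML entities at once:
// var url = System.Net.WebUtility.HtmlDecode(node["topic_url"].ToString());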
// Baidu Hot Search
if (item.Source == HotNewsEnum.baidu)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//table[@class='list-table']//tr/td[@class='keyword']/a[@class='list-title']").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// Weibo Hot Search
if (item.Source == HotNewsEnum.weibo)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//table/tbody/tr/td[2]/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = $"https://s.weibo.com{x.GetAttributeValue("href", "").Replace("#", "%23")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
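// Note: Replace("#", "%23") percent-encodes the hash so it survives as part of the
// query string instead of being treated as a URL fragment. When building such links
// from raw text, Uri.EscapeDataString is the general tool (hypothetical topic value):
// var url = $"https://s.weibo.com/weibo?q={Uri.EscapeDataString("#某个热搜话题#")}";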
// Zhihu Hot List
if (item.Source == HotNewsEnum.zhihu)
{
    var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText);
    var nodes = obj["data"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["target"]["title"].ToString(),
            Url = $"https://www.zhihu.com/question/{node["target"]["id"]}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
// Zhihu Daily
if (item.Source == HotNewsEnum.zhihudaily)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='box']/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = $"https://daily.zhihu.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// NetEase News
if (item.Source == HotNewsEnum.news163)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='area-half left']/div[@class='tabBox']/div[@class='tabContents active']/table//tr/td[1]/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// GitHub
if (item.Source == HotNewsEnum.github)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//article[@class='Box-row']/h1/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText.Trim().Replace("\n", "").Replace(" ", ""),
            Url = $"https://github.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
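// Note: trending titles come back as "owner / repo" spread across lines, hence the
// chained Trim/Replace calls. A regex (an alternative, not in the original) collapses
// all whitespace in one pass:
// var title = System.Text.RegularExpressions.Regex.Replace(x.InnerText, @"\s+", "");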
// Douyin Hot
if (item.Source == HotNewsEnum.douyin_hot)
{
    var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText);
    var nodes = obj["word_list"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["word"].ToString(),
            // Hot words have no link of their own, so the hot value is stored as a placeholder anchor
            Url = $"#{node["hot_value"]}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
// Douyin Video & Douyin Positive Energy
if (item.Source == HotNewsEnum.douyin_video || item.Source == HotNewsEnum.douyin_positive)
{
    var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText);
    var nodes = obj["aweme_list"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["aweme_info"]["desc"].ToString(),
            Url = node["aweme_info"]["share_url"].ToString(),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
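All the JSON branches above use Newtonsoft.Json's LINQ-to-JSON API (JObject.Parse plus token indexers); the excerpts omit its using directive:

using Newtonsoft.Json.Linq;  // JObject, JToken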
Cast item.Result to the appropriate type for each source; once all the data has been collected, we first delete all existing rows and then bulk-insert the new ones. Then create the extension method UseHotNewsJob() and call it in the module class.

//MeowvBlogBackgroundJobsExtensions.cs
...
/// <summary>
/// Daily hot news scraping
/// </summary>
/// <param name="service"></param>
public static void UseHotNewsJob(this IServiceProvider service)
{
    var job = service.GetService<HotNewsJob>();

    RecurringJob.AddOrUpdate("每日热点数据抓取", () => job.ExecuteAsync(), CronType.Hour(1, 2));
}
...
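CronType.Hour comes from an earlier post in this series. Assuming CronType.Hour(1, 2) means "at minute 1 of every 2nd hour", the equivalent registration with a plain cron expression would be:

RecurringJob.AddOrUpdate("每日热点数据抓取", () => job.ExecuteAsync(), "1 */2 * * *");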
The recurring job is set to run once every 2 hours.

...
public override void OnApplicationInitialization(ApplicationInitializationContext context)
{
    ...
    var service = context.ServiceProvider;
    ...
    service.UseHotNewsJob();
}
Compile and run, and our job now shows up among Hangfire's recurring jobs. By default it won't execute until its scheduled time, so trigger it manually and wait a moment to see the effect.

Once it finishes, all the hot news data has been saved to the database, which means the crawler is done, and Hangfire will keep running it on the given schedule. Did you get all that?

Source code: https://github.com/Meowv/Blog/tree/blog_tutorial