爬虫封装:基于 HttpWebRequest 的版本(HttpClient 版本后续再介绍)
/// <summary>
/// Static helper that downloads the body of a URL as text using
/// <see cref="HttpWebRequest"/>. Failures are logged and reported through the
/// return value instead of being thrown to the caller.
/// </summary>
public class HttpHelper
{
    private static readonly Logger logger = new Logger(typeof(HttpHelper));

    /// <summary>
    /// Downloads <paramref name="url"/> and decodes the response body as UTF-8.
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <returns>Same contract as <see cref="DownloadHtml"/>.</returns>
    public static string DownloadUrl(string url)
    {
        return DownloadHtml(url, Encoding.UTF8);
    }

    /// <summary>
    /// Issues a request for <paramref name="url"/> with a 30 second timeout and a
    /// desktop Chrome user agent, and returns the response body decoded with
    /// <paramref name="encode"/>.
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <param name="encode">Encoding used to decode the response stream.</param>
    /// <returns>
    /// The response body when the status is 200 OK; an empty string when the
    /// server answered with another status or the request failed with a
    /// <see cref="WebException"/> other than 306; <c>null</c> when reading the
    /// body failed, the server answered 306, or any other exception occurred.
    /// </returns>
    public static string DownloadHtml(string url, Encoding encode)
    {
        string html = string.Empty;
        try
        {
            // Direct cast: a non-HTTP scheme (file://, ftp://) fails loudly with
            // InvalidCastException, which the generic handler below logs, instead
            // of a NullReferenceException on the next line.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 30 * 1000;
            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36";
            request.ContentType = "text/html; charset=utf-8";
            request.CookieContainer = new CookieContainer();
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                }
                else
                {
                    try
                    {
                        // using guarantees the reader (and the underlying stream)
                        // is released even when ReadToEnd throws.
                        using (StreamReader reader = new StreamReader(response.GetResponseStream(), encode))
                        {
                            html = reader.ReadToEnd();
                        }
                    }
                    catch (Exception ex)
                    {
                        // Plain interpolation: wrapping it in string.Format would
                        // throw FormatException for URLs containing '{' or '}'.
                        logger.Error($"DownloadHtml抓取{url}失败", ex);
                        html = null;
                    }
                }
            }
        }
        catch (System.Net.WebException ex)
        {
            // The error response (if any) holds a connection; dispose it when done.
            using (HttpWebResponse errorResponse = ex.Response as HttpWebResponse)
            {
                // Detect 306 from the status code; the message text is localized
                // and only matches on a Chinese-language runtime. The message
                // comparison is kept as a fallback for compatibility.
                bool isStatus306 =
                    (errorResponse != null && (int)errorResponse.StatusCode == 306)
                    || ex.Message.Equals("远程服务器返回错误: (306)。");

                if (isStatus306)
                {
                    logger.Error("远程服务器返回错误: (306)。", ex);
                    html = null;
                }
                else
                {
                    // Previously swallowed without a trace. The return value stays
                    // an empty string so existing callers see no change.
                    logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
                }
            }
        }
        catch (Exception ex)
        {
            logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
            html = null;
        }
        return html;
    }
}
使用示例:
/// <summary>
/// Crawls a three-level category tree starting from
/// <c>{Constant.TencentClassUrl}/weixin_41181778</c> and saves the collected
/// <see cref="TencentCategoryEntity"/> items through <c>CategoryRepository</c>.
/// Pages that cannot be downloaded or do not match the expected XPath are
/// logged and skipped instead of aborting the whole crawl.
/// </summary>
public class CategorySearch : ISearch
{
    // Display name of the "all" pseudo-category; it is recorded but never descended into.
    private const string AllCategoryName = "全部";

    // Query suffix appended to every sub-level URL before it is downloaded.
    private const string TrackingQuery = "&tuin=7e4f8b7d";

    // XPath of the <dd> entries that hold level-2 and level-3 category links.
    private const string SubCategoryPath = "//*[@id='auto-test-1']/div[1]/dl/dd";

    // Number of characters taken after "mt=" / "st=" / "tt=" as the category code.
    private const int CodeLength = 4;

    private static Logger logger = new Logger(typeof(CategorySearch));

    // Running counter used as the Id of each entity, in discovery (depth-first) order.
    private int _Count = 1;

    /// <summary>
    /// Entry point: downloads the root page, walks every level-1 link found
    /// there (each of which recursively collects its level-2 and level-3
    /// children) and saves the result. Exceptions are logged, not rethrown;
    /// the number of collected categories is always written to the console.
    /// </summary>
    public void Crawler()
    {
        List<TencentCategoryEntity> categoryList = new List<TencentCategoryEntity>();
        try
        {
            string url = $"{Constant.TencentClassUrl}/weixin_41181778";
            string firstPath = @"//*[@id=""mainBox""]/main/div[2]/div[1]/h4/a";
            HtmlNodeCollection nodeList = DownloadAndSelect(url, firstPath);
            if (nodeList == null)
            {
                // Nothing to crawl (already logged by DownloadAndSelect); the
                // finally block still reports the zero count.
                return;
            }
            foreach (HtmlNode node in nodeList)
            {
                categoryList.AddRange(this.First(node.InnerHtml, null));
            }
            CategoryRepository categoryRepository = new CategoryRepository();
            categoryRepository.Save(categoryList);
        }
        catch (Exception ex)
        {
            logger.Error("CrawlerMuti出现异常", ex);
        }
        finally
        {
            Console.WriteLine($"类型数据初始化完成,共抓取类别{ categoryList?.Count}个");
        }
    }

    /// <summary>
    /// Builds the level-1 category described by <paramref name="html"/> and,
    /// unless it is the "all" pseudo-category, appends its level-2 subtree.
    /// </summary>
    /// <param name="html">Markup expected to contain an <c>a</c> element with an <c>h2</c> child.</param>
    /// <param name="parentCode">Code of the parent category (null for the top level).</param>
    /// <returns>The level-1 entity followed by its descendants; empty when the markup does not match.</returns>
    private List<TencentCategoryEntity> First(string html, string parentCode)
    {
        List<TencentCategoryEntity> categoryList = new List<TencentCategoryEntity>();
        if (string.IsNullOrEmpty(html))
        {
            return categoryList;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        HtmlNode name = doc.DocumentNode.SelectSingleNode("//a/h2");
        HtmlNode codeNode = doc.DocumentNode.SelectSingleNode("//a");
        if (name == null || codeNode == null)
        {
            // SelectSingleNode yields null when nothing matches; skip this entry
            // rather than failing the whole crawl with a NullReferenceException.
            logger.Warn("一级类别节点缺少a或h2元素,已跳过");
            return categoryList;
        }

        string href = codeNode.Attributes["href"]?.Value;
        string code = string.Empty;
        // Unlike the deeper levels, the href is only normalized when it carries
        // an "mt=" code (existing behavior, kept as is).
        if (href != null && href.IndexOf("mt=", StringComparison.Ordinal) != -1)
        {
            href = NormalizeHref(href);
            code = ExtractCode(href, "mt=");
        }

        TencentCategoryEntity category = new TencentCategoryEntity()
        {
            Id = _Count++,
            State = 1,
            CategoryLevel = 1,
            Code = code,
            ParentCode = parentCode
        };
        category.Name = name.InnerText;
        category.Url = href;
        categoryList.Add(category);

        // Without an href there is no page to descend into.
        if (name.InnerText != AllCategoryName && !string.IsNullOrEmpty(href))
        {
            categoryList.AddRange(this.Second($"{Constant.TencentClassUrl}{href}{TrackingQuery}", code));
        }
        return categoryList;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and collects its level-2 categories
    /// (code taken from "st="), each followed by its level-3 children.
    /// </summary>
    private List<TencentCategoryEntity> Second(string url, string parentCode)
    {
        return CrawlSubLevel(url, parentCode, 2, "st=");
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and collects its level-3 categories
    /// (code taken from "tt="). This is the deepest level; no further descent.
    /// </summary>
    private List<TencentCategoryEntity> Third(string url, string parentCode)
    {
        return CrawlSubLevel(url, parentCode, 3, "tt=");
    }

    /// <summary>
    /// Shared implementation of levels 2 and 3: one entity per <c>dd</c> entry
    /// on the page; level-2 entries other than "all" also pull in level 3.
    /// </summary>
    private List<TencentCategoryEntity> CrawlSubLevel(string url, string parentCode, int level, string codeKey)
    {
        List<TencentCategoryEntity> categoryList = new List<TencentCategoryEntity>();
        HtmlNodeCollection nodeList = DownloadAndSelect(url, SubCategoryPath);
        if (nodeList == null)
        {
            return categoryList;
        }

        foreach (HtmlNode node in nodeList)
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(node.InnerHtml);
            HtmlNode codeNode = htmlDocument.DocumentNode.SelectSingleNode("//a");
            if (codeNode == null)
            {
                // A <dd> without a link carries no category information.
                continue;
            }

            string href = NormalizeHref(codeNode.Attributes["href"]?.Value);
            string code = ExtractCode(href, codeKey);

            // The Id is taken before descending so that parents are numbered
            // ahead of their children, as before.
            TencentCategoryEntity category = new TencentCategoryEntity()
            {
                Id = _Count++,
                State = 1,
                CategoryLevel = level,
                Code = code,
                ParentCode = parentCode
            };
            category.Name = codeNode.InnerText;
            category.Url = href;
            categoryList.Add(category);

            bool canDescend = level == 2
                && codeNode.InnerText != AllCategoryName
                && !string.IsNullOrEmpty(href);
            if (canDescend)
            {
                categoryList.AddRange(this.Third($"{Constant.TencentClassUrl}{href}{TrackingQuery}", code));
            }
        }
        return categoryList;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the nodes matching
    /// <paramref name="xpath"/>, or null (after logging a warning) when the
    /// download produced no content or nothing matched.
    /// </summary>
    private static HtmlNodeCollection DownloadAndSelect(string url, string xpath)
    {
        string html = HttpHelper.DownloadUrl(url);
        if (string.IsNullOrEmpty(html))
        {
            // DownloadUrl reports failures as null or empty; LoadHtml(null) would throw.
            logger.Warn($"抓取{url}未返回内容,已跳过");
            return null;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
        if (nodes == null)
        {
            logger.Warn($"抓取{url}未匹配到任何节点,已跳过");
        }
        return nodes;
    }

    /// <summary>Replaces every ';' in the href with '&amp;'; null stays null.</summary>
    private static string NormalizeHref(string href)
    {
        // NOTE(review): presumably this targets "&amp;" entities left in the raw
        // attribute value — confirm against the live markup.
        return href == null ? null : href.Replace(";", "&");
    }

    /// <summary>
    /// Returns up to <see cref="CodeLength"/> characters following the first
    /// occurrence of <paramref name="key"/> in <paramref name="href"/>, or an
    /// empty string when the href is null or does not contain the key.
    /// </summary>
    private static string ExtractCode(string href, string key)
    {
        if (string.IsNullOrEmpty(href))
        {
            return string.Empty;
        }
        int keyIndex = href.IndexOf(key, StringComparison.Ordinal);
        if (keyIndex == -1)
        {
            return string.Empty;
        }
        int start = keyIndex + key.Length;
        // Clamp so a value shorter than CodeLength at the end of the href does
        // not make Substring throw ArgumentOutOfRangeException.
        int length = Math.Min(CodeLength, href.Length - start);
        return href.Substring(start, length);
    }
}