-
Notifications
You must be signed in to change notification settings - Fork 170
Open
Description
非常感谢开源,测试了几个网页发现一些小问题
注释错误,标题建议匹配H2和H3
另外压缩的网页有换行
H1中的内容有时候并不能被设定为标题
做了小调整
/// <summary>
/// Extracts a page title from raw HTML.
/// </summary>
/// <remarks>
/// The text of the <c>&lt;title&gt;</c> element is used as the starting point. Because that
/// text often carries extra decoration (site name, separators), the method then looks at
/// the h1, h2 and h3 headings in that order: a heading level is considered only when it
/// occurs exactly once in the document, and its text replaces the title only when the
/// trimmed title contains it. The first heading level that qualifies wins; lower levels
/// are not allowed to override it.
/// </remarks>
/// <param name="html">Raw HTML source of the page.</param>
/// <returns>
/// The heading text (trimmed) when a qualifying heading is found; otherwise the
/// <c>&lt;title&gt;</c> text with tags removed, or an empty string when there is no title.
/// </returns>
private static string GetTitle(string html)
{
    // "[^>]*" instead of ".*?": '.' does not match a newline without RegexOptions.Singleline,
    // so tags whose attributes wrap across lines would otherwise be missed.
    const string tagFilter = @"<[^>]*>";
    const string titleFilter = @"<title>[\s\S]*?</title>";

    string title = "";
    Match titleMatch = Regex.Match(html, titleFilter, RegexOptions.IgnoreCase);
    if (titleMatch.Success)
    {
        title = Regex.Replace(titleMatch.Value, tagFilter, "");
    }

    // The comparison below always runs against the original <title> text.
    string trimmedTitle = title.Trim();

    for (int level = 1; level <= 3; level++)
    {
        string headingFilter = @"<h" + level + @"\b[^>]*>[\s\S]*?</h" + level + ">";
        MatchCollection headings = Regex.Matches(html, headingFilter, RegexOptions.IgnoreCase);

        // A level that appears several times is section structure, not the page title.
        if (headings.Count != 1)
        {
            continue;
        }

        string heading = Regex.Replace(headings[0].Value, tagFilter, "").Trim();
        if (!String.IsNullOrEmpty(heading) && trimmedTitle.Contains(heading))
        {
            title = heading;
            // Stop here: a lower-level heading that happens to be a substring of this
            // one must not shrink the title further.
            break;
        }
    }

    return title;
}
Metadata
Metadata
Assignees
Labels
No labels