Skip to content

Commit e198413

Browse files
committed
重构HTTPCLIENT POOL
1 parent 6eb044d commit e198413

File tree

6 files changed

+308
-59
lines changed

6 files changed

+308
-59
lines changed

src/DotnetSpider.Core.Test/Downloader/HttpClientDownloaderTest.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ public void Ports()
3030

3131
for (int i = 0; i < 100; i++)
3232
{
33-
downloader.Download(new Request("http://www.163.com", null), spider);
33+
var a = downloader.Download(new Request("http://www.163.com", null), spider).Result;
3434
}
3535
}
3636

@@ -40,12 +40,12 @@ public void DetectDownloadContent()
4040
HttpClientDownloader downloader = new HttpClientDownloader();
4141
DefaultSpider spider = new DefaultSpider("abcd", new Site { });
4242

43-
downloader.Download(new Request("http://www.163.com", null), spider);
43+
var a = downloader.Download(new Request("http://www.163.com", null), spider);
4444
Assert.Equal(Core.Infrastructure.ContentType.Html, spider.Site.ContentType);
4545

4646
HttpClientDownloader2 downloader2 = new HttpClientDownloader2();
4747
DefaultSpider spider2 = new DefaultSpider("abcd", new Site { });
48-
downloader2.Download(new Request("http://www.163.com", null), spider2);
48+
a = downloader2.Download(new Request("http://www.163.com", null), spider2);
4949
Assert.Equal(Core.Infrastructure.ContentType.Json, spider2.Site.ContentType);
5050
}
5151

@@ -67,9 +67,9 @@ public void SetContentType()
6767
}
6868
};
6969
var downloader = new HttpClientDownloader();
70-
downloader.Download(new Request("http://163.com", null), new DefaultSpider("test", site1));
70+
var a = downloader.Download(new Request("http://163.com", null), new DefaultSpider("test", site1));
7171

72-
downloader.Download(new Request("http://163.com", null), new DefaultSpider("test", site2));
72+
a = downloader.Download(new Request("http://163.com", null), new DefaultSpider("test", site2));
7373
}
7474

7575
[Fact(DisplayName = "_404Url")]

src/DotnetSpider.Core.Test/Downloader/HttpClientPoolTest.cs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using DotnetSpider.Core.Downloader;
22
using DotnetSpider.Core.Proxy;
33
using System;
4+
using System.Collections.Concurrent;
45
using System.Collections.Generic;
56
using System.Net.Http;
67
using System.Text;
@@ -14,16 +15,16 @@ public class HttpClientPoolTest
1415
[Fact(DisplayName = "MultiThreadProxy")]
1516
public void MultiThreadProxy()
1617
{
17-
IHttpClientPool pool = new IHttpClientPool();
18+
IHttpClientPool pool = new HttpClientPool();
1819
var spider = new DefaultSpider();
1920
var downloader = new HttpClientDownloader();
2021

21-
System.Collections.Concurrent.ConcurrentDictionary<HttpClientElement, int> tonggi = new System.Collections.Concurrent.ConcurrentDictionary<HttpClientElement, int>();
22+
ConcurrentDictionary<HttpClientEntry, int> tonggi = new ConcurrentDictionary<HttpClientEntry, int>();
2223
Parallel.For(0, 1000, new ParallelOptions { MaxDegreeOfParallelism = 1 }, (i) =>
2324
{
2425
var port = i % 10;
25-
var proxy = new UseSpecifiedUriWebProxy(new Uri($"http://192.168.10.1:{port}"), null, false);
26-
var item = pool.GetHttpClient(spider, downloader, new System.Net.CookieContainer(), proxy, null);
26+
var proxy = new UseSpecifiedUriWebProxy(new Uri($"http://192.168.10.1:{port}"));
27+
var item = pool.GetHttpClient(proxy.Hash);
2728

2829
if (tonggi.ContainsKey(item))
2930
{

src/DotnetSpider.Core/Downloader/HttpClientDownloader.cs

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
using System.Runtime.CompilerServices;
1212
using System.Net;
1313
using System.Threading.Tasks;
14+
using System.Runtime.Serialization.Formatters.Binary;
1415

1516
namespace DotnetSpider.Core.Downloader
1617
{
@@ -22,6 +23,8 @@ namespace DotnetSpider.Core.Downloader
2223
/// </summary>
2324
public class HttpClientDownloader : BaseDownloader
2425
{
26+
private HashSet<string> _initedCookieContainers = new HashSet<string>();
27+
2528
/// <summary>
2629
/// What mediatype should not be treated as file to download.
2730
/// </summary>
@@ -55,7 +58,7 @@ public class HttpClientDownloader : BaseDownloader
5558
/// <summary xml:lang="zh-CN">
5659
/// HttpClient池
5760
/// </summary>
58-
public static IHttpClientPool HttpClientPool = new IHttpClientPool();
61+
public static IHttpClientPool HttpClientPool = new HttpClientPool();
5962

6063
/// <summary>
6164
/// Constructor
@@ -110,25 +113,23 @@ protected override Task<Page> DowloadContent(Request request, ISpider spider)
110113
{
111114
var httpMessage = GenerateHttpRequestMessage(request, spider.Site);
112115

113-
HttpClientElement httpClientItem;
116+
HttpClientEntry httpClientEntry;
114117
if (spider.Site.HttpProxyPool == null)
115118
{
116119
// Request可以设置不同的DownloaderGroup来使用不同的HttpClient
117-
httpClientItem = HttpClientPool.GetHttpClient(spider, this, CookieContainer, request.DownloaderGroup, CookieInjector);
120+
httpClientEntry = HttpClientPool.GetHttpClient(request.DownloaderGroup);
118121
}
119122
else
120123
{
121124
// TODO: 代理模式下: request.DownloaderGroup 再考虑
122125
var proxy = spider.Site.HttpProxyPool.GetProxy();
123126
request.Proxy = proxy;
124-
httpClientItem = HttpClientPool.GetHttpClient(spider, this, CookieContainer, proxy, CookieInjector);
125-
}
126-
if (!Equals(httpClientItem.Client.Timeout.TotalSeconds, _timeout))
127-
{
128-
httpClientItem.Client.Timeout = new TimeSpan(0, 0, (int)_timeout);
127+
httpClientEntry = HttpClientPool.GetHttpClient(proxy.Hash);
129128
}
130129

131-
response = NetworkCenter.Current.Execute("http", () => httpClientItem.Client.SendAsync(httpMessage).Result);
130+
PrepareHttpClient(httpClientEntry);
131+
132+
response = NetworkCenter.Current.Execute("http", () => httpClientEntry.Client.SendAsync(httpMessage).Result);
132133
request.StatusCode = response.StatusCode;
133134
response.EnsureSuccessStatusCode();
134135

@@ -177,6 +178,28 @@ protected override Task<Page> DowloadContent(Request request, ISpider spider)
177178
}
178179
}
179180

181+
/// <summary>
/// Hands the pooled entry its one-time configuration: a timeout adjustment and a
/// cookie container produced by <see cref="CopyCookieContainer"/>.
/// </summary>
/// <param name="httpClientEntry">The pooled entry about to be used for a request.</param>
private void PrepareHttpClient(HttpClientEntry httpClientEntry)
{
    // The entry decides whether this callback runs; it is passed to Init as-is.
    Action applyTimeout = () =>
    {
        var client = httpClientEntry.Client;
        if (Equals(client.Timeout.TotalSeconds, _timeout))
        {
            return;
        }

        client.Timeout = new TimeSpan(0, 0, (int)_timeout);
    };

    httpClientEntry.Init(applyTimeout, CopyCookieContainer);
}
191+
192+
/// <summary>
/// Produces an independent deep copy of this downloader's <c>CookieContainer</c>
/// by serializing it to memory and deserializing it again.
/// </summary>
/// <returns>A new <see cref="CookieContainer"/> instance read back from the serialized bytes.</returns>
private CookieContainer CopyCookieContainer()
{
    // NOTE(review): BinaryFormatter is obsolete and removed in recent .NET versions;
    // this copy strategy will need replacing if the target framework is upgraded.
    var serializer = new BinaryFormatter();
    using (var buffer = new MemoryStream())
    {
        serializer.Serialize(buffer, CookieContainer);

        // Rewind so deserialization reads from the first byte.
        buffer.Position = 0;

        object clone = serializer.Deserialize(buffer);
        return (CookieContainer)clone;
    }
}
202+
180203
private Page CreateRetryPage(Exception e, Request request, ISpider spider)
181204
{
182205
Page page = spider.Site.CycleRetryTimes > 0 ? spider.Site.AddToCycleRetry(request) : new Page(request);

src/DotnetSpider.Core/Downloader/HttpClientPool.1.cs

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System;
1+
using DotnetSpider.Core.Infrastructure;
2+
using System;
23
using System.Collections.Concurrent;
34
using System.Collections.Generic;
45
using System.IO;
@@ -18,5 +19,83 @@ namespace DotnetSpider.Core.Downloader
1819
/// </summary>
1920
public class HttpClientPool : IHttpClientPool
{
    // Every CleanupInterval-th call to GetHttpClient triggers a sweep of idle entries.
    private const int CleanupInterval = 100;

    // An entry whose ActiveTime is more than this many seconds old is evicted by the sweep.
    private const int MaxIdleSeconds = 240;

    // Counts GetHttpClient calls; used only to schedule the periodic sweep.
    private AutomicLong _getHttpClientCount = new AutomicLong(0);

    // Pooled entries keyed by group hash. Accessed only from the synchronized
    // public methods below and from CleanupPool, which GetHttpClient calls.
    private readonly Dictionary<string, HttpClientEntry> _pool = new Dictionary<string, HttpClientEntry>();

    /// <summary>
    /// Get a <see cref="HttpClientEntry"/> from the pool.
    /// Returns the same <see cref="HttpClientEntry"/> instance whenever <paramref name="hash"/> is the same,
    /// so that related pages share one CookieContainer.
    /// </summary>
    /// <remarks>
    /// Design motivation: some sites check parts of the cookie for continuity between requests,
    /// so pages such as www.a.com/keyword=xxxx&amp;page=1 and www.a.com/keyword=xxxx&amp;page=2
    /// must be fetched through the same HttpClient.
    /// </remarks>
    /// <param name="hash">Hash that identifies the group; null or whitespace is treated as the empty string.</param>
    /// <returns>The existing entry for <paramref name="hash"/>, or a newly created one.</returns>
    [MethodImpl(MethodImplOptions.Synchronized)]
    public HttpClientEntry GetHttpClient(string hash)
    {
        if (string.IsNullOrWhiteSpace(hash))
        {
            hash = string.Empty;
        }

        _getHttpClientCount.Inc();
        if (_getHttpClientCount.Value % CleanupInterval == 0)
        {
            // The sweep runs before the lookup, so a stale entry for this very
            // hash is evicted and replaced by a fresh one below.
            CleanupPool();
        }

        HttpClientEntry entry;
        if (_pool.TryGetValue(hash, out entry))
        {
            // Refresh the timestamp so the entry survives the next sweep.
            entry.ActiveTime = DateTime.Now;
            return entry;
        }

        entry = new HttpClientEntry();
        _pool.Add(hash, entry);
        return entry;
    }

    /// <summary>
    /// Add a cookie to the CookieContainer of every entry currently in the pool.
    /// </summary>
    /// <param name="cookie">Cookie to add.</param>
    [MethodImpl(MethodImplOptions.Synchronized)]
    public void AddCookie(Cookie cookie)
    {
        foreach (HttpClientEntry entry in _pool.Values)
        {
            entry.Handler.CookieContainer.Add(cookie);
        }
    }

    /// <summary>
    /// Removes every entry that has been idle for more than <see cref="MaxIdleSeconds"/> seconds
    /// and disposes its HttpClient.
    /// </summary>
    private void CleanupPool()
    {
        DateTime now = DateTime.Now;

        // Collect keys first: the dictionary must not be modified while it is enumerated.
        List<string> staleKeys = new List<string>();
        foreach (KeyValuePair<string, HttpClientEntry> pair in _pool)
        {
            if ((now - pair.Value.ActiveTime).TotalSeconds > MaxIdleSeconds)
            {
                staleKeys.Add(pair.Key);
            }
        }

        // NOTE(review): an evicted client is disposed even if a caller still holds
        // the entry returned earlier by GetHttpClient — confirm this is acceptable.
        foreach (string key in staleKeys)
        {
            HttpClientEntry entry = _pool[key];
            if (_pool.Remove(key))
            {
                entry.Client.Dispose();
            }
        }
    }
}
22101
}

src/DotnetSpider.Core/Downloader/IHttpClientPool.cs

Lines changed: 51 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,51 @@
55

66
namespace DotnetSpider.Core.Downloader
77
{
8-
/// <summary>
9-
/// HttpClient Infomations
10-
/// </summary>
11-
/// <summary xml:lang="zh-CN">
12-
/// HttpClient信息封装
13-
/// </summary>
14-
public class HttpClientElement
8+
/// <summary>
/// A pooled <see cref="HttpClient"/> bundled with the <see cref="HttpClientHandler"/> it was built on
/// and a timestamp the pool owner can use to detect idle entries.
/// </summary>
public class HttpClientEntry
{
    // Set to true once Init has completed; later Init calls and cookie-container
    // assignments are ignored from then on.
    private bool _inited;

    /// <summary>
    /// Time this entry was created or last marked active by its owner.
    /// </summary>
    public DateTime ActiveTime { get; set; }

    /// <summary>
    /// The <see cref="HttpClient"/> built on <see cref="Handler"/>.
    /// </summary>
    public HttpClient Client { get; private set; }

    /// <summary>
    /// The <see cref="HttpClientHandler"/> backing <see cref="Client"/>.
    /// </summary>
    internal HttpClientHandler Handler { get; private set; }

    /// <summary>
    /// Creates the handler (gzip/deflate decompression, proxy and cookies enabled,
    /// up to 10 automatic redirects) and the client that uses it.
    /// </summary>
    public HttpClientEntry()
    {
        Handler = new HttpClientHandler
        {
            AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip,
            UseProxy = true,
            UseCookies = true,
            AllowAutoRedirect = true,
            MaxAutomaticRedirections = 10
        };
        Client = new HttpClient(Handler);
        ActiveTime = DateTime.Now;
    }

    /// <summary>
    /// Replaces the handler's cookie container, but only before <see cref="Init"/> has completed;
    /// afterwards the assignment is ignored.
    /// </summary>
    internal CookieContainer CookieContainer
    {
        set
        {
            if (_inited)
            {
                return;
            }

            // The original setter dropped the value in every path; apply it to the handler.
            Handler.CookieContainer = value;
        }
    }

    /// <summary>
    /// Performs one-time setup of this entry. The first call runs <paramref name="configAction"/>
    /// and installs the container returned by <paramref name="cookieContainerFactory"/>;
    /// every later call returns immediately.
    /// </summary>
    /// <param name="configAction">Callback that configures the client (for example its timeout).</param>
    /// <param name="cookieContainerFactory">Produces the cookie container to install on the handler.</param>
    [MethodImpl(MethodImplOptions.Synchronized)]
    internal void Init(Action configAction, Func<CookieContainer> cookieContainerFactory)
    {
        if (_inited)
        {
            return;
        }

        configAction();
        Handler.CookieContainer = cookieContainerFactory();

        // Flag is set last, so a throwing callback leaves the entry uninitialized.
        _inited = true;
    }
}
3955

@@ -58,6 +74,16 @@ public interface IHttpClientPool
5874
/// <param name="hash">分组的哈希 Hashcode to identify different group.</param>
5975
/// <returns>The pooled <see cref="HttpClientEntry"/> for the given hash.</returns>
6076
[MethodImpl(MethodImplOptions.Synchronized)]
61-
HttpClientElement GetHttpClient(string hash);
77+
HttpClientEntry GetHttpClient(string hash);
78+
79+
/// <summary>
80+
/// Add cookie to <see cref="IHttpClientPool"/>
81+
/// </summary>
82+
/// <summary xml:lang="zh-CN">
83+
/// 更新池中所有HttpClient对象的 Cookie
84+
/// </summary>
85+
/// <param name="cookie">Cookie</param>
86+
[MethodImpl(MethodImplOptions.Synchronized)]
87+
void AddCookie(Cookie cookie);
6288
}
6389
}

0 commit comments

Comments
 (0)