Skip to content

Commit fbe0ca4

Browse files
authored
chore: Update plots for Nov 2025 crawl (CC-MAIN-2025-47) (#23)
* Update plots for Nov 2025 crawl (CC-MAIN-2025-47) Signed-off-by: malteos <git@i.mieo.de> * Adding temp files to .gitignore Signed-off-by: malteos <git@i.mieo.de> --------- Signed-off-by: malteos <git@i.mieo.de>
1 parent 1480a95 commit fbe0ca4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+5360
-4874
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ themes/
109109

110110
# crawl statistics files
111111
stats/*.gz
112+
stats/crawls.txt
113+
stats/excerpt/
112114

113115
# generated CSV data
114116
data/

_config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
title: Statistics of Common Crawl Monthly Archives
22
description: Number of pages, distribution of top-level domains, crawl overlaps, etc. - basic metrics about Common Crawl Monthly Crawl Archives
33
repository: commoncrawl/cc-crawl-statistics
4-
latest_crawl: CC-MAIN-2025-43
4+
latest_crawl: CC-MAIN-2025-47
55

66
show_navigation: True
77
navlist:

plots/charsets-top-100.html

Lines changed: 46 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
<thead>
33
<tr style="text-align: right;">
44
<th>crawl</th>
5-
<th>CC-MAIN-2025-33</th>
65
<th>CC-MAIN-2025-38</th>
76
<th>CC-MAIN-2025-43</th>
7+
<th>CC-MAIN-2025-47</th>
88
</tr>
99
<tr>
1010
<th>charset</th>
@@ -18,97 +18,97 @@
1818
<th>&lt;other&gt;</th>
1919
<td>0.0000</td>
2020
<td>0.0000</td>
21-
<td>0.0000</td>
21+
<td>0.0001</td>
2222
</tr>
2323
<tr>
2424
<th>&lt;unknown&gt;</th>
25-
<td>1.5070</td>
2625
<td>1.4892</td>
2726
<td>1.4313</td>
27+
<td>1.7191</td>
2828
</tr>
2929
<tr>
3030
<th>Big5</th>
31-
<td>0.0445</td>
3231
<td>0.0467</td>
3332
<td>0.0430</td>
33+
<td>0.0464</td>
3434
</tr>
3535
<tr>
3636
<th>Big5-HKSCS</th>
3737
<td>0.0001</td>
3838
<td>0.0001</td>
39-
<td>0.0001</td>
39+
<td>0.0000</td>
4040
</tr>
4141
<tr>
4242
<th>EUC-JP</th>
43-
<td>0.1225</td>
4443
<td>0.1254</td>
4544
<td>0.1202</td>
45+
<td>0.1345</td>
4646
</tr>
4747
<tr>
4848
<th>EUC-KR</th>
49-
<td>0.0787</td>
5049
<td>0.0777</td>
5150
<td>0.0702</td>
51+
<td>0.0783</td>
5252
</tr>
5353
<tr>
5454
<th>GB18030</th>
55-
<td>0.0135</td>
5655
<td>0.0146</td>
5756
<td>0.0130</td>
57+
<td>0.0160</td>
5858
</tr>
5959
<tr>
6060
<th>GB2312</th>
61-
<td>0.1913</td>
6261
<td>0.1972</td>
6362
<td>0.2100</td>
63+
<td>0.2491</td>
6464
</tr>
6565
<tr>
6666
<th>GBK</th>
67-
<td>0.0963</td>
6867
<td>0.1001</td>
6968
<td>0.0968</td>
69+
<td>0.1042</td>
7070
</tr>
7171
<tr>
7272
<th>IBM420</th>
73-
<td>0.0036</td>
7473
<td>0.0034</td>
7574
<td>0.0033</td>
75+
<td>0.0038</td>
7676
</tr>
7777
<tr>
7878
<th>IBM424</th>
7979
<td>0.0010</td>
80-
<td>0.0010</td>
8180
<td>0.0015</td>
81+
<td>0.0014</td>
8282
</tr>
8383
<tr>
8484
<th>IBM500</th>
8585
<td>0.0008</td>
86-
<td>0.0008</td>
8786
<td>0.0010</td>
87+
<td>0.0012</td>
8888
</tr>
8989
<tr>
9090
<th>IBM855</th>
9191
<td>0.0000</td>
9292
<td>0.0000</td>
93-
<td>0.0000</td>
93+
<td>NaN</td>
9494
</tr>
9595
<tr>
9696
<th>IBM866</th>
97-
<td>0.0002</td>
9897
<td>0.0003</td>
9998
<td>0.0002</td>
99+
<td>0.0002</td>
100100
</tr>
101101
<tr>
102102
<th>ISO-2022-JP</th>
103-
<td>0.0010</td>
104103
<td>0.0008</td>
105104
<td>0.0009</td>
105+
<td>0.0011</td>
106106
</tr>
107107
<tr>
108108
<th>ISO-8859-1</th>
109-
<td>5.2115</td>
110109
<td>5.6088</td>
111110
<td>5.4660</td>
111+
<td>5.7471</td>
112112
</tr>
113113
<tr>
114114
<th>ISO-8859-13</th>
@@ -118,39 +118,39 @@
118118
</tr>
119119
<tr>
120120
<th>ISO-8859-15</th>
121-
<td>0.0473</td>
122121
<td>0.0444</td>
123122
<td>0.0403</td>
123+
<td>0.0449</td>
124124
</tr>
125125
<tr>
126126
<th>ISO-8859-16</th>
127-
<td>0.0001</td>
127+
<td>0.0002</td>
128128
<td>0.0002</td>
129129
<td>0.0002</td>
130130
</tr>
131131
<tr>
132132
<th>ISO-8859-2</th>
133-
<td>0.0863</td>
134133
<td>0.0788</td>
135134
<td>0.0811</td>
135+
<td>0.0888</td>
136136
</tr>
137137
<tr>
138138
<th>ISO-8859-3</th>
139-
<td>0.0002</td>
140139
<td>0.0003</td>
141140
<td>0.0003</td>
141+
<td>0.0004</td>
142142
</tr>
143143
<tr>
144144
<th>ISO-8859-4</th>
145-
<td>0.0005</td>
146145
<td>0.0004</td>
147146
<td>0.0006</td>
147+
<td>0.0007</td>
148148
</tr>
149149
<tr>
150150
<th>ISO-8859-5</th>
151151
<td>0.0015</td>
152-
<td>0.0015</td>
153152
<td>0.0010</td>
153+
<td>0.0012</td>
154154
</tr>
155155
<tr>
156156
<th>ISO-8859-6</th>
@@ -160,27 +160,27 @@
160160
</tr>
161161
<tr>
162162
<th>ISO-8859-7</th>
163-
<td>0.0049</td>
164163
<td>0.0044</td>
165164
<td>0.0044</td>
165+
<td>0.0045</td>
166166
</tr>
167167
<tr>
168168
<th>ISO-8859-8</th>
169-
<td>0.0007</td>
170169
<td>0.0006</td>
171170
<td>0.0006</td>
171+
<td>0.0007</td>
172172
</tr>
173173
<tr>
174174
<th>ISO-8859-9</th>
175-
<td>0.0200</td>
176175
<td>0.0231</td>
177176
<td>0.0212</td>
177+
<td>0.0219</td>
178178
</tr>
179179
<tr>
180180
<th>KOI8-R</th>
181-
<td>0.0068</td>
182181
<td>0.0061</td>
183182
<td>0.0065</td>
183+
<td>0.0070</td>
184184
</tr>
185185
<tr>
186186
<th>KOI8-U</th>
@@ -190,105 +190,105 @@
190190
</tr>
191191
<tr>
192192
<th>Shift_JIS</th>
193-
<td>0.1573</td>
194193
<td>0.1568</td>
195194
<td>0.1460</td>
195+
<td>0.1550</td>
196196
</tr>
197197
<tr>
198198
<th>TIS-620</th>
199-
<td>0.0048</td>
200199
<td>0.0053</td>
201200
<td>0.0047</td>
201+
<td>0.0041</td>
202202
</tr>
203203
<tr>
204204
<th>US-ASCII</th>
205-
<td>0.0201</td>
206205
<td>0.0192</td>
207206
<td>0.0153</td>
207+
<td>0.0184</td>
208208
</tr>
209209
<tr>
210210
<th>UTF-16</th>
211-
<td>0.0047</td>
212211
<td>0.0044</td>
213212
<td>0.0043</td>
213+
<td>0.0048</td>
214214
</tr>
215215
<tr>
216216
<th>UTF-16BE</th>
217217
<td>0.0002</td>
218218
<td>0.0002</td>
219-
<td>0.0002</td>
219+
<td>0.0004</td>
220220
</tr>
221221
<tr>
222222
<th>UTF-16LE</th>
223223
<td>0.0010</td>
224-
<td>0.0010</td>
225224
<td>0.0009</td>
225+
<td>0.0011</td>
226226
</tr>
227227
<tr>
228228
<th>UTF-32</th>
229229
<td>0.0000</td>
230230
<td>0.0000</td>
231-
<td>0.0000</td>
231+
<td>0.0001</td>
232232
</tr>
233233
<tr>
234234
<th>UTF-32LE</th>
235235
<td>0.0003</td>
236-
<td>0.0003</td>
236+
<td>0.0002</td>
237237
<td>0.0002</td>
238238
</tr>
239239
<tr>
240240
<th>UTF-8</th>
241-
<td>91.6428</td>
242241
<td>91.2632</td>
243242
<td>91.5255</td>
243+
<td>90.7888</td>
244244
</tr>
245245
<tr>
246246
<th>windows-1250</th>
247-
<td>0.0668</td>
248247
<td>0.0620</td>
249248
<td>0.0655</td>
249+
<td>0.0706</td>
250250
</tr>
251251
<tr>
252252
<th>windows-1251</th>
253-
<td>0.4726</td>
254253
<td>0.4674</td>
255254
<td>0.4347</td>
255+
<td>0.4767</td>
256256
</tr>
257257
<tr>
258258
<th>windows-1252</th>
259-
<td>0.1241</td>
260259
<td>0.1291</td>
261260
<td>0.1281</td>
261+
<td>0.1405</td>
262262
</tr>
263263
<tr>
264264
<th>windows-1253</th>
265-
<td>0.0023</td>
266265
<td>0.0017</td>
267266
<td>0.0019</td>
267+
<td>0.0021</td>
268268
</tr>
269269
<tr>
270270
<th>windows-1254</th>
271-
<td>0.0097</td>
272271
<td>0.0108</td>
273272
<td>0.0102</td>
273+
<td>0.0108</td>
274274
</tr>
275275
<tr>
276276
<th>windows-1255</th>
277-
<td>0.0062</td>
278277
<td>0.0046</td>
279278
<td>0.0060</td>
279+
<td>0.0065</td>
280280
</tr>
281281
<tr>
282282
<th>windows-1256</th>
283-
<td>0.0330</td>
284283
<td>0.0337</td>
285284
<td>0.0297</td>
285+
<td>0.0322</td>
286286
</tr>
287287
<tr>
288288
<th>windows-1257</th>
289289
<td>0.0063</td>
290-
<td>0.0063</td>
291290
<td>0.0059</td>
291+
<td>0.0067</td>
292292
</tr>
293293
<tr>
294294
<th>windows-31j</th>
@@ -298,15 +298,15 @@
298298
</tr>
299299
<tr>
300300
<th>x-iso-8859-11</th>
301-
<td>0.0000</td>
301+
<td>0.0001</td>
302302
<td>0.0001</td>
303303
<td>0.0001</td>
304304
</tr>
305305
<tr>
306306
<th>x-windows-874</th>
307-
<td>0.0068</td>
308307
<td>0.0073</td>
309308
<td>0.0068</td>
309+
<td>0.0075</td>
310310
</tr>
311311
<tr>
312312
<th>x-windows-949</th>

0 commit comments

Comments
 (0)