Skip to content

Commit 0fc7b5f

Browse files
committed
Remove confusing summary counts
The counts at the end don't tally up because collections can contain collections. They've been removed. People wanting to count things are encouraged to use the CSV and JSON outputs.
1 parent 8df2d0d commit 0fc7b5f

File tree

3 files changed

+28
-26
lines changed

3 files changed

+28
-26
lines changed

README.md

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,21 @@ To check a particular URL here's how it works:
2727

2828
```shell
2929
waybackprov https://twitter.com/EPAScottPruitt
30-
364 https://archive.org/details/focused_crawls
31-
306 https://archive.org/details/edgi_monitor
32-
151 https://archive.org/details/www3.epa.gov
33-
60 https://archive.org/details/epa.gov4
34-
47 https://archive.org/details/epa.gov5
35-
...
30+
31+
crawls collections
32+
364 https://archive.org/details/focused_crawls
33+
306 https://archive.org/details/edgi_monitor
34+
151 https://archive.org/details/www3.epa.gov
35+
60 https://archive.org/details/epa.gov4
36+
47 https://archive.org/details/epa.gov5
3637
```
3738

3839
The first column contains the number of crawls for a particular URL, and the
3940
second column contains the URL for the Internet Archive collection that added
4041
it.
4142

43+
When evaluating the counts, it's important to remember that collections can contain other collections. For example, `epa.gov4` in the example above is part of the `edgi_monitor` collection.
44+
4245
## Time
4346

4447
By default waybackprov will only look at the current year. If you would like it

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "waybackprov"
3-
version = "0.1.0"
3+
version = "0.1.1"
44
description = "Checks the provenance of a URL in the Wayback machine"
55
readme = "README.md"
66
authors = [

src/waybackprov/__init__.py

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,10 @@
77
import time
88
import codecs
99
import logging
10-
import operator
1110
import datetime
1211
import optparse
1312
import collections
1413

15-
from functools import reduce
1614
from urllib.parse import quote
1715
from urllib.request import urlopen
1816

@@ -61,12 +59,19 @@ def main():
6159
)
6260

6361
if opts.format == "text":
64-
crawls = 0
62+
# coll_urls is a dictionary where the key is a collection id and the
63+
# value is a set of URLs that have been crawled
6564
coll_urls = {}
65+
66+
# coll_counter is a Counter that counts the number of crawls that are
67+
# in a collection
6668
coll_counter = collections.Counter()
69+
6770
for crawl in crawl_data:
68-
crawls += 1
6971
coll_counter.update(crawl["collections"])
72+
73+
# a crawl can appear in multiple collections because of how
74+
# collections can contain other collections
7075
for coll in crawl["collections"]:
7176
# keep track of urls in each collection
7277
if coll not in coll_urls:
@@ -80,25 +85,19 @@ def main():
8085
)
8186
return
8287

83-
max_pos = str(len(str(coll_counter.most_common(1)[0][1])))
8488
if opts.prefix:
85-
str_format = (
86-
"%" + max_pos + "i %" + max_pos + "i https://archive.org/details/%s"
87-
)
89+
str_format = "%6s %6s %s"
90+
print(str_format % ("crawls", "urls", "collection"))
8891
else:
89-
str_format = "%" + max_pos + "i https://archive.org/details/%s"
92+
str_format = "%6s %s"
93+
print(str_format % ("crawls", "collection"))
9094

9195
for coll_id, count in coll_counter.most_common():
96+
coll_url = f"https://archive.org/details/{coll_id}"
9297
if opts.prefix:
93-
print(str_format % (count, len(coll_urls[coll_id]), coll_id))
98+
print(str_format % (count, len(coll_urls[coll_id]), coll_url))
9499
else:
95-
print(str_format % (count, coll_id))
96-
97-
print("")
98-
print("total crawls %s-%s: %s" % (opts.start, opts.end, crawls))
99-
if opts.prefix:
100-
total_urls = len(reduce(operator.or_, coll_urls.values()))
101-
print("total urls: %s" % total_urls)
100+
print(str_format % (count, coll_url))
102101

103102
elif opts.format == "json":
104103
data = list(crawl_data)
@@ -227,8 +226,8 @@ def get_json(url):
227226
reader = codecs.getreader("utf-8")
228227
return json.load(reader(resp))
229228
except Exception as e:
230-
logging.error("caught exception: %s", e)
231-
logging.info("sleeping for %s seconds", count * 10)
229+
logging.debug("caught exception: %s", e)
230+
logging.debug("sleeping for %s seconds", count * 10)
232231
time.sleep(count * 10)
233232
raise (Exception("unable to get JSON for %s", url))
234233

0 commit comments

Comments
 (0)