Skip to content

Commit c1391ee

Browse files
authored
Merge pull request #7 from Linusp/dev
new command `dedupe`
2 parents a57f8d8 + 36b4e9b commit c1391ee

File tree

4 files changed

+265
-19
lines changed

4 files changed

+265
-19
lines changed

CHANGELOG.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,24 @@
11
# CHANGELOG
22

3+
## v0.3.0
4+
5+
Added
6+
7+
- New Class: `Subscription` in `inoreader.subscription`
8+
- New methods:
9+
- `InoreaderClient.get_subscription_list`
10+
- `InoreaderClient.get_stream_contents`
11+
12+
- New commands: `get-subscriptions`, `fetch-articles`, `dedupe`
13+
14+
15+
Changed
16+
17+
- Supported new output formats in command `fetch-unread`: `markdown` and `org-mode`
18+
- Changed command `filter`, see `example/rules.example.yaml` for details
19+
- Use `logging` instead of `print` in cli
20+
21+
322
## v0.2.1
423

524
Changed

inoreader/main.py

Lines changed: 87 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@
66
import sys
77
import json
88
import codecs
9-
from datetime import datetime
10-
from collections import defaultdict
9+
import logging
10+
from logging.config import dictConfig
11+
from collections import defaultdict, Counter
1112
from configparser import ConfigParser
1213

1314
import yaml
1415
import click
1516
from inoreader import InoreaderClient
1617
from inoreader.filter import get_filter
18+
from inoreader.sim import sim_of, InvIndex
1719

1820

1921
APPID_ENV_NAME = 'INOREADER_APP_ID'
@@ -22,6 +24,37 @@
2224
ENV_NAMES = [APPID_ENV_NAME, APPKEY_ENV_NAME, TOKEN_ENV_NAME]
2325

2426
CONFIG_FILE = os.path.join(os.environ.get('HOME'), '.inoreader')
27+
LOGGER = logging.getLogger(__name__)
28+
29+
30+
dictConfig({
31+
'version': 1,
32+
'formatters': {
33+
'simple': {
34+
'format': '%(asctime)s - %(filename)s:%(lineno)s: %(message)s',
35+
}
36+
},
37+
'handlers': {
38+
'default': {
39+
'level': 'DEBUG',
40+
'class': 'logging.StreamHandler',
41+
'formatter': 'simple',
42+
"stream": "ext://sys.stdout",
43+
},
44+
},
45+
'loggers': {
46+
'__main__': {
47+
'handlers': ['default'],
48+
'level': 'DEBUG',
49+
'propagate': False
50+
},
51+
'inoreader': {
52+
'handlers': ['default'],
53+
'level': 'DEBUG',
54+
'propagate': True
55+
},
56+
}
57+
})
2558

2659

2760
def read_config():
@@ -48,15 +81,15 @@ def get_client():
4881
config = read_config()
4982
appid, appkey = get_appid_key(config)
5083
if not appid or not appkey:
51-
print("'appid' or 'appkey' is missing")
84+
LOGGER.error("'appid' or 'appkey' is missing")
5285
sys.exit(1)
5386

5487
token = None
5588
if config.has_section('auth'):
5689
token = config.get('auth', 'token')
5790
token = token or os.environ.get(TOKEN_ENV_NAME)
5891
if not token:
59-
print("Please login first")
92+
LOGGER.error("Please login first")
6093
sys.exit(1)
6194

6295
userid = None
@@ -81,7 +114,7 @@ def login():
81114
password = input("PASSWORD: ").strip()
82115
status = client.login(username, password)
83116
if status:
84-
print("Login as '{}'".format(username))
117+
LOGGER.info("Login as '%s'", username)
85118
auth_token = client.auth_token
86119
config = read_config()
87120
if 'auth' in config:
@@ -94,9 +127,9 @@ def login():
94127
config['user'] = {'email': username, 'id': client.userinfo()['userId']}
95128
with codecs.open(CONFIG_FILE, mode='w', encoding='utf-8') as fconfig:
96129
config.write(fconfig)
97-
print("save token in {}, ".format(username, CONFIG_FILE))
130+
LOGGER.info("save token in config file '%s'", CONFIG_FILE)
98131
else:
99-
print("Login failed: Wrong username or password")
132+
LOGGER.info("Login failed: Wrong username or password")
100133
sys.exit(1)
101134

102135

@@ -136,7 +169,7 @@ def fetch_unread(folder, tags, outfile, out_format):
136169
writer = csv.writer(fout, delimiter=',') if out_format == 'csv' else None
137170
for idx, article in enumerate(client.fetch_unread(folder=folder, tags=tag_list)):
138171
if idx > 0 and (idx % 10) == 0:
139-
print("[{}] fetched {} articles".format(datetime.now(), idx))
172+
LOGGER.info("fetched %d articles", idx)
140173
title = article.title
141174
text = article.text
142175
if out_format == 'json':
@@ -154,7 +187,7 @@ def fetch_unread(folder, tags, outfile, out_format):
154187
print('* {}\n'.format(title), file=fout)
155188
print(text + '\n', file=fout)
156189

157-
print("[{}] fetched {} articles and saved them in {}".format(datetime.now(), idx + 1, outfile))
190+
LOGGER.info("fetched %d articles and saved them in %s", idx + 1, outfile)
158191

159192
fout.close()
160193

@@ -165,23 +198,23 @@ def apply_action(articles, client, action, tags):
165198
client.add_tag(articles, tag)
166199

167200
for article in articles:
168-
print("Add tags [{}] on article: {}".format(tags, article.title))
201+
LOGGER.info("Add tags [%s] on article: %s", tags, article.title)
169202
elif action == 'mark_as_read':
170203
client.mark_as_read(articles)
171204
for article in articles:
172-
print("Mark article as read: {}".format(article.title))
205+
LOGGER.info("Mark article as read: %s", article.title)
173206
elif action == 'like':
174207
client.mark_as_liked(articles)
175208
for article in articles:
176-
print("Mark article as liked: {}".format(article.title))
209+
LOGGER.info("Mark article as liked: %s", article.title)
177210
elif action == 'broadcast':
178211
client.broadcast(articles)
179212
for article in articles:
180-
print("Boradcast article: {}".format(article.title))
213+
LOGGER.info("Boradcast article: {}", article.title)
181214
elif action == 'star':
182215
client.mark_as_starred(articles)
183216
for article in articles:
184-
print("Starred article: {}".format(article.title))
217+
LOGGER.info("Starred article: {}", article.title)
185218

186219

187220
@main.command("filter")
@@ -241,8 +274,11 @@ def filter_articles(rules_file):
241274
matched_articles[action['type']].append((article, action))
242275

243276
count += 1
244-
print("[{}] matched {} articles with filter: {}".format(
245-
datetime.now(), count, rule['name']))
277+
278+
LOGGER.info(
279+
"matched %d articles in folder(s) %s with filter named '%s'",
280+
count, rule['folders'], rule['name']
281+
)
246282

247283
for action_name in matched_articles:
248284
articles, actions = zip(*matched_articles[action_name])
@@ -308,7 +344,7 @@ def fetch_articles(outfile, stream_id, out_format):
308344

309345
for idx, article in enumerate(client.get_stream_contents(stream_id)):
310346
if idx > 0 and (idx % 10) == 0:
311-
print("[{}] fetched {} articles".format(datetime.now(), idx))
347+
LOGGER.info("fetched %d articles", idx)
312348

313349
title = article.title
314350
text = article.text
@@ -327,10 +363,43 @@ def fetch_articles(outfile, stream_id, out_format):
327363
print('* {}\n'.format(title), file=fout)
328364
print(text + '\n', file=fout)
329365

330-
print("[{}] fetched {} articles and saved them in {}".format(datetime.now(), idx + 1, outfile))
366+
LOGGER.info("fetched %d articles and saved them in %s", idx + 1, outfile)
331367

332368
fout.close()
333369

334370

371+
@main.command()
@click.option("-f", "--folder", help="Folder you want to deduplicate")
@click.option("-t", "--thresh", type=float, default=0.8,
              help="Minimum similarity score")
def dedupe(folder, thresh):
    """Deduplicate articles: mark unread articles whose titles are
    near-duplicates of an earlier article as read.

    Articles are streamed from `client.fetch_unread`; each title is compared
    (char-level 2/3-gram cosine similarity) against candidates that share
    character 3-grams, and matches with score >= `thresh` are collected and
    finally marked as read via `apply_action`.
    """
    client = get_client()
    matched_articles, index = [], InvIndex()
    # fix: keep `idx` defined for the summary log below even when
    # fetch_unread yields no articles at all (previously a NameError)
    idx = -1
    for idx, article in enumerate(client.fetch_unread(folder=folder)):
        if idx > 0 and (idx % 10) == 0:
            LOGGER.info("fetched %d articles and found %d duplicate", idx, len(matched_articles))

        # candidate titles that share character 3-grams with this title
        related = index.retrieve(article.title, k=10)
        sims = Counter()
        for docid, doc, _ in related:
            if docid == article.id:
                continue
            sims[doc] = sim_of(doc, article.title, method='cosine', term='char', ngram_range=(2, 3))

        if sims and max(sims.values()) >= thresh:
            top_doc, _top_score = sims.most_common()[0]
            print("article 「{}」 is duplicate with -> 「{}」".format(
                article.title, top_doc
            ))
            matched_articles.append(article)
            # duplicates are not indexed, so later copies match the original
            continue

        index.add_doc(article)

    LOGGER.info("fetched %d articles and found %d duplicate", idx + 1, len(matched_articles))
    apply_action(matched_articles, client, 'mark_as_read', None)
402+
403+
335404
if __name__ == '__main__':
336405
main()

inoreader/sim.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import re
2+
import pickle
3+
from math import sqrt
4+
from collections import Counter
5+
from difflib import SequenceMatcher
6+
from collections import defaultdict
7+
8+
# Punctuation characters/ranges (ASCII + common CJK/fullwidth) used to drop
# grams that contain any punctuation when `ignore_punct` is set.
PUNCTS_PAT = re.compile(
    r'(?:[#\$&@.,;:!?,。!?、:;  \u3300\'`"~_\+\-\*\/\\|\\^=<>\[\]\(\)\{\}()“”‘’\s]|'
    r'[\u2000-\u206f]|'
    r'[\u3000-\u303f]|'
    r'[\uff30-\uff4f]|'
    r'[\uff00-\uff0f\uff1a-\uff20\uff3b-\uff40\uff5b-\uff65])+'
)


def make_terms(text, term, ngram_range=None, lower=True, ignore_punct=True, gram_as_tuple=False):
    """Split *text* into n-gram terms.

    Args:
        text: input string.
        term: base unit, either ``'word'`` (whitespace split) or ``'char'``.
        ngram_range: ``(min_n, max_n)`` with ``min_n < max_n``; gram sizes are
            taken from ``range(min_n, max_n)`` (``max_n`` exclusive).
            Defaults to ``(1, 2)``, i.e. unigrams only.
        lower: lowercase *text* first.
        ignore_punct: drop any gram containing a punctuation unit.
        gram_as_tuple: emit grams as tuples of units instead of joined strings.

    Returns:
        list of grams, in positional order.

    Raises:
        ValueError: for an unsupported *term* type or malformed *ngram_range*.
    """
    if lower:
        text = text.lower()
    if term == 'word':
        # term_seq = [word.strip() for word in jieba.cut(text) if word.strip()]
        term_seq = [word.strip() for word in text.split() if word.strip()]
    elif term == 'char':
        term_seq = list(re.sub(r'\s', '', text))
    else:
        # fix: was `.foramt(...)`, which raised AttributeError instead of
        # the intended ValueError
        raise ValueError("unsupported term type: {}".format(term))

    if ngram_range and not (len(ngram_range) == 2 and ngram_range[0] < ngram_range[1]):
        # fix: same `.foramt` typo as above
        raise ValueError("wrong `ngram_range`: {}".format(ngram_range))

    terms = []
    min_ngram, max_ngram = ngram_range or (1, 2)
    for idx in range(0, max(1, len(term_seq) - min_ngram + 1)):
        cur_grams = []
        for gram_level in range(min_ngram, max_ngram):
            if gram_as_tuple:
                gram = tuple(term_seq[idx:idx + gram_level])
            else:
                gram = ''.join(term_seq[idx:idx + gram_level])
            if gram not in cur_grams:
                if ignore_punct and any(PUNCTS_PAT.match(item) for item in gram):
                    pass
                else:
                    cur_grams.append(gram)
        terms.extend(cur_grams)
    return terms
47+
48+
49+
def lcs_sim(s1, s2, term='char', ngram_range=None, ngram_weights=None,
            lower=True, ignore_punct=True):
    """Similarity of two strings via difflib's SequenceMatcher ratio.

    NOTE: always compares char-level unigram sequences; the `term`,
    `ngram_range` and `ngram_weights` arguments are accepted only to keep
    the signature uniform with the other `*_sim` functions.
    """
    left = make_terms(s1, 'char', None, lower, ignore_punct)
    right = make_terms(s2, 'char', None, lower, ignore_punct)
    matcher = SequenceMatcher(a=left, b=right)
    return matcher.ratio()
54+
55+
56+
def jaccard_sim(s1, s2, term='word', ngram_range=None, ngram_weights=None,
                lower=True, ignore_punct=True):
    """Jaccard similarity over term sets of *s1* and *s2*.

    With a single gram level, returns |A ∩ B| / |A ∪ B| (1.0 when both term
    sets are empty). With a multi-level `ngram_range`, returns the weighted
    average of the per-level scores; weights default to the gram levels
    themselves, normalized to sum to 1.
    """
    single_level = not ngram_range or ngram_range[1] == ngram_range[0] + 1
    if single_level:
        left = set(make_terms(s1, term, ngram_range, lower, ignore_punct))
        right = set(make_terms(s2, term, ngram_range, lower, ignore_punct))
        if not left and not right:
            return 1.0
        return len(left & right) / len(left | right)

    raw_weights = ngram_weights or list(range(*ngram_range))
    total = sum(raw_weights)
    norm_weights = [weight / total for weight in raw_weights]
    combined = 0.0
    for weight, level in zip(norm_weights, range(*ngram_range)):
        # recurse with a single gram level per pass
        combined += weight * jaccard_sim(
            s1, s2, term=term, ngram_range=(level, level + 1),
            lower=lower, ignore_punct=ignore_punct
        )
    return combined
76+
77+
78+
def cosine_sim(s1, s2, term='word', ngram_range=None, ngram_weights=None,
               lower=True, ignore_punct=True):
    """Cosine similarity over term-frequency vectors of *s1* and *s2*.

    With a single gram level, computes dot(A, B) / (|A| * |B|); returns 1.0
    when both vectors are empty and 0.0 when exactly one is. With a
    multi-level `ngram_range`, returns the weighted average of per-level
    scores; weights default to the gram levels, normalized to sum to 1.
    """
    if not ngram_range or ngram_range[1] == ngram_range[0] + 1:
        # renamed loop variables so the `term` parameter is never shadowed
        freq_a = Counter(make_terms(s1, term, ngram_range, lower, ignore_punct))
        freq_b = Counter(make_terms(s2, term, ngram_range, lower, ignore_punct))

        norm_a = sum(cnt * cnt for cnt in freq_a.values())
        norm_b = sum(cnt * cnt for cnt in freq_b.values())
        inner_product = sum(cnt * freq_b[tok] for tok, cnt in freq_a.items())

        if norm_a == 0 and norm_b == 0:
            return 1.0
        if norm_a == 0 or norm_b == 0:
            return 0.0

        return inner_product / sqrt(norm_a * norm_b)

    raw_weights = ngram_weights or list(range(*ngram_range))
    total = sum(raw_weights)
    norm_weights = [weight / total for weight in raw_weights]
    combined = 0.0
    for weight, level in zip(norm_weights, range(*ngram_range)):
        # recurse with a single gram level per pass
        combined += weight * cosine_sim(
            s1, s2, term=term, ngram_range=(level, level + 1),
            lower=lower, ignore_punct=ignore_punct
        )
    return combined
113+
114+
115+
def sim_of(s1, s2, method='cosine', term='word', ngram_range=None, lower=True, ignore_punct=True):
    """Dispatch to a similarity function by name.

    *method* is one of ``'lcs'``, ``'jaccard'`` or ``'cosine'``; raises
    ValueError for anything else. Remaining arguments are forwarded to the
    selected ``*_sim`` function.
    """
    dispatch = {
        'lcs': lcs_sim,
        'jaccard': jaccard_sim,
        'cosine': cosine_sim,
    }
    if method not in dispatch:
        raise ValueError("unsupported method: {}".format(method))

    sim_func = dispatch[method]
    return sim_func(s1, s2, term=term, ngram_range=ngram_range,
                    lower=lower, ignore_punct=ignore_punct)
126+
127+
128+
class InvIndex(object):
    """Inverted index over document titles, keyed by character 3-grams.

    Documents are expected to expose ``.id`` and ``.title`` attributes
    (e.g. articles) — TODO confirm against callers.
    """

    def __init__(self):
        """build inverted index with ngram method"""
        # doc id -> title
        self._id2doc = {}
        # char 3-gram -> set of doc ids whose title contains it
        self._index = defaultdict(set)

    def add_doc(self, doc):
        """Index *doc* by its title's character 3-grams.

        Returns False (and changes nothing) if the doc id is already
        indexed, True otherwise.
        """
        if doc.id in self._id2doc:
            return False

        self._id2doc[doc.id] = doc.title
        terms = set(make_terms(doc.title, 'char', (3, 4)))
        for term in terms:
            self._index[term].add(doc.id)

        return True

    def retrieve(self, query, k=10):
        """Return up to *k* ``(doc_id, title, score)`` tuples for *query*.

        The score is the number of character 3-grams shared between the
        query and the indexed title; results come in decreasing score order.
        """
        related = Counter()
        terms = set(make_terms(query, 'char', (3, 4)))
        for term in terms:
            # .get avoids growing the defaultdict on unseen terms
            for qid in self._index.get(term, []):
                related[qid] += 1

        return [(idx, self._id2doc[idx], score) for idx, score in related.most_common(k)]

    def save(self, fname):
        """Persist the index to *fname* with pickle."""
        # fix: context manager guarantees the file handle is closed
        with open(fname, 'wb') as fout:
            pickle.dump((self._id2doc, self._index), fout)

    def load(self, fname):
        """Load an index previously written by `save`.

        NOTE: pickle is unsafe on untrusted files — only load files this
        process (or a trusted one) produced.
        """
        with open(fname, 'rb') as fin:
            self._id2doc, self._index = pickle.load(fin)

0 commit comments

Comments
 (0)