|
7 | 7 | import json |
8 | 8 | import codecs |
9 | 9 | from datetime import datetime |
10 | | -from collections import defaultdict |
| 10 | +from collections import defaultdict, Counter |
11 | 11 | from configparser import ConfigParser |
12 | 12 |
|
13 | 13 | import yaml |
14 | 14 | import click |
15 | 15 | from inoreader import InoreaderClient |
16 | 16 | from inoreader.filter import get_filter |
| 17 | +from inoreader.sim import sim_of, InvIndex |
17 | 18 |
|
18 | 19 |
|
19 | 20 | APPID_ENV_NAME = 'INOREADER_APP_ID' |
@@ -332,5 +333,34 @@ def fetch_articles(outfile, stream_id, out_format): |
332 | 333 | fout.close() |
333 | 334 |
|
334 | 335 |
|
@main.command()
@click.option("-f", "--folder", help="Folder you want to deduplicate")
@click.option("-t", "--thresh", type=float, default=0.8,
              help="Minimum similarity score")
def dedupe(folder, thresh):
    """Deduplicate articles"""
    # NOTE: the docstring above is kept short on purpose -- click prints it as
    # the command's help text, so implementation notes live in comments.
    #
    # folder: passed straight to ``client.fetch_unread(folder=...)``.
    # thresh: an unread article is reported as a duplicate when its best
    #         title-similarity score against the indexed titles is >= thresh.
    #
    # Articles flagged as duplicates are collected and handed to
    # ``apply_action(..., 'mark_as_read', None)`` in one call at the end.
    client = get_client()
    matched_articles, index = [], InvIndex()
    for article in client.fetch_unread(folder=folder):
        # Score this title against the candidates the index returns for it
        # (k=10 is forwarded to ``retrieve``; presumably a result cap).
        sims = Counter()
        for docid, doc, _ in index.retrieve(article.title, k=10):
            if docid == article.id:
                # Never compare an article with its own index entry.
                continue
            sims[doc] = sim_of(doc, article.title, method='cosine', term='char', ngram_range=(2, 3))

        # ``most_common(1)`` yields the single best (title, score) pair, or an
        # empty list when there was nothing to compare against.
        best = sims.most_common(1)
        if best and best[0][1] >= thresh:
            top_doc = best[0][0]
            print("article 「{}」 is duplicate with -> 「{}」".format(
                article.title, top_doc
            ))
            matched_articles.append(article)
            # Duplicates are not indexed; only first-seen articles serve as
            # comparison targets for later ones.
            continue

        index.add_doc(article)

    apply_action(matched_articles, client, 'mark_as_read', None)
| 363 | + |
| 364 | + |
335 | 365 | if __name__ == '__main__': |
336 | 366 | main() |
0 commit comments