Skip to content

Commit fcb6f7d

Browse files
committed
add command dedupe
1 parent 92c4a04 commit fcb6f7d

File tree

1 file changed

+31
-1
lines changed

1 file changed

+31
-1
lines changed

inoreader/main.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77
import json
88
import codecs
99
from datetime import datetime
10-
from collections import defaultdict
10+
from collections import defaultdict, Counter
1111
from configparser import ConfigParser
1212

1313
import yaml
1414
import click
1515
from inoreader import InoreaderClient
1616
from inoreader.filter import get_filter
17+
from inoreader.sim import sim_of, InvIndex
1718

1819

1920
APPID_ENV_NAME = 'INOREADER_APP_ID'
@@ -332,5 +333,34 @@ def fetch_articles(outfile, stream_id, out_format):
332333
fout.close()
333334

334335

336+
# CLI command: scan the unread articles of a folder, detect articles whose
# title is a near-duplicate of an earlier (already indexed) title, and pass
# the duplicates to apply_action with the 'mark_as_read' action.
#
# Options:
#   folder -- forwarded unchanged to client.fetch_unread(folder=...).
#   thresh -- minimum similarity score (default 0.8) for an article to be
#             reported as a duplicate of an indexed title.
#
# NOTE: the docstring below is intentionally left as-is because click uses
# it as the command's help text.
@main.command()
@click.option("-f", "--folder", help="Folder you want to deduplicate")
@click.option("-t", "--thresh", type=float, default=0.8,
              help="Minimum similarity score")
def dedupe(folder, thresh):
    """Deduplicate articles"""
    client = get_client()
    matched_articles, index = [], InvIndex()
    for article in client.fetch_unread(folder=folder):
        # Candidate titles come from the index built so far
        # (k=10 presumably caps the number of candidates -- confirm in
        # inoreader.sim.InvIndex.retrieve).
        related = index.retrieve(article.title, k=10)

        # Map each candidate title to its similarity with the current title.
        sims = Counter()
        for docid, doc, _ in related:
            if docid == article.id:
                # Never compare an article against itself.
                continue
            sims[doc] = sim_of(doc, article.title, method='cosine', term='char', ngram_range=(2, 3))

        # most_common(1) yields the single best-scoring candidate (or an
        # empty list when there were no candidates) without sorting them all.
        best = sims.most_common(1)
        if best and best[0][1] >= thresh:
            top_doc = best[0][0]
            print("article 「{}」 is duplicate with -> 「{}」".format(
                article.title, top_doc
            ))
            matched_articles.append(article)
        else:
            # Only non-duplicates are indexed, so later articles are always
            # compared against the first occurrence of a title.
            index.add_doc(article)

    apply_action(matched_articles, client, 'mark_as_read', None)
363+
364+
335365
# Script entry point: invoke the command group only when this module is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)