Skip to content

Commit c1391ee

Browse files
authored
Merge pull request #7 from Linusp/dev
new command `dedupe`
2 parents a57f8d8 + 36b4e9b commit c1391ee

File tree

4 files changed

+265
-19
lines changed

4 files changed

+265
-19
lines changed

CHANGELOG.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,24 @@
11
# CHANGELOG
22

3+
## v0.3.0
4+
5+
Added
6+
7+
- New Class: `Subscription` in `inoreader.subscription`
8+
- New methods:
9+
- `InoreaderClient.get_subscription_list`
10+
- `InoreaderClient.get_stream_contents`
11+
12+
- New commands: `get-subscriptions`, `fetch-articles`, `dedupe`
13+
14+
15+
Changed
16+
17+
- Supported new output formats in command `fetch-unread`: `markdown` and `org-mode`
18+
- Changed command `filter`, see `example/rules.example.yaml` for details
19+
- Use `logging` instead of `print` in cli
20+
21+
322
## v0.2.1
423

524
Changed

inoreader/main.py

Lines changed: 87 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@
66
import sys
77
import json
88
import codecs
9-
from datetime import datetime
10-
from collections import defaultdict
9+
import logging
10+
from logging.config import dictConfig
11+
from collections import defaultdict, Counter
1112
from configparser import ConfigParser
1213

1314
import yaml
1415
import click
1516
from inoreader import InoreaderClient
1617
from inoreader.filter import get_filter
18+
from inoreader.sim import sim_of, InvIndex
1719

1820

1921
APPID_ENV_NAME = 'INOREADER_APP_ID'
@@ -22,6 +24,37 @@
2224
ENV_NAMES = [APPID_ENV_NAME, APPKEY_ENV_NAME, TOKEN_ENV_NAME]
2325

2426
CONFIG_FILE = os.path.join(os.environ.get('HOME'), '.inoreader')
27+
LOGGER = logging.getLogger(__name__)
28+
29+
30+
dictConfig({
31+
'version': 1,
32+
'formatters': {
33+
'simple': {
34+
'format': '%(asctime)s - %(filename)s:%(lineno)s: %(message)s',
35+
}
36+
},
37+
'handlers': {
38+
'default': {
39+
'level': 'DEBUG',
40+
'class': 'logging.StreamHandler',
41+
'formatter': 'simple',
42+
"stream": "ext://sys.stdout",
43+
},
44+
},
45+
'loggers': {
46+
'__main__': {
47+
'handlers': ['default'],
48+
'level': 'DEBUG',
49+
'propagate': False
50+
},
51+
'inoreader': {
52+
'handlers': ['default'],
53+
'level': 'DEBUG',
54+
'propagate': True
55+
},
56+
}
57+
})
2558

2659

2760
def read_config():
@@ -48,15 +81,15 @@ def get_client():
4881
config = read_config()
4982
appid, appkey = get_appid_key(config)
5083
if not appid or not appkey:
51-
print("'appid' or 'appkey' is missing")
84+
LOGGER.error("'appid' or 'appkey' is missing")
5285
sys.exit(1)
5386

5487
token = None
5588
if config.has_section('auth'):
5689
token = config.get('auth', 'token')
5790
token = token or os.environ.get(TOKEN_ENV_NAME)
5891
if not token:
59-
print("Please login first")
92+
LOGGER.error("Please login first")
6093
sys.exit(1)
6194

6295
userid = None
@@ -81,7 +114,7 @@ def login():
81114
password = input("PASSWORD: ").strip()
82115
status = client.login(username, password)
83116
if status:
84-
print("Login as '{}'".format(username))
117+
LOGGER.info("Login as '%s'", username)
85118
auth_token = client.auth_token
86119
config = read_config()
87120
if 'auth' in config:
@@ -94,9 +127,9 @@ def login():
94127
config['user'] = {'email': username, 'id': client.userinfo()['userId']}
95128
with codecs.open(CONFIG_FILE, mode='w', encoding='utf-8') as fconfig:
96129
config.write(fconfig)
97-
print("save token in {}, ".format(username, CONFIG_FILE))
130+
LOGGER.info("save token in config file '%s'", CONFIG_FILE)
98131
else:
99-
print("Login failed: Wrong username or password")
132+
LOGGER.info("Login failed: Wrong username or password")
100133
sys.exit(1)
101134

102135

@@ -136,7 +169,7 @@ def fetch_unread(folder, tags, outfile, out_format):
136169
writer = csv.writer(fout, delimiter=',') if out_format == 'csv' else None
137170
for idx, article in enumerate(client.fetch_unread(folder=folder, tags=tag_list)):
138171
if idx > 0 and (idx % 10) == 0:
139-
print("[{}] fetched {} articles".format(datetime.now(), idx))
172+
LOGGER.info("fetched %d articles", idx)
140173
title = article.title
141174
text = article.text
142175
if out_format == 'json':
@@ -154,7 +187,7 @@ def fetch_unread(folder, tags, outfile, out_format):
154187
print('* {}\n'.format(title), file=fout)
155188
print(text + '\n', file=fout)
156189

157-
print("[{}] fetched {} articles and saved them in {}".format(datetime.now(), idx + 1, outfile))
190+
LOGGER.info("fetched %d articles and saved them in %s", idx + 1, outfile)
158191

159192
fout.close()
160193

@@ -165,23 +198,23 @@ def apply_action(articles, client, action, tags):
165198
client.add_tag(articles, tag)
166199

167200
for article in articles:
168-
print("Add tags [{}] on article: {}".format(tags, article.title))
201+
LOGGER.info("Add tags [%s] on article: %s", tags, article.title)
169202
elif action == 'mark_as_read':
170203
client.mark_as_read(articles)
171204
for article in articles:
172-
print("Mark article as read: {}".format(article.title))
205+
LOGGER.info("Mark article as read: %s", article.title)
173206
elif action == 'like':
174207
client.mark_as_liked(articles)
175208
for article in articles:
176-
print("Mark article as liked: {}".format(article.title))
209+
LOGGER.info("Mark article as liked: %s", article.title)
177210
elif action == 'broadcast':
178211
client.broadcast(articles)
179212
for article in articles:
180-
print("Boradcast article: {}".format(article.title))
213+
LOGGER.info("Boradcast article: {}", article.title)
181214
elif action == 'star':
182215
client.mark_as_starred(articles)
183216
for article in articles:
184-
print("Starred article: {}".format(article.title))
217+
LOGGER.info("Starred article: {}", article.title)
185218

186219

187220
@main.command("filter")
@@ -241,8 +274,11 @@ def filter_articles(rules_file):
241274
matched_articles[action['type']].append((article, action))
242275

243276
count += 1
244-
print("[{}] matched {} articles with filter: {}".format(
245-
datetime.now(), count, rule['name']))
277+
278+
LOGGER.info(
279+
"matched %d articles in folder(s) %s with filter named '%s'",
280+
count, rule['folders'], rule['name']
281+
)
246282

247283
for action_name in matched_articles:
248284
articles, actions = zip(*matched_articles[action_name])
@@ -308,7 +344,7 @@ def fetch_articles(outfile, stream_id, out_format):
308344

309345
for idx, article in enumerate(client.get_stream_contents(stream_id)):
310346
if idx > 0 and (idx % 10) == 0:
311-
print("[{}] fetched {} articles".format(datetime.now(), idx))
347+
LOGGER.info("fetched %d articles", idx)
312348

313349
title = article.title
314350
text = article.text
@@ -327,10 +363,43 @@ def fetch_articles(outfile, stream_id, out_format):
327363
print('* {}\n'.format(title), file=fout)
328364
print(text + '\n', file=fout)
329365

330-
print("[{}] fetched {} articles and saved them in {}".format(datetime.now(), idx + 1, outfile))
366+
LOGGER.info("fetched %d articles and saved them in %s", idx + 1, outfile)
331367

332368
fout.close()
333369

334370

371+
@main.command()
@click.option("-f", "--folder", help="Folder you want to deduplicate")
@click.option("-t", "--thresh", type=float, default=0.8,
              help="Minimum similarity score")
def dedupe(folder, thresh):
    """Deduplicate articles: mark unread articles whose titles are
    near-duplicates of an earlier article as read.

    Articles are streamed from `client.fetch_unread`; each title is compared
    (char-level 2/3-gram cosine similarity) against candidates that share
    character 3-grams, and matches with score >= `thresh` are collected and
    finally marked as read via `apply_action`.
    """
    client = get_client()
    matched_articles, index = [], InvIndex()
    # fix: keep `idx` defined for the summary log below even when
    # fetch_unread yields no articles at all (previously a NameError)
    idx = -1
    for idx, article in enumerate(client.fetch_unread(folder=folder)):
        if idx > 0 and (idx % 10) == 0:
            LOGGER.info("fetched %d articles and found %d duplicate", idx, len(matched_articles))

        # candidate titles that share character 3-grams with this title
        related = index.retrieve(article.title, k=10)
        sims = Counter()
        for docid, doc, _ in related:
            if docid == article.id:
                continue
            sims[doc] = sim_of(doc, article.title, method='cosine', term='char', ngram_range=(2, 3))

        if sims and max(sims.values()) >= thresh:
            top_doc, _top_score = sims.most_common()[0]
            print("article 「{}」 is duplicate with -> 「{}」".format(
                article.title, top_doc
            ))
            matched_articles.append(article)
            # duplicates are not indexed, so later copies match the original
            continue

        index.add_doc(article)

    LOGGER.info("fetched %d articles and found %d duplicate", idx + 1, len(matched_articles))
    apply_action(matched_articles, client, 'mark_as_read', None)
402+
403+
335404
if __name__ == '__main__':
336405
main()

inoreader/sim.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import re
2+
import pickle
3+
from math import sqrt
4+
from collections import Counter
5+
from difflib import SequenceMatcher
6+
from collections import defaultdict
7+
8+
# Punctuation characters/ranges (ASCII + common CJK/fullwidth) used to drop
# grams that contain any punctuation when `ignore_punct` is set.
PUNCTS_PAT = re.compile(
    r'(?:[#\$&@.,;:!?,。!?、:;  \u3300\'`"~_\+\-\*\/\\|\\^=<>\[\]\(\)\{\}()“”‘’\s]|'
    r'[\u2000-\u206f]|'
    r'[\u3000-\u303f]|'
    r'[\uff30-\uff4f]|'
    r'[\uff00-\uff0f\uff1a-\uff20\uff3b-\uff40\uff5b-\uff65])+'
)


def make_terms(text, term, ngram_range=None, lower=True, ignore_punct=True, gram_as_tuple=False):
    """Split *text* into n-gram terms.

    Args:
        text: input string.
        term: base unit, either ``'word'`` (whitespace split) or ``'char'``.
        ngram_range: ``(min_n, max_n)`` with ``min_n < max_n``; gram sizes are
            taken from ``range(min_n, max_n)`` (``max_n`` exclusive).
            Defaults to ``(1, 2)``, i.e. unigrams only.
        lower: lowercase *text* first.
        ignore_punct: drop any gram containing a punctuation unit.
        gram_as_tuple: emit grams as tuples of units instead of joined strings.

    Returns:
        list of grams, in positional order.

    Raises:
        ValueError: for an unsupported *term* type or malformed *ngram_range*.
    """
    if lower:
        text = text.lower()
    if term == 'word':
        # term_seq = [word.strip() for word in jieba.cut(text) if word.strip()]
        term_seq = [word.strip() for word in text.split() if word.strip()]
    elif term == 'char':
        term_seq = list(re.sub(r'\s', '', text))
    else:
        # fix: was `.foramt(...)`, which raised AttributeError instead of
        # the intended ValueError
        raise ValueError("unsupported term type: {}".format(term))

    if ngram_range and not (len(ngram_range) == 2 and ngram_range[0] < ngram_range[1]):
        # fix: same `.foramt` typo as above
        raise ValueError("wrong `ngram_range`: {}".format(ngram_range))

    terms = []
    min_ngram, max_ngram = ngram_range or (1, 2)
    for idx in range(0, max(1, len(term_seq) - min_ngram + 1)):
        cur_grams = []
        for gram_level in range(min_ngram, max_ngram):
            if gram_as_tuple:
                gram = tuple(term_seq[idx:idx + gram_level])
            else:
                gram = ''.join(term_seq[idx:idx + gram_level])
            if gram not in cur_grams:
                if ignore_punct and any(PUNCTS_PAT.match(item) for item in gram):
                    pass
                else:
                    cur_grams.append(gram)
        terms.extend(cur_grams)
    return terms
47+
48+
49+
def lcs_sim(s1, s2, term='char', ngram_range=None, ngram_weights=None,
            lower=True, ignore_punct=True):
    """Similarity of two strings via difflib's SequenceMatcher ratio.

    NOTE: always compares char-level unigram sequences; the `term`,
    `ngram_range` and `ngram_weights` arguments are accepted only to keep
    the signature uniform with the other `*_sim` functions.
    """
    left = make_terms(s1, 'char', None, lower, ignore_punct)
    right = make_terms(s2, 'char', None, lower, ignore_punct)
    matcher = SequenceMatcher(a=left, b=right)
    return matcher.ratio()
54+
55+
56+
def jaccard_sim(s1, s2, term='word', ngram_range=None, ngram_weights=None,
                lower=True, ignore_punct=True):
    """Jaccard similarity over term sets of *s1* and *s2*.

    With a single gram level, returns |A ∩ B| / |A ∪ B| (1.0 when both term
    sets are empty). With a multi-level `ngram_range`, returns the weighted
    average of the per-level scores; weights default to the gram levels
    themselves, normalized to sum to 1.
    """
    single_level = not ngram_range or ngram_range[1] == ngram_range[0] + 1
    if single_level:
        left = set(make_terms(s1, term, ngram_range, lower, ignore_punct))
        right = set(make_terms(s2, term, ngram_range, lower, ignore_punct))
        if not left and not right:
            return 1.0
        return len(left & right) / len(left | right)

    raw_weights = ngram_weights or list(range(*ngram_range))
    total = sum(raw_weights)
    norm_weights = [weight / total for weight in raw_weights]
    combined = 0.0
    for weight, level in zip(norm_weights, range(*ngram_range)):
        # recurse with a single gram level per pass
        combined += weight * jaccard_sim(
            s1, s2, term=term, ngram_range=(level, level + 1),
            lower=lower, ignore_punct=ignore_punct
        )
    return combined
76+
77+
78+
def cosine_sim(s1, s2, term='word', ngram_range=None, ngram_weights=None,
               lower=True, ignore_punct=True):
    """Cosine similarity over term-frequency vectors of *s1* and *s2*.

    With a single gram level, computes dot(A, B) / (|A| * |B|); returns 1.0
    when both vectors are empty and 0.0 when exactly one is. With a
    multi-level `ngram_range`, returns the weighted average of per-level
    scores; weights default to the gram levels, normalized to sum to 1.
    """
    if not ngram_range or ngram_range[1] == ngram_range[0] + 1:
        # renamed loop variables so the `term` parameter is never shadowed
        freq_a = Counter(make_terms(s1, term, ngram_range, lower, ignore_punct))
        freq_b = Counter(make_terms(s2, term, ngram_range, lower, ignore_punct))

        norm_a = sum(cnt * cnt for cnt in freq_a.values())
        norm_b = sum(cnt * cnt for cnt in freq_b.values())
        inner_product = sum(cnt * freq_b[tok] for tok, cnt in freq_a.items())

        if norm_a == 0 and norm_b == 0:
            return 1.0
        if norm_a == 0 or norm_b == 0:
            return 0.0

        return inner_product / sqrt(norm_a * norm_b)

    raw_weights = ngram_weights or list(range(*ngram_range))
    total = sum(raw_weights)
    norm_weights = [weight / total for weight in raw_weights]
    combined = 0.0
    for weight, level in zip(norm_weights, range(*ngram_range)):
        # recurse with a single gram level per pass
        combined += weight * cosine_sim(
            s1, s2, term=term, ngram_range=(level, level + 1),
            lower=lower, ignore_punct=ignore_punct
        )
    return combined
113+
114+
115+
def sim_of(s1, s2, method='cosine', term='word', ngram_range=None, lower=True, ignore_punct=True):
    """Dispatch to a similarity function by name.

    *method* is one of ``'lcs'``, ``'jaccard'`` or ``'cosine'``; raises
    ValueError for anything else. Remaining arguments are forwarded to the
    selected ``*_sim`` function.
    """
    dispatch = {
        'lcs': lcs_sim,
        'jaccard': jaccard_sim,
        'cosine': cosine_sim,
    }
    if method not in dispatch:
        raise ValueError("unsupported method: {}".format(method))

    sim_func = dispatch[method]
    return sim_func(s1, s2, term=term, ngram_range=ngram_range,
                    lower=lower, ignore_punct=ignore_punct)
126+
127+
128+
class InvIndex(object):
    """Inverted index over document titles, keyed by character 3-grams.

    Documents are expected to expose ``.id`` and ``.title`` attributes
    (e.g. articles) — TODO confirm against callers.
    """

    def __init__(self):
        """build inverted index with ngram method"""
        # doc id -> title
        self._id2doc = {}
        # char 3-gram -> set of doc ids whose title contains it
        self._index = defaultdict(set)

    def add_doc(self, doc):
        """Index *doc* by its title's character 3-grams.

        Returns False (and changes nothing) if the doc id is already
        indexed, True otherwise.
        """
        if doc.id in self._id2doc:
            return False

        self._id2doc[doc.id] = doc.title
        terms = set(make_terms(doc.title, 'char', (3, 4)))
        for term in terms:
            self._index[term].add(doc.id)

        return True

    def retrieve(self, query, k=10):
        """Return up to *k* ``(doc_id, title, score)`` tuples for *query*.

        The score is the number of character 3-grams shared between the
        query and the indexed title; results come in decreasing score order.
        """
        related = Counter()
        terms = set(make_terms(query, 'char', (3, 4)))
        for term in terms:
            # .get avoids growing the defaultdict on unseen terms
            for qid in self._index.get(term, []):
                related[qid] += 1

        return [(idx, self._id2doc[idx], score) for idx, score in related.most_common(k)]

    def save(self, fname):
        """Persist the index to *fname* with pickle."""
        # fix: context manager guarantees the file handle is closed
        with open(fname, 'wb') as fout:
            pickle.dump((self._id2doc, self._index), fout)

    def load(self, fname):
        """Load an index previously written by `save`.

        NOTE: pickle is unsafe on untrusted files — only load files this
        process (or a trusted one) produced.
        """
        with open(fname, 'rb') as fin:
            self._id2doc, self._index = pickle.load(fin)

0 commit comments

Comments
 (0)