66import sys
77import json
88import codecs
9- from datetime import datetime
10- from collections import defaultdict
9+ import logging
10+ from logging .config import dictConfig
11+ from collections import defaultdict , Counter
1112from configparser import ConfigParser
1213
1314import yaml
1415import click
1516from inoreader import InoreaderClient
1617from inoreader .filter import get_filter
18+ from inoreader .sim import sim_of , InvIndex
1719
1820
1921APPID_ENV_NAME = 'INOREADER_APP_ID'
2224ENV_NAMES = [APPID_ENV_NAME , APPKEY_ENV_NAME , TOKEN_ENV_NAME ]
2325
# Per-user config file holding the app credentials and the auth token.
# expanduser('~') is used instead of os.environ.get('HOME') because the latter
# returns None when HOME is unset, which makes os.path.join raise TypeError
# at import time.
CONFIG_FILE = os.path.join(os.path.expanduser('~'), '.inoreader')
LOGGER = logging.getLogger(__name__)


# Console logging setup, applied once at import time.
# One stdout handler ('default') with a timestamp/file/line format is attached
# to both the '__main__' logger (this script when run directly) and the
# 'inoreader' package logger.  Only 'inoreader' keeps propagation enabled.
dictConfig(dict(
    version=1,
    formatters=dict(
        simple=dict(format='%(asctime)s - %(filename)s:%(lineno)s: %(message)s'),
    ),
    handlers=dict(
        default={
            # 'class' is a reserved word, so this entry needs a dict literal.
            'class': 'logging.StreamHandler',
            'level': 'DEBUG',
            'formatter': 'simple',
            'stream': 'ext://sys.stdout',
        },
    ),
    loggers={
        logger_name: dict(handlers=['default'], level='DEBUG', propagate=propagates)
        for logger_name, propagates in (('__main__', False), ('inoreader', True))
    },
))
2558
2659
2760def read_config ():
@@ -48,15 +81,15 @@ def get_client():
4881 config = read_config ()
4982 appid , appkey = get_appid_key (config )
5083 if not appid or not appkey :
51- print ("'appid' or 'appkey' is missing" )
84+ LOGGER . error ("'appid' or 'appkey' is missing" )
5285 sys .exit (1 )
5386
5487 token = None
5588 if config .has_section ('auth' ):
5689 token = config .get ('auth' , 'token' )
5790 token = token or os .environ .get (TOKEN_ENV_NAME )
5891 if not token :
59- print ("Please login first" )
92+ LOGGER . error ("Please login first" )
6093 sys .exit (1 )
6194
6295 userid = None
@@ -81,7 +114,7 @@ def login():
81114 password = input ("PASSWORD: " ).strip ()
82115 status = client .login (username , password )
83116 if status :
84- print ("Login as '{}'" . format ( username ) )
117+ LOGGER . info ("Login as '%s'" , username )
85118 auth_token = client .auth_token
86119 config = read_config ()
87120 if 'auth' in config :
@@ -94,9 +127,9 @@ def login():
94127 config ['user' ] = {'email' : username , 'id' : client .userinfo ()['userId' ]}
95128 with codecs .open (CONFIG_FILE , mode = 'w' , encoding = 'utf-8' ) as fconfig :
96129 config .write (fconfig )
97- print ("save token in {}, " . format ( username , CONFIG_FILE ) )
130+ LOGGER . info ("save token in config file '%s'" , CONFIG_FILE )
98131 else :
99- print ("Login failed: Wrong username or password" )
132+ LOGGER . info ("Login failed: Wrong username or password" )
100133 sys .exit (1 )
101134
102135
@@ -136,7 +169,7 @@ def fetch_unread(folder, tags, outfile, out_format):
136169 writer = csv .writer (fout , delimiter = ',' ) if out_format == 'csv' else None
137170 for idx , article in enumerate (client .fetch_unread (folder = folder , tags = tag_list )):
138171 if idx > 0 and (idx % 10 ) == 0 :
139- print ( "[{}] fetched {} articles". format ( datetime . now () , idx ) )
172+ LOGGER . info ( " fetched %d articles" , idx )
140173 title = article .title
141174 text = article .text
142175 if out_format == 'json' :
@@ -154,7 +187,7 @@ def fetch_unread(folder, tags, outfile, out_format):
154187 print ('* {}\n ' .format (title ), file = fout )
155188 print (text + '\n ' , file = fout )
156189
157- print ( "[{}] fetched {} articles and saved them in {}" . format ( datetime . now () , idx + 1 , outfile ) )
190+ LOGGER . info ( " fetched %d articles and saved them in %s" , idx + 1 , outfile )
158191
159192 fout .close ()
160193
@@ -165,23 +198,23 @@ def apply_action(articles, client, action, tags):
165198 client .add_tag (articles , tag )
166199
167200 for article in articles :
168- print ("Add tags [{} ] on article: {}" . format ( tags , article .title ) )
201+ LOGGER . info ("Add tags [%s ] on article: %s" , tags , article .title )
169202 elif action == 'mark_as_read' :
170203 client .mark_as_read (articles )
171204 for article in articles :
172- print ("Mark article as read: {}" . format ( article .title ) )
205+ LOGGER . info ("Mark article as read: %s" , article .title )
173206 elif action == 'like' :
174207 client .mark_as_liked (articles )
175208 for article in articles :
176- print ("Mark article as liked: {}" . format ( article .title ) )
209+ LOGGER . info ("Mark article as liked: %s" , article .title )
177210 elif action == 'broadcast' :
178211 client .broadcast (articles )
179212 for article in articles :
180- print ("Boradcast article: {}" . format ( article .title ) )
213+ LOGGER . info ("Boradcast article: {}" , article .title )
181214 elif action == 'star' :
182215 client .mark_as_starred (articles )
183216 for article in articles :
184- print ("Starred article: {}" . format ( article .title ) )
217+ LOGGER . info ("Starred article: {}" , article .title )
185218
186219
187220@main .command ("filter" )
@@ -241,8 +274,11 @@ def filter_articles(rules_file):
241274 matched_articles [action ['type' ]].append ((article , action ))
242275
243276 count += 1
244- print ("[{}] matched {} articles with filter: {}" .format (
245- datetime .now (), count , rule ['name' ]))
277+
278+ LOGGER .info (
279+ "matched %d articles in folder(s) %s with filter named '%s'" ,
280+ count , rule ['folders' ], rule ['name' ]
281+ )
246282
247283 for action_name in matched_articles :
248284 articles , actions = zip (* matched_articles [action_name ])
@@ -308,7 +344,7 @@ def fetch_articles(outfile, stream_id, out_format):
308344
309345 for idx , article in enumerate (client .get_stream_contents (stream_id )):
310346 if idx > 0 and (idx % 10 ) == 0 :
311- print ( "[{}] fetched {} articles". format ( datetime . now () , idx ) )
347+ LOGGER . info ( " fetched %d articles" , idx )
312348
313349 title = article .title
314350 text = article .text
@@ -327,10 +363,43 @@ def fetch_articles(outfile, stream_id, out_format):
327363 print ('* {}\n ' .format (title ), file = fout )
328364 print (text + '\n ' , file = fout )
329365
330- print ( "[{}] fetched {} articles and saved them in {}" . format ( datetime . now () , idx + 1 , outfile ) )
366+ LOGGER . info ( " fetched %d articles and saved them in %s" , idx + 1 , outfile )
331367
332368 fout .close ()
333369
334370
@main.command()
@click.option("-f", "--folder", help="Folder you want to deduplicate")
@click.option("-t", "--thresh", type=float, default=0.8,
              help="Minimum similarity score")
def dedupe(folder, thresh):
    """Deduplicate articles"""
    # NOTE: the docstring above doubles as the click help text, so the details
    # live in comments instead.
    #
    # Walks the unread articles of `folder`; every article whose title scores
    # at least `thresh` against a previously indexed title is reported as a
    # duplicate and, once the walk is finished, marked as read.
    client = get_client()
    matched_articles, index = [], InvIndex()

    # Explicit counter instead of the loop variable: it stays defined (as 0)
    # when there are no unread articles, so the final summary cannot raise.
    fetched = 0
    for article in client.fetch_unread(folder=folder):
        if fetched > 0 and (fetched % 10) == 0:
            LOGGER.info("fetched %d articles and found %d duplicate", fetched, len(matched_articles))
        fetched += 1

        # Candidate titles already in the index (k=10 is passed to the index;
        # presumably the maximum number of candidates -- confirm in InvIndex).
        related = index.retrieve(article.title, k=10)
        sims = Counter()
        for docid, doc, _ in related:
            if docid == article.id:
                # Never compare an article with itself.
                continue
            sims[doc] = sim_of(doc, article.title, method='cosine', term='char', ngram_range=(2, 3))

        if sims:
            # Single pass for the best candidate instead of max() + full sort.
            top_doc, top_score = sims.most_common(1)[0]
            if top_score >= thresh:
                print("article 「{}」 is duplicate with -> 「{}」".format(
                    article.title, top_doc
                ))
                matched_articles.append(article)
                # Duplicates are not indexed; later articles are compared
                # against the first occurrence only.
                continue

        index.add_doc(article)

    LOGGER.info("fetched %d articles and found %d duplicate", fetched, len(matched_articles))
    # Skip the client call entirely when nothing needs to be marked.
    if matched_articles:
        apply_action(matched_articles, client, 'mark_as_read', None)
403+
335404if __name__ == '__main__' :
336405 main ()
0 commit comments