Skip to content

Commit 0ca3c34

Browse files
committed
optimize command fetch-starred
1 parent 1cf4668 commit 0ca3c34

File tree

1 file changed

+97
-35
lines changed

1 file changed

+97
-35
lines changed

inoreader/main.py

Lines changed: 97 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from __future__ import print_function, unicode_literals
33

44
import os
5+
import re
56
import csv
67
import sys
78
import json
@@ -10,6 +11,7 @@
1011
import threading
1112
from queue import Queue
1213
from uuid import uuid4
14+
from operator import itemgetter
1315
from functools import partial, wraps
1416
from logging.config import dictConfig
1517
from collections import defaultdict, Counter
@@ -26,6 +28,7 @@
2628
from inoreader.exception import NotLoginError, APIError
2729
from inoreader.config import InoreaderConfigManager
2830
from inoreader.consts import DEFAULT_APPID, DEFAULT_APPKEY
31+
from inoreader.utils import download_image
2932

3033

3134
APPID_ENV_NAME = 'INOREADER_APP_ID'
@@ -459,59 +462,118 @@ def dedupe(folder, thresh):
459462
apply_action(matched_articles, client, 'mark_as_read', None)
460463

461464

462-
@main.command()
465+
# Matches a Markdown image reference such as ![alt](url); alt and url must both be non-empty.
_MARKDOWN_IMAGE_RE = re.compile(r'!\[(?P<alt>[^\[\]]+)\]\((?P<url>[^\(\)]+)\)')

# Extension appended to the per-article file for every non-csv output format.
_ARTICLE_EXTENSIONS = {'json': '.json', 'markdown': '.md', 'org-mode': '.org'}


def _article_filename(title, out_format):
    """Derive the file name used to store one article from its title."""
    filename = re.sub(r'\s+', '_', title or '')
    filename = re.sub(r'[\[\]\(\)()]', '_', filename)
    filename = re.sub(r'[“”\'"]', '', filename)
    # A path separator in the title would make os.path.join() point outside
    # `outdir` (or at a sub-directory that does not exist), so neutralise it.
    filename = re.sub(r'[\\/]', '_', filename)
    if not filename:
        # Nothing usable left of the title: fall back to a random name so the
        # article is not written to a bare ".json"/".md"/".org" file.
        filename = str(uuid4()).replace('-', '')
    return filename + _ARTICLE_EXTENSIONS.get(out_format, '')


def _localize_images(text, outdir, client, url_to_image):
    """Download the images referenced in `text` and return the rewritten text.

    `url_to_image` maps an image URL to the value `download_image` returned
    for it; it is updated in place so that an image shared by several
    articles is downloaded only once.
    """
    localized = text
    for match in _MARKDOWN_IMAGE_RE.finditer(text):
        image_content = match.group(0)
        image_alt = match.group('alt')
        image_url = match.group('url')

        if image_url not in url_to_image:
            if re.search(r'[\?\!\/=\&]', image_alt):
                # The alt text contains characters that do not belong in a
                # file name, so use a random name instead.
                image_filename = str(uuid4()).replace('-', '')
            else:
                # Use the alt text, minus a trailing lowercase extension.
                image_filename = re.sub(r'\.[a-z]+$', '', image_alt)

            return_image_file = download_image(
                image_url,
                outdir,
                image_filename,
                proxies=client.proxies
            )
            if not return_image_file:
                # download_image returned a falsy value: keep the original
                # remote reference untouched.
                continue

            LOGGER.info('Download image as "%s" from "%s"', return_image_file, image_url)
            url_to_image[image_url] = return_image_file

        localized = localized.replace(
            image_content,
            '![{}]({})'.format(image_alt, url_to_image[image_url])
        )
    return localized


def _write_article(fout, out_format, title, text, link):
    """Write one article to the open file `fout` in the requested format."""
    if out_format == 'json':
        json.dump(
            {'title': title, 'content': text, 'url': link},
            fout, ensure_ascii=False, indent=4
        )
        return

    # Articles without a link simply get no LINK line; concatenating a
    # missing link into the header would raise TypeError.
    link_block = ('LINK: ' + link + '\n\n') if link else ''
    if out_format == 'markdown':
        print(title + '\n=====\n\n' + link_block, file=fout)
        # Put every image on a line of its own.
        text = re.sub(r'!\[([^\[\]]+)\]\(([^\(\)]+)\)', r'\n![\1](\2)\n', text)
        print(text + '\n', file=fout)
    elif out_format == 'org-mode':
        print('#+TITLE: ' + title + '\n\n' + link_block, file=fout)
        # Images first (they would otherwise be caught by the link pattern),
        # then ordinary Markdown links, both converted to org-mode syntax.
        text = re.sub(r'!\[([^\[\]]+)\]\(([^\(\)]+)\)', r'\n[[file:\2][\1]]\n', text)
        text = re.sub(r'\[([^\[\]]+)\]\(([^\(\)]+)\)', r'[[\2][\1]]', text)
        print(text + '\n', file=fout)


@main.command("fetch-starred")
@click.option("-f", "--folder", help='Folder which articles belong to')
@click.option("-t", "--tags", help="Tag(s) for filtering, seprate with comma")
@click.option("-o", "--outfile",
              help="Filename to save articles, required when output format is `csv`")
@click.option("-d", "--outdir",
              help="Directory to save articles, required when output format is not `csv`")
@click.option("-l", "--limit", type=int)
@click.option("--save-image", is_flag=True)
@click.option("--out-format",
              type=click.Choice(['json', 'csv', 'markdown', 'org-mode']),
              default='json',
              help='Format of output file, default: json')
@catch_error
def fetch_starred(folder, tags, outfile, outdir, limit, save_image, out_format):
    """Fetch starred articles"""
    # NOTE: the docstring above is what click prints as the command help, so
    # it is kept short; the details live in this comment instead.
    #
    # folder / tags: forwarded to client.fetch_starred() as filters; `tags`
    #     is a comma separated string.
    # outfile: destination file, mandatory for the `csv` format (all articles
    #     go into this single file as link, title, text rows).
    # outdir: destination directory, mandatory for every other format (one
    #     file per article, named after the article title).
    # limit: stop after this many articles; no limit when falsy.
    # save_image: download the images referenced by each article into
    #     `outdir` and point the article at the downloaded files.
    #
    # Returns -1 when the required output option is missing, None otherwise.
    client = get_client()

    if out_format == 'csv' and not outfile:
        click.secho("`outfile` is required!", fg="red")
        return -1
    elif out_format != 'csv' and not outdir:
        click.secho("`outdir` is required!", fg="red")
        return -1

    fout = None
    writer = None
    if out_format == 'csv':
        fout = codecs.open(outfile, mode='w', encoding='utf-8')
        writer = csv.writer(fout, delimiter=',', quoting=csv.QUOTE_ALL)
    elif not os.path.exists(outdir):
        os.makedirs(outdir)

    tag_list = [] if not tags else tags.split(',')
    url_to_image = {}
    fetched_count = 0
    try:
        for article in client.fetch_starred(folder=folder, tags=tag_list, limit=limit):
            if limit and fetched_count >= limit:
                break

            fetched_count += 1
            title = article.title
            text = article.text
            link = article.link
            if out_format == 'csv':
                writer.writerow([link, title, text])
                continue

            filename = _article_filename(title, out_format)
            if save_image:
                text = _localize_images(text, outdir, client, url_to_image)

            # Explicit UTF-8 (as for the csv file) so non-ASCII content does
            # not depend on the platform's default encoding.
            article_path = os.path.join(outdir, filename)
            with codecs.open(article_path, mode='w', encoding='utf-8') as article_file:
                _write_article(article_file, out_format, title, text, link)

            LOGGER.info('saved article "%s" in directory "%s"', title, outdir)
    finally:
        # Close the csv file even when fetching or writing fails midway.
        if fout is not None:
            fout.close()

    if out_format == 'csv':
        LOGGER.info("fetched %d articles and saved them in %s", fetched_count, outfile)
    else:
        LOGGER.info("fetched %d articles and saved them in %s", fetched_count, outdir)
515577

516578

517579
if __name__ == '__main__':

0 commit comments

Comments
 (0)