|
2 | 2 | from __future__ import print_function, unicode_literals |
3 | 3 |
|
4 | 4 | import os |
| 5 | +import re |
5 | 6 | import csv |
6 | 7 | import sys |
7 | 8 | import json |
|
10 | 11 | import threading |
11 | 12 | from queue import Queue |
12 | 13 | from uuid import uuid4 |
| 14 | +from operator import itemgetter |
13 | 15 | from functools import partial, wraps |
14 | 16 | from logging.config import dictConfig |
15 | 17 | from collections import defaultdict, Counter |
|
26 | 28 | from inoreader.exception import NotLoginError, APIError |
27 | 29 | from inoreader.config import InoreaderConfigManager |
28 | 30 | from inoreader.consts import DEFAULT_APPID, DEFAULT_APPKEY |
| 31 | +from inoreader.utils import download_image |
29 | 32 |
|
30 | 33 |
|
31 | 34 | APPID_ENV_NAME = 'INOREADER_APP_ID' |
@@ -459,59 +462,118 @@ def dedupe(folder, thresh): |
459 | 462 | apply_action(matched_articles, client, 'mark_as_read', None) |
460 | 463 |
|
461 | 464 |
|
462 | | -@main.command() |
| 465 | +@main.command("fetch-starred") |
463 | 466 | @click.option("-f", "--folder", help='Folder which articles belong to') |
464 | 467 | @click.option("-t", "--tags", help="Tag(s) for filtering, separate with comma")
465 | | -@click.option("-o", "--outfile", required=True, help="Filename to save articles") |
| 468 | +@click.option("-o", "--outfile", |
| 469 | + help="Filename to save articles, required when output format is `csv`") |
| 470 | +@click.option("-d", "--outdir", |
| 471 | + help="Directory to save articles, required when output format is not `csv`") |
| 472 | +@click.option("-l", "--limit", type=int) |
| 473 | +@click.option("--save-image", is_flag=True) |
466 | 474 | @click.option("--out-format", |
467 | | - type=click.Choice(['json', 'csv', 'plain', 'markdown', 'org-mode']), |
| 475 | + type=click.Choice(['json', 'csv', 'markdown', 'org-mode']), |
468 | 476 | default='json', |
469 | 477 | help='Format of output file, default: json') |
470 | 478 | @catch_error |
471 | | -def fetch_starred(folder, tags, outfile, out_format): |
| 479 | +def fetch_starred(folder, tags, outfile, outdir, limit, save_image, out_format): |
472 | 480 | """Fetch starred articles""" |
473 | 481 | client = get_client() |
474 | 482 |
|
| 483 | + if out_format == 'csv' and not outfile: |
| 484 | + click.secho("`outfile` is required!", fg="red") |
| 485 | + return -1 |
| 486 | + elif out_format != 'csv' and not outdir: |
| 487 | + click.secho("`outdir` is required!", fg="red") |
| 488 | + return -1 |
| 489 | + |
| 490 | + if out_format == 'csv': |
| 491 | + fout = codecs.open(outfile, mode='w', encoding='utf-8') |
| 492 | + writer = csv.writer(fout, delimiter=',', |
| 493 | + quoting=csv.QUOTE_ALL) if out_format == 'csv' else None |
| 494 | + elif not os.path.exists(outdir): |
| 495 | + os.makedirs(outdir) |
| 496 | + |
475 | 497 | tag_list = [] if not tags else tags.split(',') |
476 | | - fout = codecs.open(outfile, mode='w', encoding='utf-8') |
477 | | - writer = csv.writer(fout, delimiter=',') if out_format == 'csv' else None |
478 | | - for idx, article in enumerate(client.fetch_starred(folder=folder, tags=tag_list)): |
479 | | - if idx > 0 and (idx % 10) == 0: |
480 | | - LOGGER.info("fetched %d articles", idx) |
| 498 | + url_to_image = {} |
| 499 | + fetched_count = 0 |
| 500 | + for article in client.fetch_starred(folder=folder, tags=tag_list, limit=limit): |
| 501 | + if limit and fetched_count >= limit: |
| 502 | + break |
481 | 503 |
|
| 504 | + fetched_count += 1 |
482 | 505 | title = article.title |
483 | 506 | text = article.text |
484 | 507 | link = article.link |
485 | | - if out_format == 'json': |
486 | | - print(json.dumps({'title': title, 'content': text, 'url': link}, ensure_ascii=False), |
487 | | - file=fout) |
488 | | - elif out_format == 'csv': |
| 508 | + if out_format == 'csv': |
489 | 509 | writer.writerow([link, title, text]) |
490 | | - elif out_format == 'plain': |
491 | | - print('TITLE: {}'.format(title), file=fout) |
492 | | - print("LINK: {}".format(link), file=fout) |
493 | | - print("CONTENT: {}".format(text), file=fout) |
494 | | - print(file=fout) |
495 | | - elif out_format == 'markdown': |
496 | | - if link: |
497 | | - print('# [{}]({})\n'.format(title, link), file=fout) |
498 | | - else: |
499 | | - print('# {}\n'.format(title), file=fout) |
| 510 | + continue |
500 | 511 |
|
501 | | - print(text + '\n', file=fout) |
| 512 | + filename = re.sub(r'\s+', '_', title) |
| 513 | + filename = re.sub(r'[\[\]\(\)()]', '_', filename) |
| 514 | + filename = re.sub(r'[“”\'"]', '', filename) |
| 515 | + if out_format == 'json': |
| 516 | + filename += '.json' |
| 517 | + elif out_format == 'markdown': |
| 518 | + filename += '.md' |
502 | 519 | elif out_format == 'org-mode': |
503 | | - if link: |
504 | | - title = title.replace('[', '_').replace(']', '_') |
505 | | - print('* [[{}][{}]]\n'.format(link, title), |
506 | | - file=fout) |
507 | | - else: |
508 | | - print('* {}\n'.format(title), file=fout) |
509 | | - |
510 | | - print(text + '\n', file=fout) |
511 | | - |
512 | | - LOGGER.info("fetched %d articles and saved them in %s", idx + 1, outfile) |
| 520 | + filename += '.org' |
| 521 | + |
| 522 | + if save_image: |
| 523 | + image_contents = re.findall(r'!\[(?:[^\[\]]+)\]\((?:[^\(\)]+)\)', text) |
| 524 | + for image_content in image_contents: |
| 525 | + match = re.match(r'!\[(?P<alt>[^\[\]]+)\]\((?P<url>[^\(\)]+)\)', image_content) |
| 526 | + image_alt, image_url = itemgetter('alt', 'url')(match.groupdict()) |
| 527 | + if image_url in url_to_image: |
| 528 | + text = text.replace( |
| 529 | + image_content, |
| 530 | +                        '![{}]({})'.format(image_alt, url_to_image[image_url])
| 531 | + ) |
| 532 | + continue |
| 533 | + |
| 534 | + image_filename = '' |
| 535 | + if not re.findall(r'[\?\!\/=\&]', image_alt): |
| 536 | + image_filename = re.sub(r'\.[a-z]+$', '', image_alt) |
| 537 | + else: |
| 538 | + image_filename = str(uuid4()).replace('-', '') |
| 539 | + |
| 540 | + return_image_file = download_image( |
| 541 | + image_url, |
| 542 | + outdir, |
| 543 | + image_filename, |
| 544 | + proxies=client.proxies |
| 545 | + ) |
| 546 | + if return_image_file: |
| 547 | + LOGGER.info('Download image as "%s" from "%s"', return_image_file, image_url) |
| 548 | + text = text.replace( |
| 549 | + image_content, |
| 550 | +                            '![{}]({})'.format(image_alt, return_image_file)
| 551 | + ) |
| 552 | + url_to_image[image_url] = return_image_file |
| 553 | + |
| 554 | + with open(os.path.join(outdir, filename), 'w') as fout: |
| 555 | + if out_format == 'json': |
| 556 | + json.dump( |
| 557 | + {'title': title, 'content': text, 'url': link}, |
| 558 | + fout, ensure_ascii=False, indent=4 |
| 559 | + ) |
| 560 | + elif out_format == 'markdown': |
| 561 | + print(title + '\n=====\n\nLINK: ' + link + '\n\n', file=fout) |
| 562 | +                text = re.sub(r'!\[([^\[\]]+)\]\(([^\(\)]+)\)', r'\n![\1](\2)\n', text)
| 563 | + print(text + '\n', file=fout) |
| 564 | + elif out_format == 'org-mode': |
| 565 | + print('#+TITLE: ' + title + '\n\nLINK: ' + link + '\n\n', file=fout) |
| 566 | + text = re.sub(r'!\[([^\[\]]+)\]\(([^\(\)]+)\)', r'\n[[file:\2][\1]]\n', text) |
| 567 | + text = re.sub(r'\[([^\[\]]+)\]\(([^\(\)]+)\)', r'[[\2][\1]]', text) |
| 568 | + print(text + '\n', file=fout) |
| 569 | + |
| 570 | + LOGGER.info('saved article "%s" in directory "%s"', title, outdir) |
513 | 571 |
|
514 | | - fout.close() |
| 572 | + if out_format == 'csv': |
| 573 | + fout.close() |
| 574 | + LOGGER.info("fetched %d articles and saved them in %s", fetched_count, outfile) |
| 575 | + else: |
| 576 | + LOGGER.info("fetched %d articles and saved them in %s", fetched_count, outdir) |
515 | 577 |
|
516 | 578 |
|
517 | 579 | if __name__ == '__main__': |
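
Note on the `--save-image` path added above: when the flag is set, every markdown image reference `![alt](url)` in the article body is rewritten to point at a locally downloaded copy, and the `url_to_image` dict caches results so the same URL is downloaded only once across all fetched articles. Below is a minimal standalone sketch of that rewrite step; the `fetch_image` callable is a stand-in for `inoreader.utils.download_image` (assumed, as in the diff, to return the saved file path, or a falsy value when the download fails), and the filename derivation from the alt text or a UUID is omitted here.

    import re

    def rewrite_images(text, outdir, fetch_image, url_to_image):
        """Replace remote markdown images in `text` with locally saved copies."""
        for match in re.finditer(r'!\[(?P<alt>[^\[\]]+)\]\((?P<url>[^()]+)\)', text):
            alt, url = match.group('alt'), match.group('url')
            if url not in url_to_image:
                local_path = fetch_image(url, outdir, alt)  # hypothetical stand-in for download_image
                if not local_path:
                    continue  # download failed: keep the remote link untouched
                url_to_image[url] = local_path
            text = text.replace(match.group(0), '![{}]({})'.format(alt, url_to_image[url]))
        return text

Passing `url_to_image` in from the caller mirrors the command's behaviour of keeping one cache for the whole fetch loop, so an image embedded in several starred articles is only downloaded once.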
|