-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_documents.py
More file actions
executable file
·60 lines (46 loc) · 1.41 KB
/
process_documents.py
File metadata and controls
executable file
·60 lines (46 loc) · 1.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python3
"""
Process the documents pipeline.
Usage:
python process_documents.py [--year YYYY] [pdfs|podcasts|posts|images|md|all]
Notes:
- Instapaper HTML articles are downloaded and processed without special
starring rules.
"""
import argparse
from pipeline_manager import DocumentProcessor, PIPELINE_TARGETS
import config as cfg
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace: ``year`` holds the optional ``--year`` integer
        (``None`` when omitted) and ``targets`` holds the list of one or
        more requested targets, each drawn from ``PIPELINE_TARGETS`` or the
        literal ``"all"``.
    """
    # Every pipeline target is selectable, plus the "all" shorthand.
    allowed_targets = list(PIPELINE_TARGETS) + ["all"]

    parser = argparse.ArgumentParser(
        description="Document pipeline: podcasts, Instapaper, PDFs, images, and Markdown.",
    )
    parser.add_argument(
        "--year",
        type=int,
        help="Use this year instead of the default (DOCPIPE_YEAR or current year)",
    )
    parser.add_argument(
        "targets",
        nargs="+",
        choices=allowed_targets,
        help="Process only the specified types",
    )
    return parser.parse_args()
def get_year_from_args_and_env(args) -> int:
    """Return the year to process, preferring the CLI value.

    Args:
        args: Parsed arguments object; only its ``year`` attribute is read.

    Returns:
        ``args.year`` when a value was supplied, otherwise whatever
        ``cfg.get_default_year()`` returns.
    """
    # Test against None rather than truthiness so an explicitly supplied
    # numeric value such as 0 is not silently swapped for the default.
    if args.year is not None:
        return args.year
    return cfg.get_default_year()
def main():
    """Run the document pipeline for the targets chosen on the command line.

    Parses the CLI arguments, resolves the year, builds a
    ``DocumentProcessor`` from ``cfg.BASE_DIR`` and that year, then runs
    either ``process_all()`` (when ``all`` is among the targets) or
    ``process_targets()`` with the listed targets.

    Raises:
        SystemExit: With status 1 when the processor call returns a falsy
            result.
    """
    args = parse_args()
    year = get_year_from_args_and_env(args)

    processor = DocumentProcessor(cfg.BASE_DIR, year)
    if "all" in args.targets:
        success = processor.process_all()
    else:
        success = processor.process_targets(args.targets)

    if not success:
        # Raise SystemExit directly: the bare ``exit`` builtin is installed
        # by the ``site`` module and is missing under ``python -S`` or in
        # frozen/embedded interpreters.
        raise SystemExit(1)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()