File tree Expand file tree Collapse file tree 1 file changed +26
-0
lines changed Expand file tree Collapse file tree 1 file changed +26
-0
lines changed Original file line number Diff line number Diff line change
1
#!/bin/bash
#
# Prepares the working directories and the two input lists used by the rest
# of this script:
#   wildcards.txt - one fixed-string pattern "/<arxiv_id>." per entry of
#                   pwc/papers-with-abstracts.json that has an arxiv_id
#   tars.txt      - the text of every <filename> element found in the arXiv
#                   source manifest (downloaded from the requester-pays
#                   bucket s3://arxiv)
#
# `set -e` is deliberately NOT enabled: later steps use grep's "no match"
# status (1) as a normal outcome. Critical commands are checked explicitly.

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

index_dir="index"
papers_dir="papers"
src_dir="src"
mkdir -p "${index_dir}" "${papers_dir}" "${src_dir}" \
  || die "cannot create working directories"

# Without pipefail a jq failure would be hidden by sort's exit status.
set -o pipefail
jq -r '.[] | select(.arxiv_id) | "/"+.arxiv_id+"."' pwc/papers-with-abstracts.json \
  | sort -u > wildcards.txt \
  || die "cannot build wildcards.txt from pwc/papers-with-abstracts.json"

aws s3 cp --request-payer requester s3://arxiv/src/arXiv_src_manifest.xml . \
  || die "cannot download arXiv_src_manifest.xml"

xmllint --xpath '//filename/text()' arXiv_src_manifest.xml > tars.txt \
  || die "cannot extract archive names from arXiv_src_manifest.xml"

# An empty list means there is nothing to process; fail loudly instead of
# finishing "successfully" with no work done.
[[ -s tars.txt ]] || die "tars.txt is empty"
10
+
11
#######################################
# Download one arXiv source archive, index its contents and extract the
# members whose names match a pattern in wildcards.txt.
# Globals:
#   src_dir    (read) - directory the archive is downloaded into
#   index_dir  (read) - directory receiving <archive>.ls and <archive>.txt
#   papers_dir (read) - directory the matching members are extracted into
# Arguments:
#   $1 - archive key inside the s3://arxiv bucket; its last path component
#        is used as the local archive name
# Outputs:
#   Progress messages on stdout, diagnostics on stderr. Writes
#   wildcards-matched member names to to_extract.txt in the current directory.
# Returns:
#   0 on success, when the archive already exists locally (nothing is done),
#   or when no member matches; non-zero if the download, listing, pattern
#   match or extraction fails.
#######################################
process_file() {
  # Everything is local so the caller's variables (e.g. a loop variable
  # named "file") are not overwritten.
  local path=$1
  local archive_name=${path##*/}
  local file="${src_dir}/${archive_name}"
  local listing="${index_dir}/${archive_name}.txt"
  local rc=0

  if [[ -z "${archive_name}" ]]; then
    echo "process_file: empty archive name in '${path}'" >&2
    return 1
  fi

  echo "Processing ${file}..."
  if [[ -e "${file}" ]]; then
    echo "Already exists, skipping..."
    return 0
  fi

  if ! aws s3 cp --request-payer requester "s3://arxiv/${path}" "${src_dir}"; then
    echo "process_file: download of ${path} failed" >&2
    # Remove any leftover so a rerun retries instead of skipping it as
    # "already exists".
    rm -f -- "${file}"
    return 1
  fi

  # Verbose listing (.ls) and plain member names (.txt).
  if ! tar -tvf "${file}" > "${index_dir}/${archive_name}.ls" \
    || ! tar -tf "${file}" > "${listing}"; then
    echo "process_file: cannot list contents of ${file}" >&2
    return 1
  fi

  # grep exits 0 on a match, 1 on no match and >1 on a real error.
  grep -F -f wildcards.txt -- "${listing}" > to_extract.txt || rc=$?
  case "${rc}" in
    0)
      # -d '\n': one member name per line, taken literally, so names with
      # quotes or blanks are passed to tar intact.
      xargs -a to_extract.txt -d '\n' -- tar xf "${file}" -C "${papers_dir}"
      ;;
    1)
      return 0
      ;;
    *)
      echo "process_file: matching ${listing} against wildcards.txt failed" >&2
      return "${rc}"
      ;;
  esac
}
22
+
23
# Run process_file for every archive path listed in tars.txt (one per line).
#  - IFS= and -r keep each line exactly as written (no trimming, no
#    backslash processing).
#  - The `|| [[ -n ... ]]` clause still handles a final line that has no
#    trailing newline.
#  - The list is read on fd 3 so commands started by process_file cannot
#    consume it through their stdin.
# A failure for one archive does not stop the remaining ones.
while IFS= read -r tar_path <&3 || [[ -n "${tar_path}" ]]; do
  [[ -n "${tar_path}" ]] || continue  # ignore blank lines
  process_file "${tar_path}"
done 3< tars.txt
You can’t perform that action at this time.
0 commit comments