Skip to content

Commit 35d7961

Browse files
committed
Add script to download arxiv papers
Downloads tarballs from arXiv's requester-pays S3 bucket and extracts the files of papers listed in papers-with-abstracts.json
1 parent baca6a3 commit 35d7961

File tree

1 file changed

+26
-0
lines changed

1 file changed

+26
-0
lines changed

download_arxiv_ml_papers.sh

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
#
# Downloads source tarballs from arXiv's requester-pays S3 bucket and extracts
# the members whose names match an arxiv_id found in
# pwc/papers-with-abstracts.json.
#
# Requires: jq, aws (CLI), xmllint, tar, grep, xargs.
# Outputs (relative to the current directory):
#   wildcards.txt  - one fixed-string pattern "/<arxiv_id>." per paper
#   tars.txt       - tarball paths listed in the S3 manifest
#   index/ papers/ src/ - per-archive listings, extracted files, tarballs

# Without pipefail a jq failure would be hidden by the exit status of sort.
set -o pipefail

index_dir="index"
papers_dir="papers"
src_dir="src"
mkdir -p "${index_dir}" "${papers_dir}" "${src_dir}" || exit 1

# Build the pattern list used to select archive members. Abort on failure:
# continuing with an empty list would download every (paid) tarball and
# extract nothing.
if ! jq -r '.[] | select(.arxiv_id) | "/"+.arxiv_id+"."' pwc/papers-with-abstracts.json | sort -u > wildcards.txt; then
  echo "Failed to build wildcards.txt from pwc/papers-with-abstracts.json" >&2
  exit 1
fi

# Fetch the manifest that enumerates all source tarballs in the bucket.
if ! aws s3 cp --request-payer requester s3://arxiv/src/arXiv_src_manifest.xml .; then
  echo "Failed to download arXiv_src_manifest.xml" >&2
  exit 1
fi

# Extract the tarball paths (text of every <filename> element) from the manifest.
if ! xmllint --xpath '//filename/text()' arXiv_src_manifest.xml > tars.txt; then
  echo "Failed to extract tarball names from arXiv_src_manifest.xml" >&2
  exit 1
fi
10+
11+
#######################################
# Download one arXiv source tarball, index its contents and extract the
# members that match a pattern in wildcards.txt.
# Globals:
#   src_dir    (read) - directory the tarball is downloaded into
#   index_dir  (read) - directory receiving <archive>.ls / <archive>.txt listings
#   papers_dir (read) - directory the matching members are extracted into
# Arguments:
#   $1 - key of the tarball inside the s3://arxiv bucket
# Outputs:
#   Progress messages on stdout, errors on stderr; rewrites to_extract.txt
#   in the current directory.
# Returns:
#   0 if the tarball was already present, or was processed (including the
#   case where no member matched); non-zero if the download, the listing or
#   the extraction failed.
#######################################
process_file () {
  # Locals keep this function from overwriting the caller's variables
  # (the caller's loop variable is also named "file").
  local path="$1"
  local archive_name file listing
  archive_name=$(basename "${path}")
  file="${src_dir}/${archive_name}"
  listing="${index_dir}/${archive_name}.txt"

  echo "Processing ${file}..."
  if [ -e "${file}" ]; then
    echo "Already exists, skipping..."
    return 0
  fi

  if ! aws s3 cp --request-payer requester "s3://arxiv/${path}" "${src_dir}"; then
    echo "Failed to download s3://arxiv/${path}" >&2
    # Drop any partial file so a later run does not skip it as "already exists".
    rm -f -- "${file}"
    return 1
  fi

  # Verbose and plain listings of the archive; the plain one drives extraction.
  tar -tvf "${file}" > "${index_dir}/${archive_name}.ls" || return 1
  tar -tf "${file}" > "${listing}" || return 1

  # grep exits non-zero when nothing matches; that only means this archive
  # holds none of the wanted papers, so it is not treated as an error.
  if grep -F -f wildcards.txt -- "${listing}" > to_extract.txt; then
    # -d '\n': one member name per line, with no quote/blank interpretation.
    xargs -d '\n' -a to_extract.txt -- tar xf "${file}" -C "${papers_dir}"
  fi
}
22+
23+
# Process every tarball path listed in tars.txt, one per line.
# IFS= and -r keep each line verbatim (no whitespace trimming, no backslash
# interpretation); the extra -n test still handles a final line that has no
# trailing newline.
while IFS= read -r file || [ -n "${file}" ]; do
  # Ignore blank lines rather than passing an empty path on.
  [ -n "${file}" ] || continue
  process_file "${file}"
done <tars.txt

0 commit comments

Comments
 (0)