Skip to content

Commit c5381ff

Browse files
committed
Update download_pdfs.py
1 parent 7d2a846 commit c5381ff

File tree

2 files changed

+20
-3
lines changed

2 files changed

+20
-3
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
.DS_Store
22
*.zip
3+
downloaded_pdfs

download_pdfs.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ def extract_pdf_info(text):
1313
pattern = r'\|.*?\|\s*🔥*\[([^\]]+)\].*?\[\[(.*?pdf.*?)\]\]\((https://[^\)]+)\)'
1414
matches = re.findall(pattern, text)
1515
info_list = []
16+
# Add default pdf: https://github.com/xlite-dev/Awesome-LLM-Inference/releases/download/v0.3/Awesome-LLM-Inference-v0.3.pdf.zip
17+
info_list.append((
18+
"Awesome-LLM-Inference-v0.3",
19+
"https://github.com/xlite-dev/Awesome-LLM-Inference/releases/download/v0.3/Awesome-LLM-Inference-v0.3.pdf.zip"
20+
))
1621
for title, _, link in matches:
1722
# Remove special characters from the paper title to avoid illegal file names.
1823
valid_title = re.sub(r'[\\/*?:"<>|]', '', title)
@@ -34,8 +39,17 @@ def download_file(title, url):
3439
download_dir = 'downloaded_pdfs'
3540
if not os.path.exists(download_dir):
3641
os.makedirs(download_dir)
37-
38-
file_name = os.path.join(download_dir, f"{title}.pdf")
42+
# Use the paper title as the file name.
43+
# Choose the extension from the URL: links ending in .zip are saved as .zip, everything else as .pdf.
44+
if not url.endswith('.zip'):
45+
file_name = os.path.join(download_dir, f"{title}.pdf")
46+
else:
47+
file_name = os.path.join(download_dir, f"{title}.zip")
48+
# Check if the file already exists.
49+
if os.path.exists(file_name):
50+
print(f"File {file_name} already exists. Skipping download.")
51+
return True
52+
# Get the total size of the file from the response headers.
3953
total_size = int(response.headers.get('content-length', 0))
4054

4155
print(f"Downloading paper {title}, file name is {file_name}...")
@@ -45,6 +59,8 @@ def download_file(title, url):
4559
unit='B',
4660
unit_scale=True,
4761
unit_divisor=1024,
62+
colour="yellow",
63+
ascii=True,
4864
) as bar:
4965
for data in response.iter_content(chunk_size=1024):
5066
size = file.write(data)
@@ -64,7 +80,7 @@ def main():
6480
pdf_info = extract_pdf_info(text)
6581
print(f"A total of {len(pdf_info)} PDF links were matched.")
6682

67-
for title, link in pdf_info:
83+
for title, link in tqdm(pdf_info, colour="blue"):
6884
download_file(title, link)
6985
except FileNotFoundError:
7086
print("README.md file not found. Please ensure the file exists in the current working directory.")

0 commit comments

Comments
 (0)