Skip to content

Commit ba31433

Browse files
authored
Merge pull request #2 from hscells/limiter_v1
Limiter v1
2 parents 437ef5d + 5484457 commit ba31433

File tree

4 files changed

+107
-13
lines changed

4 files changed

+107
-13
lines changed

Pipfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ aiofile = "*"
2828
click = "*"
2929
seaborn = "*"
3030
prompt-toolkit = "*"
31-
lucene = { path = "./pylucene/dist/lucene-9.4.1-cp310-cp310-macosx_12_0_arm64.whl" }
31+
lucene = { path = "./pylucene/dist/lucene-10.0.0-cp310-cp310-macosx_12_0_arm64.whl" }
3232
lupyne = {editable = true, ref = "90874273a33bc02b0ac502f2daed2fde326f7511", git = "https://github.com/coady/lupyne.git"}
3333
snakeviz = "*"
3434
pip = "*"

install_pylucene.sh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ trap 'echo "\"${last_command}\" command filed with exit code $?."' EXIT
1212

1313
# Change these variables as needed.
1414
mirror=downloads
15-
lucene_version=9.12.0
15+
lucene_version=10.0.0
1616
ant_version=1.10.14
1717

1818
# Download pylucene and ant.
@@ -33,8 +33,10 @@ export PATH="$PATH:$(pwd)/ant/bin"
3333
# https://lucene.apache.org/pylucene/jcc/install.html
3434
cd pylucene
3535
pushd jcc
36-
# IMPORTANT: Need java 17
37-
export JCC_INCLUDES=/opt/homebrew/Cellar/openjdk@17/17.0.15/libexec/openjdk.jdk/Contents/Home/include:/opt/homebrew/Cellar/openjdk@17/17.0.15/libexec/openjdk.jdk/Contents/Home/include/darwin
36+
# IMPORTANT: PyLucene 10.0.0 needs Java 21 (the earlier 9.x setup used Java 17)
37+
#export JCC_INCLUDES=/opt/homebrew/Cellar/openjdk@17/17.0.15/libexec/openjdk.jdk/Contents/Home/include:/opt/homebrew/Cellar/openjdk@17/17.0.15/libexec/openjdk.jdk/Contents/Home/include/darwin
38+
export JCC_INCLUDES=/opt/homebrew/Cellar/openjdk@21/21.0.9/libexec/openjdk.jdk/Contents/Home/include:/opt/homebrew/Cellar/openjdk@21/21.0.9/libexec/openjdk.jdk/Contents/Home/include/darwin
39+
3840
uv run setup.py build
3941
uv run setup.py install
4042
popd

src/pybool_ir/cli/__main__.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,31 @@ def experiment():
5757
required=True,
5858
help="location to download Pubmed baseline"
5959
)
60-
def pubmed_download(baseline_path: Path):
60+
61+
@click.option(
62+
"-l",
63+
"--limit",
64+
"limit",
65+
type=int,
66+
default=0,
67+
help="number of documents, that should be downloaded"
68+
)
69+
70+
# TODO check for deletion
71+
# def pubmed_download(baseline_path: Path):
72+
# from pybool_ir.datasets.pubmed.baseline import download_baseline
73+
# download_baseline(Path(baseline_path))
74+
75+
# TODO check for correct function
76+
def pubmed_download(baseline_path: Path, limit: int):
    # CLI entry point for fetching the PubMed baseline into `baseline_path`.
    # A positive `limit` is forwarded to download_baseline; any other value
    # triggers a full download without a limit argument.
    # (Documented with comments rather than a docstring so the command's
    # --help text is left untouched.)
    from pybool_ir.datasets.pubmed.baseline import download_baseline

    # Guard clause: no positive limit means "download everything".
    if not limit > 0:
        print(f"Download full baseline to {baseline_path}...")
        download_baseline(Path(baseline_path))
        return

    print(f"Start download of {limit} documents to {baseline_path}...")
    download_baseline(Path(baseline_path), limit)
6385

6486

6587
@pubmed.command("process")

src/pybool_ir/datasets/pubmed/baseline.py

Lines changed: 77 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,88 @@
77
from pybool_ir.datasets.pubmed.datautils import FTP_URL, FTP_BASELINE_CWD
88

99

10-
def download_baseline(path: Path):
10+
# def download_baseline(path: Path):
11+
# with FTP(host=FTP_URL, user="anonymous") as ftp:
12+
# ftp.cwd(FTP_BASELINE_CWD)
13+
# files = []
14+
# ftp.dir(files.append)
15+
16+
# os.makedirs(str(path), exist_ok=True)
17+
18+
# for filename in reversed(datautils.dir_to_filenames(files)):
19+
# if os.path.exists(str(path / filename)):
20+
# print(f"found {path / filename}, skipping")
21+
# continue
22+
# util.download_file("https://" + FTP_URL + FTP_BASELINE_CWD + filename, path / filename)
23+
24+
# ftp.close()
25+
26+
# previous sequential version with limit support (commented out; superseded by the parallel version below)
27+
# def download_baseline(path: Path, limit: int = None):
28+
# with FTP(host=FTP_URL, user="anonymous") as ftp:
29+
# ftp.cwd(FTP_BASELINE_CWD)
30+
# files = []
31+
# ftp.dir(files.append)
32+
33+
# os.makedirs(str(path), exist_ok=True)
34+
35+
# filenames = reversed(datautils.dir_to_filenames(files))
36+
37+
# if limit is not None and limit > 0:
38+
# filenames = list(filenames)[:limit]
39+
# print(f"Limit set: Downloading first {limit} documents ...")
40+
41+
# for filename in filenames:
42+
# if os.path.exists(str(path / filename)):
43+
# print(f"found {path / filename}, skipping")
44+
# continue
45+
46+
# util.download_file("https://" + FTP_URL + FTP_BASELINE_CWD + filename, path / filename)
47+
48+
# ftp.close()
49+
50+
# test version -> parallel download
51+
from concurrent.futures import ThreadPoolExecutor, as_completed
52+
from ftplib import FTP
53+
from pathlib import Path
54+
import time
55+
import os
56+
57+
from pybool_ir import util
58+
from pybool_ir.datasets.pubmed import datautils
59+
from pybool_ir.datasets.pubmed.datautils import FTP_URL, FTP_BASELINE_CWD
60+
61+
62+
def download_baseline(path: Path, limit: int = None, workers: int = 2, retries: int = 3):
    """Download PubMed baseline files into ``path`` using a thread pool.

    The file listing is read over FTP; every file is then fetched over HTTPS
    with ``util.download_file``. Files that already exist in ``path`` are
    skipped. One status line (``skip``/``done``/``FAILED``) is printed per
    file as its download finishes.

    :param path: Target directory; it is created if it does not exist.
    :param limit: When a positive integer, only the first ``limit`` entries
        of the reversed listing are downloaded. ``None`` or a non-positive
        value downloads every listed file.
    :param workers: Number of concurrent download threads. Values below 1
        are treated as 1 (``ThreadPoolExecutor`` rejects them otherwise).
    :param retries: Maximum number of attempts per file. Values below 1 are
        treated as 1 so every file is tried at least once.
    """
    path = Path(path)
    workers = max(1, workers)
    retries = max(1, retries)

    # Only the directory listing needs FTP, so the connection is released
    # before the (potentially long-running) HTTPS downloads start.
    with FTP(host=FTP_URL, user="anonymous") as ftp:
        ftp.cwd(FTP_BASELINE_CWD)
        files = []
        ftp.dir(files.append)

    os.makedirs(str(path), exist_ok=True)
    filenames = list(reversed(datautils.dir_to_filenames(files)))

    if limit is not None and limit > 0:
        filenames = filenames[:limit]
        print(f"Limit set: Downloading first {limit} documents ...")

    def download_one(filename):
        """Fetch a single file with retries and return a status string."""
        target = path / filename
        if target.exists():
            return f"skip {filename}"

        url = "https://" + FTP_URL + FTP_BASELINE_CWD + filename

        for attempt in range(1, retries + 1):
            try:
                util.download_file(url, target)
                return f"done {filename}"
            except Exception as e:
                # Presumably download_file writes straight to `target`, so a
                # failed attempt may leave a truncated file behind. Remove it
                # so a later run does not skip it as "already downloaded".
                try:
                    target.unlink(missing_ok=True)
                except OSError:
                    pass
                if attempt == retries:
                    return f"FAILED {filename}: {e}"
                # Linear back-off before the next attempt.
                time.sleep(2 * attempt)

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(download_one, fn) for fn in filenames]
        for f in as_completed(futures):
            print(f.result())

0 commit comments

Comments
 (0)