Merged
2 changes: 1 addition & 1 deletion Pipfile
@@ -28,7 +28,7 @@ aiofile = "*"
click = "*"
seaborn = "*"
prompt-toolkit = "*"
lucene = { path = "./pylucene/dist/lucene-9.4.1-cp310-cp310-macosx_12_0_arm64.whl" }
lucene = { path = "./pylucene/dist/lucene-10.0.0-cp310-cp310-macosx_12_0_arm64.whl" }
lupyne = {editable = true, ref = "90874273a33bc02b0ac502f2daed2fde326f7511", git = "https://github.com/coady/lupyne.git"}
snakeviz = "*"
pip = "*"
8 changes: 5 additions & 3 deletions install_pylucene.sh
@@ -12,7 +12,7 @@ trap 'echo "\"${last_command}\" command filed with exit code $?."' EXIT

# Change these variables as needed.
mirror=downloads
lucene_version=9.12.0
lucene_version=10.0.0
ant_version=1.10.14

# Download pylucene and ant.
@@ -33,8 +33,10 @@ export PATH="$PATH:$(pwd)/ant/bin"
# https://lucene.apache.org/pylucene/jcc/install.html
cd pylucene
pushd jcc
# IMPORTANT: Need java 17
export JCC_INCLUDES=/opt/homebrew/Cellar/openjdk@17/17.0.15/libexec/openjdk.jdk/Contents/Home/include:/opt/homebrew/Cellar/openjdk@17/17.0.15/libexec/openjdk.jdk/Contents/Home/include/darwin
# IMPORTANT: Requires Java 17 (Java 21 for pylucene v10.0.0)
#export JCC_INCLUDES=/opt/homebrew/Cellar/openjdk@17/17.0.15/libexec/openjdk.jdk/Contents/Home/include:/opt/homebrew/Cellar/openjdk@17/17.0.15/libexec/openjdk.jdk/Contents/Home/include/darwin
export JCC_INCLUDES=/opt/homebrew/Cellar/openjdk@21/21.0.9/libexec/openjdk.jdk/Contents/Home/include:/opt/homebrew/Cellar/openjdk@21/21.0.9/libexec/openjdk.jdk/Contents/Home/include/darwin

uv run setup.py build
uv run setup.py install
popd
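
Note: the JCC_INCLUDES export above pins an exact Homebrew Cellar path (openjdk@21 21.0.9), which breaks on every JDK patch release. A possible alternative, sketched below under the assumption that either a JDK 21 is registered with macOS's java_home or Homebrew's openjdk@21 keg is installed, derives the include paths at install time:

```bash
# Sketch only, not part of this PR: derive the JDK include paths instead of
# hardcoding a Cellar version. Falls back to the Homebrew keg location if no
# JDK 21 is registered with /usr/libexec/java_home.
JDK_HOME="$(/usr/libexec/java_home -v 21 2>/dev/null || echo "$(brew --prefix openjdk@21)/libexec/openjdk.jdk/Contents/Home")"
export JCC_INCLUDES="$JDK_HOME/include:$JDK_HOME/include/darwin"
```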
26 changes: 24 additions & 2 deletions src/pybool_ir/cli/__main__.py
@@ -57,9 +57,31 @@ def experiment():
    required=True,
    help="location to download Pubmed baseline"
)
def pubmed_download(baseline_path: Path):

@click.option(
    "-l",
    "--limit",
    "limit",
    type=int,
    default=0,
    help="number of baseline files to download (0 = all)"
)

# TODO: confirm the old version below can be deleted
# def pubmed_download(baseline_path: Path):
# from pybool_ir.datasets.pubmed.baseline import download_baseline
# download_baseline(Path(baseline_path))

# TODO: confirm this replacement is correct
def pubmed_download(baseline_path: Path, limit: int):
    from pybool_ir.datasets.pubmed.baseline import download_baseline
    download_baseline(Path(baseline_path))

    if limit > 0:
        print(f"Starting download of {limit} baseline files to {baseline_path} ...")
        download_baseline(Path(baseline_path), limit)
    else:
        print(f"Downloading the full baseline to {baseline_path} ...")
        download_baseline(Path(baseline_path))
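
Since download_baseline (see baseline.py below) already accepts an optional limit, the branch above could collapse into a single call. A hedged sketch, not part of this PR, with the click decorators omitted:

```python
# Sketch: equivalent command body with one download_baseline call.
# Assumes limit=None means "download the full baseline", as in baseline.py below.
from pathlib import Path


def pubmed_download(baseline_path: Path, limit: int):
    from pybool_ir.datasets.pubmed.baseline import download_baseline
    target = Path(baseline_path)
    if limit > 0:
        print(f"Starting download of {limit} baseline files to {target} ...")
    else:
        print(f"Downloading the full baseline to {target} ...")
    download_baseline(target, limit if limit > 0 else None)
```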


@pubmed.command("process")
84 changes: 77 additions & 7 deletions src/pybool_ir/datasets/pubmed/baseline.py
@@ -7,18 +7,88 @@
from pybool_ir.datasets.pubmed.datautils import FTP_URL, FTP_BASELINE_CWD


def download_baseline(path: Path):
# def download_baseline(path: Path):
# with FTP(host=FTP_URL, user="anonymous") as ftp:
# ftp.cwd(FTP_BASELINE_CWD)
# files = []
# ftp.dir(files.append)

# os.makedirs(str(path), exist_ok=True)

# for filename in reversed(datautils.dir_to_filenames(files)):
# if os.path.exists(str(path / filename)):
# print(f"found {path / filename}, skipping")
# continue
# util.download_file("https://" + FTP_URL + FTP_BASELINE_CWD + filename, path / filename)

# ftp.close()

# currently used
# def download_baseline(path: Path, limit: int = None):
# with FTP(host=FTP_URL, user="anonymous") as ftp:
# ftp.cwd(FTP_BASELINE_CWD)
# files = []
# ftp.dir(files.append)

# os.makedirs(str(path), exist_ok=True)

# filenames = reversed(datautils.dir_to_filenames(files))

# if limit is not None and limit > 0:
# filenames = list(filenames)[:limit]
# print(f"Limit set: Downloading first {limit} documents ...")

# for filename in filenames:
# if os.path.exists(str(path / filename)):
# print(f"found {path / filename}, skipping")
# continue

# util.download_file("https://" + FTP_URL + FTP_BASELINE_CWD + filename, path / filename)

# ftp.close()

# experimental version: parallel download with retries
from concurrent.futures import ThreadPoolExecutor, as_completed
from ftplib import FTP
from pathlib import Path
import time
import os

from pybool_ir import util
from pybool_ir.datasets.pubmed import datautils
from pybool_ir.datasets.pubmed.datautils import FTP_URL, FTP_BASELINE_CWD


def download_baseline(path: Path, limit: int = None, workers: int = 2, retries: int = 3):
    with FTP(host=FTP_URL, user="anonymous") as ftp:
        ftp.cwd(FTP_BASELINE_CWD)
        files = []
        ftp.dir(files.append)

        os.makedirs(str(path), exist_ok=True)
        filenames = list(reversed(datautils.dir_to_filenames(files)))

        if limit is not None and limit > 0:
            filenames = filenames[:limit]
            print(f"Limit set: downloading the first {limit} baseline files ...")

        def download_one(filename):
            target = path / filename
            if target.exists():
                return f"skip {filename}"

            url = "https://" + FTP_URL + FTP_BASELINE_CWD + filename

        for filename in reversed(datautils.dir_to_filenames(files)):
            if os.path.exists(str(path / filename)):
                print(f"found {path / filename}, skipping")
                continue
            util.download_file("https://" + FTP_URL + FTP_BASELINE_CWD + filename, path / filename)
            for attempt in range(1, retries + 1):
                try:
                    util.download_file(url, target)
                    return f"done {filename}"
                except Exception as e:
                    if attempt == retries:
                        return f"FAILED {filename}: {e}"
                    time.sleep(2 * attempt)

        ftp.close()
        with ThreadPoolExecutor(max_workers=workers) as executor:
            futures = [executor.submit(download_one, fn) for fn in filenames]
            for f in as_completed(futures):
                print(f.result())
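
For reference, a usage sketch of the parallel downloader above; the directory name and parameter values are illustrative, not prescribed by this PR:

```python
# Download five baseline files (the listing is walked in reverse) into
# ./pubmed-baseline, with two worker threads and up to three attempts per file.
from pathlib import Path

from pybool_ir.datasets.pubmed.baseline import download_baseline

download_baseline(Path("./pubmed-baseline"), limit=5, workers=2, retries=3)
```

Two workers and a backoff of 2 * attempt seconds keep the load on the NLM server modest; larger worker counts are untried here.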