diff --git a/.gitignore b/.gitignore index 734f6d4..11ef9a4 100755 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,16 @@ muss *access *SBU +# Debug/output files +output*.txt +error_log.txt +install_log.txt +*.log + +# IDE settings +.vscode/ +.idea/ + *sub_captions # C extensions *.so diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100755 index 457f44d..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.analysis.typeCheckingMode": "basic" -} \ No newline at end of file diff --git a/SimSum/Bart_baseline_finetuned.py b/SimSum/Bart_baseline_finetuned.py index f952d57..95ad23f 100755 --- a/SimSum/Bart_baseline_finetuned.py +++ b/SimSum/Bart_baseline_finetuned.py @@ -29,8 +29,8 @@ from torch.utils.data import Dataset, DataLoader import pytorch_lightning as pl from pytorch_lightning.trainer import seed_everything +from torch.optim import AdamW from transformers import ( - AdamW, T5ForConditionalGeneration, T5TokenizerFast, BertTokenizer, BertForPreTraining, @@ -215,7 +215,7 @@ def train_dataloader(self): drop_last=True, shuffle=True, pin_memory=True, - num_workers=4) + num_workers=0) t_total = ((len(dataloader.dataset) // (self.args.train_batch_size * max(1, self.args.n_gpu))) // self.args.gradient_accumulation_steps * float(self.args.num_train_epochs) @@ -233,7 +233,7 @@ def val_dataloader(self): sample_size=self.args.valid_sample_size) return DataLoader(val_dataset, batch_size=self.args.valid_batch_size, - num_workers=4) + num_workers=0) @staticmethod def add_model_specific_args(parent_parser): p = ArgumentParser(parents=[parent_parser],add_help = False) diff --git a/SimSum/preprocessor.py b/SimSum/preprocessor.py index 8ced15a..6a70e23 100755 --- a/SimSum/preprocessor.py +++ b/SimSum/preprocessor.py @@ -65,7 +65,7 @@ def tokenize(sentence): def write_lines(lines, filepath): filepath = Path(filepath) filepath.parent.mkdir(parents=True, exist_ok=True) - with filepath.open("w") as fout: + with filepath.open("w", encoding='utf-8') as fout: for line in lines: fout.write(line + '\n') @@ -76,21 +76,21 @@ def read_lines(filepath): def yield_lines(filepath): filepath = Path(filepath) - with filepath.open('r') as f: + with filepath.open('r', encoding='utf-8') as f: for line in f: yield line.rstrip() def yield_sentence_pair_with_index(filepath1, filepath2): index = 0 - with Path(filepath1).open('r') as f1, Path(filepath2).open('r') as f2: + with Path(filepath1).open('r', encoding='utf-8') as f1, Path(filepath2).open('r', encoding='utf-8') as f2: for line1, line2 in zip(f1, f2): index += 1 yield (line1.rstrip(), line2.rstrip(), index) def yield_sentence_pair(filepath1, filepath2): - with Path(filepath1).open('r') as f1, Path(filepath2).open('r') as f2: + with Path(filepath1).open('r', encoding='utf-8') as f1, Path(filepath2).open('r', encoding='utf-8') as f2: for line1, line2 in zip(f1, f2): yield line1.rstrip(), line2.rstrip() @@ -98,7 +98,7 @@ def yield_sentence_pair(filepath1, filepath2): def count_line(filepath): filepath = Path(filepath) line_count = 0 - with filepath.open("r") as f: + with filepath.open("r", encoding='utf-8') as f: for line in f: line_count += 1 return line_count diff --git a/env.yml b/env.yml deleted file mode 100755 index 57cf45f..0000000 --- a/env.yml +++ /dev/null @@ -1,167 +0,0 @@ -name: xinyu -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=5.1=1_gnu - - blas=1.0=mkl - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.07.19=h06a4308_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h295c915_3 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - joblib=1.1.0=pyhd3eb1b0_0 - - jpeg=9e=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.38=h1181459_1 - - libffi=3.3=he6710b0_2 - - libgcc-ng=11.2.0=h1234567_1 - - libgomp=11.2.0=h1234567_1 - - libiconv=1.16=h7f8727e_2 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=11.2.0=h1234567_1 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h2818925_1 - - libunistring=0.9.10=h27cfd23_0 - - libuuid=1.0.3=h7f8727e_2 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl_fft=1.3.1=py310hd6ae3a3_0 - - mkl_random=1.2.2=py310h00e6091_0 - - ncurses=6.3=h5eee18b_3 - - nettle=3.7.3=hbbd107a_1 - - nltk=3.7=pyhd3eb1b0_0 - - numpy-base=1.23.1=py310hcba007f_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1q=h7f8727e_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - python=3.10.4=h12debd9_0 - - pytorch=1.12.1=py3.10_cuda11.3_cudnn8.3.2_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.5=hc218d9a_0 - - tk=8.6.12=h1ccaba5_0 - - typing_extensions=4.3.0=py310h06a4308_0 - - tzdata=2022a=hda174b7_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7f8727e_1 - - zlib=1.2.12=h7f8727e_2 - - zstd=1.5.2=ha4553b6_0 - - pip: - - absl-py==1.2.0 - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - alembic==1.8.1 - - async-timeout==4.0.2 - - attrs==22.1.0 - - autopage==0.5.1 - - bert-score==0.3.11 - - blis==0.7.8 - - cachetools==5.2.0 - - catalogue==2.0.8 - - certifi==2022.6.15 - - cffi==1.15.1 - - charset-normalizer==2.1.0 - - click==8.0.4 - - cliff==3.10.1 - - cmaes==0.8.2 - - cmd2==2.4.2 - - colorama==0.4.5 - - colorlog==6.6.0 - - cryptography==37.0.1 - - cycler==0.11.0 - - cymem==2.0.6 - - filelock==3.7.1 - - fonttools==4.34.4 - - frozenlist==1.3.1 - - fsspec==2022.7.1 - - google-auth==2.10.0 - - google-auth-oauthlib==0.4.6 - - greenlet==1.1.2 - - grpcio==1.47.0 - - huggingface-hub==0.8.1 - - jarowinkler==1.2.0 - - jinja2==3.1.2 - - kiwisolver==1.4.4 - - langcodes==3.3.0 - - levenshtein==0.20.2 - - lxml==4.9.1 - - mako==1.2.1 - - markdown==3.4.1 - - markupsafe==2.1.1 - - matplotlib==3.5.2 - - mkl-fft==1.3.1 - - mkl-random==1.2.2 - - mkl-service==2.4.0 - - multidict==6.0.2 - - murmurhash==1.0.7 - - numpy==1.23.1 - - oauthlib==3.2.0 - - optuna==2.10.1 - - packaging==21.3 - - pandas==1.4.3 - - pathy==0.6.2 - - pbr==5.9.0 - - pillow==9.2.0 - - pip==22.1.2 - - portalocker==2.5.1 - - preshed==3.0.6 - - prettytable==3.3.0 - - protobuf==3.19.4 - - pyasn1==0.4.8 - - pyasn1-modules==0.2.8 - - pydantic==1.9.1 - - pydeprecate==0.3.2 - - pyparsing==3.0.9 - - pyperclip==1.8.2 - - python-dateutil==2.8.2 - - pytorch-lightning==1.7.0 - - pytz==2022.1 - - pyyaml==6.0 - - rapidfuzz==2.4.2 - - regex==2022.7.9 - - requests==2.28.1 - - requests-oauthlib==1.3.1 - - rsa==4.9 - - sacrebleu==2.2.0 - - sacremoses==0.0.53 - - scipy==1.9.0 - - setuptools==61.2.0 - - smart-open==5.2.1 - - spacy==3.4.1 - - spacy-legacy==3.0.9 - - spacy-loggers==1.0.3 - - sqlalchemy==1.4.39 - - srsly==2.4.4 - - stevedore==4.0.0 - - summarizer==0.0.7 - - tabulate==0.8.10 - - tensorboard==2.9.1 - - tensorboard-data-server==0.6.1 - - tensorboard-plugin-wit==1.8.1 - - thinc==8.1.0 - - tokenizers==0.12.1 - - torch==1.12.1 - - torchmetrics==0.9.3 - - tqdm==4.64.0 - - transformers==4.21.1 - - typer==0.4.2 - - typing-extensions==4.3.0 - - urllib3==1.26.11 - - wasabi==0.10.1 - - wcwidth==0.2.5 - - werkzeug==2.2.1 - - yarl==1.8.1 -prefix: /home/xinyzhou/.conda/envs/xinyu diff --git a/requirements.txt b/requirements.txt index 3a6cf4b..e5fc3a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,26 +1,26 @@ -bert_score==0.3.11 -click==8.0.4 -keybert==0.7.0 -matplotlib==3.5.2 -nltk==3.7 -numpy==1.23.1 -optuna==2.10.1 -pandas==1.4.3 -plotly==5.14.1 -python_Levenshtein==0.21.0 -pytorch_lightning==1.7.0 -rouge==1.0.1 -sacrebleu==2.2.0 -sacremoses==0.0.53 -scikit_learn==1.2.2 -simalign==0.3 -spacy==3.4.1 -stanfordnlp==0.2.0 -summarizer==0.0.7 -torch==1.12.1 -torchfile==0.1.0 -tqdm==4.64.0 -transformers==4.21.1 -tupa==1.4.2 -ucca==1.3.11 -yattag==1.15.1 +bert_score>=0.3.11 +click>=8.0.4 +keybert>=0.7.0 +matplotlib>=3.5.2 +nltk>=3.7 +numpy>=1.23.1 +optuna>=2.10.1 +pandas>=1.4.3 +plotly>=5.14.1 +python_Levenshtein>=0.21.0 +pytorch_lightning>=1.7.0 +rouge>=1.0.1 +sacrebleu>=2.2.0 +sacremoses>=0.0.53 +scikit_learn>=1.2.2 +simalign>=0.3 +spacy>=3.4.1 +stanfordnlp>=0.2.0 +summarizer>=0.0.7 +torch +torchfile>=0.1.0 +tqdm>=4.64.0 +transformers>=4.21.1 +tupa>=1.4.2 +ucca>=1.3.11 +yattag>=1.15.1