diff --git a/crf_baseline/README.md b/crf_baseline/README.md
index 3b0385c..a7bb1f1 100644
--- a/crf_baseline/README.md
+++ b/crf_baseline/README.md
@@ -22,8 +22,13 @@ The data is expected to be in a *dataset* folder, in the main repository directo
 * [main_threeTasks](main_threeTasks.py) python script to train one CRF model for each task.
 * [validation](validation.py) python script to compute classification score on validation dataset for the three tasks.
-## Dependencies
-* Numpy: 1.13.3
-* Sklearn : 0.19.1
-* [Sklearn crfsuite](https://sklearn-crfsuite.readthedocs.io/en/latest/index.html) Sklearn crfsuite : 0.3.6
-* Python 3.5
\ No newline at end of file
+## Dependencies
+
+* Python 3.7
+
+See [requirements.txt](./requirements.txt) for a complete list of dependencies.
+
+For setup, first create a new virtual environment using your favoured method, then install dependencies with:
+
+```
+pip3 install -r requirements.txt
+```
diff --git a/crf_baseline/code/utils.py b/crf_baseline/code/utils.py
index f7d3421..a4d576e 100644
--- a/crf_baseline/code/utils.py
+++ b/crf_baseline/code/utils.py
@@ -12,16 +12,42 @@ def closePrintToFile(f, stdout_original):
     sys.stdout = stdout_original
     f.close()
 
+def load_data(filepath):
+    """
+    Load and return the data stored in the given path.
+    The data is structured as follows:
+    Each line contains four columns separated by a single space.
+    Each word has been put on a separate line and there is an empty line after each sentence.
+    The first item on each line is a word; the second, third and fourth are tags related to the word.
+    Example:
+    The sentence "L. Antonielli, Iprefetti dell' Italia napoleonica, Bologna 1983." is represented in the dataset as:
+        L author b-secondary b-r
+        . author i-secondary i-r
+        Antonielli author i-secondary i-r
+        , author i-secondary i-r
+        Iprefetti title i-secondary i-r
+        dell title i-secondary i-r
+        ’ title i-secondary i-r
+        Italia title i-secondary i-r
+        napoleonica title i-secondary i-r
+        , title i-secondary i-r
+        Bologna publicationplace i-secondary i-r
+        1983 year e-secondary i-r
+        . year e-secondary e-r
-def load_data(file):
+    :param filepath: Path to the data
+    :return: Four arrays: the first one contains sentences (one array of words per sentence) and the other three are arrays of tags.
+    """
+
+    # Arrays to return
     words = []
     tags_1 = []
     tags_2 = []
     tags_3 = []
-    tags_4 = []
-    word = tags1 = tags2 = tags3 = tags4 = []
-    with open (file, "r") as file:
+    word, tags1, tags2, tags3 = [], [], [], []
+    with open(filepath, "r") as file:
         for line in file:
             if 'DOCSTART' not in line: #Do not take the first line into consideration
                 # Check if empty line
@@ -31,14 +57,12 @@ def load_data(file):
                 if line in ['\n', '\r\n']:
                     # Append line
                     words.append(word)
                     tags_1.append(tags1)
                     tags_2.append(tags2)
                     tags_3.append(tags3)
-                    tags_4.append(tags4)
 
                     # Reset
                     word = []
                     tags1 = []
                     tags2 = []
                     tags3 = []
-                    tags4 = []
 
                 else:
                     # Split the line into words, tag #1, tag #2, tag #3
@@ -47,6 +71,43 @@ def load_data(file):
                     w = line[:-1].split(" ")
                     word.append(w[0])
                     tags1.append(w[1])
                     tags2.append(w[2])
                     tags3.append(w[3])
-                    tags4.append(w[4])
-    return words,tags_1,tags_2,tags_3,tags_4
\ No newline at end of file
+    return words, tags_1, tags_2, tags_3
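A minimal usage sketch of the new loader, for context on its return shape. The dataset path and import path are assumptions for illustration, not files fixed by this change:

```python
# Illustrative sketch only; path and import are assumed, not part of the change.
from utils import load_data  # crf_baseline/code/utils.py

# tags_4 is gone, so exactly four values come back: one list per sentence,
# each entry a list of tokens or tags.
words, tags_1, tags_2, tags_3 = load_data("dataset/train.txt")

sentence = words[0]   # e.g. ['L', '.', 'Antonielli', ',', 'Iprefetti', ...]
labels = tags_1[0]    # e.g. ['author', 'author', 'author', 'author', 'title', ...]
assert len(sentence) == len(labels)  # tags stay aligned token-by-token
```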
+ + """ + + # Arrays to return words = [] tags_1 = [] tags_2 = [] tags_3 = [] - tags_4 = [] - word = tags1 = tags2 = tags3 = tags4 = [] - with open (file, "r") as file: + word = tags1 = tags2 = tags3 = [] + with open (filepath, "r") as file: for line in file: if 'DOCSTART' not in line: #Do not take the first line into consideration # Check if empty line @@ -31,14 +57,12 @@ def load_data(file): tags_1.append(tags1) tags_2.append(tags2) tags_3.append(tags3) - tags_4.append(tags4) # Reset word = [] tags1 = [] tags2 = [] tags3 = [] - tags4 = [] else: # Split the line into words, tag #1, tag #2, tag #3 @@ -47,6 +71,43 @@ def load_data(file): tags1.append(w[1]) tags2.append(w[2]) tags3.append(w[3]) - tags4.append(w[4]) - return words,tags_1,tags_2,tags_3,tags_4 \ No newline at end of file + return words,tags_1,tags_2,tags_3 + +#def load_data(file): +# words = [] +# tags_1 = [] +# tags_2 = [] +# tags_3 = [] +# tags_4 = [] +# +# word = tags1 = tags2 = tags3 = tags4 = [] +# with open (file, "r") as file: +# for line in file: +# if 'DOCSTART' not in line: #Do not take the first line into consideration +# # Check if empty line +# if line in ['\n', '\r\n']: +# # Append line +# words.append(word) +# tags_1.append(tags1) +# tags_2.append(tags2) +# tags_3.append(tags3) +# tags_4.append(tags4) +# +# # Reset +# word = [] +# tags1 = [] +# tags2 = [] +# tags3 = [] +# tags4 = [] +# +# else: +# # Split the line into words, tag #1, tag #2, tag #3 +# w = line[:-1].split(" ") +# word.append(w[0]) +# tags1.append(w[1]) +# tags2.append(w[2]) +# tags3.append(w[3]) +# tags4.append(w[4]) +# +# return words,tags_1,tags_2,tags_3,tags_4 diff --git a/crf_baseline/main_finetune.py b/crf_baseline/main_finetune.py index 4b35ebf..16179b9 100644 --- a/crf_baseline/main_finetune.py +++ b/crf_baseline/main_finetune.py @@ -12,8 +12,8 @@ import sklearn_crfsuite from sklearn_crfsuite import scorers, metrics from sklearn.metrics import make_scorer, confusion_matrix -from sklearn.externals import joblib from sklearn.model_selection import RandomizedSearchCV +import joblib # For model validation import scipy diff --git a/crf_baseline/main_threeTasks.py b/crf_baseline/main_threeTasks.py index aad29d3..802fed3 100644 --- a/crf_baseline/main_threeTasks.py +++ b/crf_baseline/main_threeTasks.py @@ -9,7 +9,7 @@ import sklearn_crfsuite from sklearn_crfsuite import scorers, metrics from sklearn.metrics import make_scorer, confusion_matrix -from sklearn.externals import joblib +import joblib # Utils functions diff --git a/crf_baseline/models/.gitkeep b/crf_baseline/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/crf_baseline/requirements.txt b/crf_baseline/requirements.txt new file mode 100644 index 0000000..600222f --- /dev/null +++ b/crf_baseline/requirements.txt @@ -0,0 +1,15 @@ +cycler==0.10.0 +joblib==0.13.2 +kiwisolver==1.1.0 +matplotlib==3.1.1 +numpy==1.17.0 +pyparsing==2.4.2 +python-crfsuite==0.9.6 +python-dateutil==2.8.0 +scikit-learn==0.21.3 +scipy==1.3.1 +six==1.12.0 +sklearn==0.0 +sklearn-crfsuite==0.3.6 +tabulate==0.8.3 +tqdm==4.33.0 diff --git a/crf_baseline/results/.gitkeep b/crf_baseline/results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/keras/README.md b/keras/README.md index 053c849..c593408 100644 --- a/keras/README.md +++ b/keras/README.md @@ -18,10 +18,14 @@ The results will be stored into the *model_results* folder, with one directory c * [main_threeTasks](main_threeTasks.py) python script to train one NN model for each task. 
diff --git a/keras/README.md b/keras/README.md
index 053c849..c593408 100644
--- a/keras/README.md
+++ b/keras/README.md
@@ -18,10 +18,14 @@ The results will be stored into the *model_results* folder, with one directory c
 * [main_threeTasks](main_threeTasks.py) python script to train one NN model for each task.
 
 ## Dependencies
-* Keras : version 2.1.1
-* TensorFlow: 1.4.0
-* Numpy: 1.13.3
-* [Keras contrib](https://github.com/keras-team/keras-contrib) Keras contrib : 0.0.2
-* Sklearn : 0.19.1
-* [Sklearn crfsuite](https://sklearn-crfsuite.readthedocs.io/en/latest/index.html) Sklearn crfsuite : 0.3.6
-* Python 3.5
+
+* Python 3.7
+
+See [requirements.txt](./requirements.txt) for a complete list of dependencies.
+
+For setup, first create a new virtual environment using your favoured method, then install dependencies with:
+
+```
+pip3 install -r requirements.txt
+```
diff --git a/keras/code/utils.py b/keras/code/utils.py
index 3965e00..bd30526 100644
--- a/keras/code/utils.py
+++ b/keras/code/utils.py
@@ -378,8 +378,8 @@ def on_train_begin(self, logs={}):
         self.params['metrics'].append("val_f1")
 
         # In case of multiple outputs
-        if len(self.model.output_layers) > 1:
-            for output_layer in self.model.output_layers:
-                self.params['metrics'].append("val_"+output_layer.name+"_f1")
+        if len(self.model.outputs) > 1:
+            for output_name in self.model.output_names:
+                self.params['metrics'].append("val_"+output_name+"_f1")
@@ -403,8 +403,8 @@ def compute_epoch_training_F1(self):
         """
         Compute and save the F1 score for the training data
         """
-        in_length = len(self.model.input_layers)
-        out_length = len(self.model.output_layers)
+        in_length = len(self.model.inputs)
+        out_length = len(self.model.outputs)
         predictions = self.model.predict(self.train_data[0])
         if len(predictions) != out_length:
             predictions = [predictions]
@@ -464,8 +464,8 @@ def on_epoch_end(self, epoch, logs={}):
         Same model's weights for the best epoch.
         """
         self.compute_epoch_training_F1()
-        in_length = len(self.model.input_layers) # X data - to predict from
-        out_length = len(self.model.output_layers) # Number of tasks
+        in_length = len(self.model.inputs) # X data - to predict from
+        out_length = len(self.model.outputs) # Number of tasks
 
         # Compute the model predictions
         predictions = self.model.predict(self.validation_data[:in_length])
@@ -493,7 +493,7 @@ def on_epoch_end(self, epoch, logs={}):
             vals_f1.append(_val_f1)
 
             # Add F1 score to be log
-            f1_name = "val_"+self.model.output_layers[i].name+"_f1"
+            f1_name = "val_"+self.model.output_names[i]+"_f1"
             logs[f1_name] = _val_f1
diff --git a/keras/requirements.txt b/keras/requirements.txt
new file mode 100644
index 0000000..0375b86
--- /dev/null
+++ b/keras/requirements.txt
@@ -0,0 +1,36 @@
+absl-py==0.7.1
+astor==0.8.0
+bleach==1.5.0
+cycler==0.10.0
+gast==0.2.2
+google-pasta==0.1.7
+grpcio==1.22.0
+h5py==2.9.0
+html5lib==0.9999999
+joblib==0.13.2
+Keras==2.2.4
+Keras-Applications==1.0.8
+keras-contrib==2.0.8
+Keras-Preprocessing==1.1.0
+kiwisolver==1.1.0
+Markdown==3.1.1
+matplotlib==3.1.1
+numpy==1.17.0
+protobuf==3.9.1
+pyparsing==2.4.2
+python-crfsuite==0.9.6
+python-dateutil==2.8.0
+PyYAML==5.1.2
+scikit-learn==0.21.3
+scipy==1.3.0
+six==1.12.0
+sklearn-crfsuite==0.3.6
+tabulate==0.8.3
+tensorboard==1.14.0
+tensorflow==1.14.0
+tensorflow-estimator==1.14.0
+tensorflow-tensorboard==1.5.1
+termcolor==1.1.0
+tqdm==4.32.2
+Werkzeug==0.15.5
+wrapt==1.11.2
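The utils.py changes above drop the removed `output_layers` attribute in favour of the public `model.inputs`, `model.outputs` and `model.output_names`, which in Keras 2.2.4 give the callback the input/output counts and the per-task names it needs. A minimal sketch on a toy multi-output model (layer names and sizes are illustrative):

```python
from keras.layers import Dense, Input
from keras.models import Model

# Toy multi-task model: one shared input, one softmax head per task.
inp = Input(shape=(16,), name="tokens")
shared = Dense(32, activation="relu")(inp)
task_a = Dense(5, activation="softmax", name="task_a")(shared)
task_b = Dense(3, activation="softmax", name="task_b")(shared)

model = Model(inputs=inp, outputs=[task_a, task_b])

print(len(model.inputs))    # 1  -> number of X arrays to predict from
print(len(model.outputs))   # 2  -> number of tasks
print(model.output_names)   # ['task_a', 'task_b'] -> used to build "val_<name>_f1"
```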