
Commit 3aed092

Merge pull request #4 from zbMATHOpen/feat/add_by_file
Feat/add by file
2 parents: 5b82d36 + 8dab05f

8 files changed: +86 additions, -32 deletions

README.md

Lines changed: 25 additions & 18 deletions

````diff
@@ -1,6 +1,6 @@
 ## Update package for the zbMATH Links API
 
-The purpose of this package is to populate and update the database used by another package produced at [zbMATH](https://zbmath.org/), namely the zbMATH Links API `zbmath-links-api`, available [here](https://github.com/zbMATHOpen/linksApi).
+The purpose of this package is to populate and update the database used by another package produced at [zbMATH](https://zbmath.org/), namely the zbMATH Links API `zbmath-links-api`, available [here](https://github.com/zbMATHOpen/linksApi).
 The usage of the present package is mainly described in the README file of the `zbmath-links-api` package.
 
 Here we provide some simple instructions to install and use this package.
@@ -14,8 +14,8 @@ On a first install:
 pip install -e .
 ```
 
-This will install the package, `update-zblinks-api`, in the [virtual environment](https://docs.python.org/3/tutorial/venv.html).
-
+This will install the package, `update-zblinks-api`, in the [virtual environment](https://docs.python.org/3/tutorial/venv.html).
+
 
 2) Fill in the `config_template.ini` and save it as `config.ini`.
 
@@ -27,47 +27,54 @@ On a first install:
 (iii) The API-KEY is the one used by the API package `zbmath-links-api`.
 
 
-3) The package has two entry points:
+3) The package has three entry points:
 
 (i) To scrape all zbMATH partners (i.e., to obtain all their links) and update the database used by the package `zbmath-links-api`, use the command
 
 ```
 update-api
 ```
-
-This will automatically add new links, delete links that no longer exist, and edit links that have been modified.
-
-**Remark 1.** The present version of the package works with the [Digital Library of Mathematical Functions](https://dlmf.nist.gov/) (DLMF) as zbMATH partner.
+
+This will automatically add new links, delete links that no longer exist, and edit links that have been modified.
+
+**Remark 1.** The present version of the package works with the [Digital Library of Mathematical Functions](https://dlmf.nist.gov/) (DLMF) as zbMATH partner.
 Therefore, one can use the command
-
+
 ```
 update-api -p DLMF
 ```
-
+
 to update the DLMF dataset managed by `zbmath-links-api`.
 In the near future, some scraping scripts for other partners will be integrated into this package, and the command
-
+
 ```
 update-api
 ```
-
+
 will do an automatic update of all links managed by `zbmath-links-api` for all partners.
 
 **Remark 2.** To generate CSV files which can later be used to update the database manually (without writing to the database directly), use the command
-
+
 ```
 update-api --file
 ```
-
-This creates three CSV files: `new_links.csv`, `to_edit.csv`, `delete.csv` with the obvious contents, contained in the `update_zblinks_api/results` folder.
+
+This creates three CSV files in the `update_zblinks_api/results` folder: `{partner}_new_links.csv`, `{partner}_to_edit.csv`, and `{partner}_delete_links.csv`, containing the links to add, edit, and delete, respectively.
 
 (ii) Use the command
 
 ```
-csv-initial -p DLMF
+csv-initial -p <partner>
 ```
-
-to create two csv files with real DLMF data up to the year 2020: `DLMF_deids_table_init.csv` (to be inserted into the table `document_external_ids`) and `DLMF_source_table_init.csv` (to be inserted into the table `source`).
+
+to create two csv files with real historical partner data: `{partner}_deids_table_init.csv` (to be inserted into the table `document_external_ids`) and `{partner}_source_table_init.csv` (to be inserted into the table `source`).
 These files are contained in the `update_zblinks_api/results` folder.
 
+(iii) Use the command
+
+```
+csv-to-db
+```
+
+to read the csv files produced by `update-api --file` and write their contents to the database.
````
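
Taken together, these README changes describe a two-step, file-mediated workflow: `update-api --file` writes per-partner CSVs into `results/`, and `csv-to-db` later replays them against the database. Below is a minimal sketch of that round trip for the new-links file, assuming pandas; the column names match the dataframes in `update_with_api.py` further down, but the row values are placeholders.

```python
import os

import pandas as pd

os.makedirs("results", exist_ok=True)
partner = "dlmf"

# Step 1 -- what `update-api --file` does for each partner: dump the pending
# link changes to per-partner CSVs instead of posting them to the API.
df_new = pd.DataFrame({
    "document": ["<zbl-document-code>"],   # placeholder value
    "external_id": ["<partner-link-id>"],  # placeholder value
    "title": ["<link title>"],             # placeholder value
})
df_new.to_csv(f"results/{partner}_new_links.csv", index=False)

# Step 2 -- what `csv-to-db` does: read the same file back and hand each row
# to the API helpers (post_request in the real package; print stands in here).
for _, row in pd.read_csv(f"results/{partner}_new_links.csv").fillna("").iterrows():
    print("would add link:", row["document"], "->", row["external_id"])
```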

results/README.txt

Lines changed: 1 addition & 1 deletion

````diff
@@ -1,3 +1,3 @@
 If the --file option is chosen, the results will be stored here.
 Three files will be created: "delete_links.csv", "new_links.csv", and "to_edit.csv"
-with the obvious contents in each file.
+with the obvious contents in each file.
````

setup.cfg

Lines changed: 1 addition & 0 deletions

````diff
@@ -31,6 +31,7 @@ where = src
 console_scripts =
     update-api = update_zblinks_api.update_with_api:update
     csv-initial = update_zblinks_api.matrix_table_datasets:create_matrix_table_datasets
+    csv-to-db = update_zblinks_api.update_with_api:use_files_to_update
 
 [pycodestyle]
 max-line-length = 79
````
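
The added `console_scripts` entry means `csv-to-db` is only a thin wrapper around one importable function; roughly (a sketch of the wiring, not of setuptools internals):

```python
# Approximately what running `csv-to-db` amounts to:
from update_zblinks_api.update_with_api import use_files_to_update

if __name__ == "__main__":
    use_files_to_update()
```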

src/update_zblinks_api/__init__.py

Lines changed: 2 additions & 0 deletions

````diff
@@ -3,6 +3,8 @@
 
 # tuple of all partners for zblinks API
 partners = ("DLMF",)
+partners = tuple(p.lower() for p in partners)
+
 
 config = configparser.ConfigParser()
 config.read("config.ini")
````
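
The added line normalizes the canonical partner names to lowercase at import time, which is what the `"dlmf"` comparisons introduced elsewhere in this commit rely on. In isolation:

```python
# Same normalization as above, shown standalone:
partners = ("DLMF",)
partners = tuple(p.lower() for p in partners)

assert partners == ("dlmf",)  # downstream code can now compare lowercase names
```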

src/update_zblinks_api/dlmf_scraping/historical/scrape_dlmf_historical.py

Lines changed: 3 additions & 3 deletions

````diff
@@ -76,12 +76,12 @@ def create_source_table_dataset(df_hist):
     df_hist = df_hist.rename(columns={"external_id": "id"})
 
     df_hist["url"] = "https://dlmf.nist.gov/" + df_hist["id"]
-    df_hist["partner"] = "DLMF"
+    df_hist["partner"] = "dlmf"
 
     df_hist["id_scheme"] = "DLMF scheme"
     df_hist["type"] = "DLMF bibliographic entry"
 
-    df_hist = df_hist.drop_duplicates()
+    df_hist = df_hist.drop_duplicates(subset=["id"])
 
     column_order = ["id", "id_scheme", "type", "url", "title", "partner"]
     df_hist = df_hist.reindex(columns=column_order)
@@ -106,7 +106,7 @@ def get_df_dlmf_initial():
         columns=(["document", "external_id", "date", "title"]))
     for year in range(2008, 2021):
         df_scrape = get_df_dlmf(year)
-        df_new, df_edit, df_delete = separate_links("DLMF", df_main, df_scrape)
+        df_new, df_edit, df_delete = separate_links("dlmf", df_main, df_scrape)
         df_new["date"] = year
         df_main = pd.concat([df_main, df_new]).drop_duplicates(keep=False)
````
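
The move to `drop_duplicates(subset=["id"])` is not cosmetic: two scraped rows can share an `id` while differing in another column such as `title`, and a plain `drop_duplicates()` keeps both. A small illustration with made-up rows:

```python
import pandas as pd

df = pd.DataFrame({
    "id": ["<dlmf-id>", "<dlmf-id>"],     # same id twice (placeholder value)
    "title": ["old title", "new title"],  # ...but different titles
})

print(len(df.drop_duplicates()))               # 2: the rows are not identical
print(len(df.drop_duplicates(subset=["id"])))  # 1: one source row per id
```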

src/update_zblinks_api/helpers/source_helpers.py

Lines changed: 2 additions & 2 deletions

````diff
@@ -75,7 +75,7 @@ def remove_lonely_sources(this_partner):
         LEFT OUTER JOIN document_external_ids
         ON src.id = document_external_ids.external_id
         AND src.partner = document_external_ids.type
-        WHERE partner = %(partner_arg)s
+        WHERE src.partner = %(partner_arg)s
         AND document_external_ids.external_id IS NULL
     """
 
@@ -92,7 +92,7 @@ def remove_lonely_sources(this_partner):
         AND partner = %(partner_arg)s
     """
 
-    data = {"id_list": lonely_id_tuple, "partner_arg": "DLMF"}
+    data = {"id_list": lonely_id_tuple, "partner_arg": this_partner}
 
     with connection.cursor() as cursor:
         cursor.execute(delete_request, data)
````
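
Both statements in this function use named `%(...)s` placeholders with a dict of values, and the id list travels as a Python tuple. The sketch below shows that calling convention only, assuming a psycopg2 connection; the diff does not show the full DELETE statement, so the DSN, table, and ids here are all illustrative.

```python
import psycopg2  # assumption: PostgreSQL accessed via psycopg2

connection = psycopg2.connect("dbname=zb_links")  # illustrative DSN

delete_request = """
    DELETE FROM source
    WHERE id IN %(id_list)s
    AND partner = %(partner_arg)s
"""

# psycopg2 adapts a Python tuple to a parenthesised list, so IN %(id_list)s
# expands to IN ('<id-1>', '<id-2>') at execution time.
data = {"id_list": ("<id-1>", "<id-2>"), "partner_arg": "dlmf"}

with connection.cursor() as cursor:
    cursor.execute(delete_request, data)
connection.commit()
```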

src/update_zblinks_api/matrix_table_datasets.py

Lines changed: 4 additions & 3 deletions

````diff
@@ -26,7 +26,7 @@ def create_deids_table_dataset(partner, df_hist):
     Parameters
     ----------
     partner : str
-        partner from which the initial datasets are to come.
+        partner (in lowercase) from which the initial datasets are to come.
     df_hist : dataframe
         contains columns: "document" (or "zbl_code"), "external_id",
         "date" (as int year).
@@ -35,7 +35,7 @@ def create_deids_table_dataset(partner, df_hist):
 
     df_hist = df_hist.rename(columns={"date": "matched_at"})
 
-    df_hist["type"] = partner.lower()
+    df_hist["type"] = partner
 
     df_hist["matched_at"] = pd.to_datetime(df_hist["matched_at"], format="%Y")
     df_hist["matched_at"] = (
@@ -75,8 +75,9 @@ def create_matrix_table_datasets(partner):
         partner from which the initial datasets are to come.
 
     """
+    partner = partner.lower()
 
     # this also creates the initial dataset for the zb_links.source table
-    df_init_partner = hist_scrape_dict[partner.lower()]()
+    df_init_partner = hist_scrape_dict[partner]()
 
     create_deids_table_dataset(partner, df_init_partner)
````
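
For context on the `matched_at` lines kept in this hunk: the historical date arrives as a bare year and is widened to a timestamp with the `format="%Y"` parse. Standalone, with made-up years:

```python
import pandas as pd

df_hist = pd.DataFrame({"matched_at": ["2008", "2020"]})  # made-up years
df_hist["matched_at"] = pd.to_datetime(df_hist["matched_at"], format="%Y")

print(df_hist["matched_at"].tolist())
# [Timestamp('2008-01-01 00:00:00'), Timestamp('2020-01-01 00:00:00')]
```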

src/update_zblinks_api/update_with_api.py

Lines changed: 48 additions & 5 deletions

````diff
@@ -153,6 +153,8 @@ def separate_links(partner, df_ext_partner, df_scrape):
         those entries from df_ext_partner which are to be deleted.
 
     """
+    partner = partner.lower()
+
     df_edit = pd.DataFrame(
         columns=(["document", "external_id", "title", "previous_ext_id"])
     )
@@ -169,7 +171,7 @@ def separate_links(partner, df_ext_partner, df_scrape):
     ).drop_duplicates(subset=["document", "external_id"], keep=False)
 
     # to update:
-    if partner == "DLMF":
+    if partner == "dlmf":
         df_new, df_edit, df_delete = dlmf_helpers.update(
             df_ext_partner, df_new, df_delete
         )
@@ -223,7 +225,7 @@ def scrape(partner):
 @click.option(
     "--file", is_flag=True,
     help="Use this option to write the data to csv files"
-    " instead of writing to the matrix"
+    " instead of writing to the matrix; "
     "new_links.csv, to_edit.csv, delete_links.csv will be created"
 )
 def update(file):
@@ -245,9 +247,9 @@ def update(file):
     )
 
     if file:
-        df_new.to_csv("results/new_links.csv", index=False)
-        df_edit.to_csv("results/to_edit.csv", index=False)
-        df_delete.to_csv("results/delete_links.csv", index=False)
+        df_new.to_csv(f"results/{partner}_new_links.csv", index=False)
+        df_edit.to_csv(f"results/{partner}_to_edit.csv", index=False)
+        df_delete.to_csv(f"results/{partner}_delete_links.csv", index=False)
     else:
         df_new = df_new.fillna("")
         df_edit = df_edit.fillna("")
@@ -263,3 +265,44 @@ def update(file):
             delete_request(row, partner)
 
     source_helpers.remove_lonely_sources(partner)
+
+
+def use_files_to_update():
+    """
+    For each partner, inserts the data from the csv files
+    {partner}_new_links.csv, {partner}_to_edit.csv, and
+    {partner}_delete_links.csv into the database.
+    These files need to be located in the results folder.
+
+    Notes
+    -----
+    Files which are not found in the results folder are
+    skipped with an error message.
+
+
+    """
+    for partner in partners:
+        insert_file = f"results/{partner}_new_links.csv"
+        try:
+            df_insert = pd.read_csv(insert_file)
+            df_insert = df_insert.fillna("")
+            for _, row in df_insert.iterrows():
+                post_request(row, partner)
+        except FileNotFoundError:
+            click.echo(f"Error: could not find {insert_file}.")
+
+        try:
+            edit_file = f"results/{partner}_to_edit.csv"
+            df_edit = pd.read_csv(edit_file)
+            for _, row in df_edit.iterrows():
+                update_request(row, partner)
+        except FileNotFoundError:
+            click.echo(f"Error: could not find {edit_file}.")
+
+        try:
+            delete_file = f"results/{partner}_delete_links.csv"
+            df_delete = pd.read_csv(delete_file)
+            for _, row in df_delete.iterrows():
+                delete_request(row, partner)
+        except FileNotFoundError:
+            click.echo(f"Error: could not find {delete_file}.")
````
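
The three try/except blocks in `use_files_to_update` repeat one pattern; the sketch below is a compact equivalent, shown only to make the per-file flow explicit. `post_request`, `update_request`, and `delete_request` are the package's own helpers; note the original fills NaNs only in the new-links frame, while this version does so uniformly.

```python
import click
import pandas as pd

def apply_csv(path, request_fn, partner):
    """Feed every row of one results CSV to the matching request helper."""
    try:
        for _, row in pd.read_csv(path).fillna("").iterrows():
            request_fn(row, partner)
    except FileNotFoundError:
        click.echo(f"Error: could not find {path}.")

# usage, mirroring use_files_to_update:
# for partner in partners:
#     apply_csv(f"results/{partner}_new_links.csv", post_request, partner)
#     apply_csv(f"results/{partner}_to_edit.csv", update_request, partner)
#     apply_csv(f"results/{partner}_delete_links.csv", delete_request, partner)
```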
