Skip to content

Commit cdb6bd6

Browse files
brianna-dardin, potatoeggy, and ariana-paris
authored
OD-1836 Convert Automated Archive output to working schema (#85)
Co-authored-by: eggy <[email protected]>
Co-authored-by: Ariana <[email protected]>
1 parent dd5b73c commit cdb6bd6

File tree

6 files changed

+111
-51
lines changed

6 files changed

+111
-51
lines changed

.github/workflows/python-app-macos-windows.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
runs-on: ${{ matrix.os }}
1616
strategy:
1717
matrix:
18-
os: [ macos-latest, windows-latest ]
18+
os: [ macos-13, windows-latest ] # Using macos-13 since macos-latest no longer supports 3.8
1919

2020
steps:
2121
- uses: actions/checkout@v2

02b-Extract-Tags-From-Stories.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
args.temp_db_database
2323
)
2424
)
25-
tags.create_tags_table()
2625

2726
tag_col_list = {}
2827
stories_id_name = ""

automated_archive/aa.py

Lines changed: 68 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# -- coding: utf-8 --
22

3-
import datetime
3+
from datetime import datetime
44
import codecs
55
import re
6-
import os
7-
from html.parser import HTMLParser
6+
import html
7+
import urllib.request
88

99
from pymysql import connect
1010

@@ -22,11 +22,15 @@ def _clean_file(filepath, log):
2222
:param filepath: Path to ARCHIVE_DB.pl
2323
:return: Python dictionary keyed by original story id
2424
"""
25-
h = HTMLParser()
26-
archive_db = codecs.open(filepath, "r", encoding="utf-8").read()
25+
encoding = input(
26+
'Encoding for the ARCHIVE_DB.pl file, e.g. "utf-8", "latin_1", "cp1252" (default: "utf-8"): '
27+
)
28+
if encoding is None or encoding == "":
29+
encoding = "utf-8"
30+
archive_db = codecs.open(filepath, "r", encoding=encoding).read()
2731

2832
# Manually escape single quote entity and reformat file as a Python dictionary
29-
step1 = h.unescape(archive_db.replace("&#39;", "\\&#39;"))
33+
step1 = html.unescape(archive_db.replace("&#39;", "\\&#39;"))
3034

3135
# Indent the file with a single tab instead of whatever is currently used
3236
step15 = re.sub(r"^\s+", "\t", step1)
@@ -122,8 +126,32 @@ def _extract_fandoms(args, record):
122126
return tags.strip(", ")
123127

124128

129+
def _extract_date(args, record, log):
130+
date_string = record.get(
131+
"PrintTime",
132+
record.get(
133+
"DatePrint",
134+
record.get("Date", str(datetime.now().strftime("%m/%d/%y"))),
135+
),
136+
)
137+
138+
dt = None
139+
try:
140+
# If the date is in the form of a Unix timestamp
141+
if date_string.isdigit():
142+
dt = datetime.fromtimestamp(int(date_string))
143+
else:
144+
dt = datetime.strptime(date_string, "%m/%d/%y")
145+
except Exception as e:
146+
log.error(
147+
f"Failed to parse date value '{date_string}' due to exception: {str(e)}"
148+
)
149+
150+
return dt.strftime("%Y-%m-%d") if dt else ""
151+
152+
125153
def _create_mysql(args, FILES, log):
126-
db = connect(args.db_host, args.db_user, args.db_password, "")
154+
db = connect(host=args.db_host, user=args.db_user, password=args.db_password, db="")
127155
cursor = db.cursor()
128156
DATABASE_NAME = args.temp_db_database
129157

@@ -132,12 +160,13 @@ def _create_mysql(args, FILES, log):
132160
cursor.execute("create database {0};".format(DATABASE_NAME))
133161
cursor.execute("use {0}".format(DATABASE_NAME))
134162

135-
sql = Sql(args)
136-
codepath = os.path.dirname(os.path.realpath(__file__))
163+
# Instead of duplicating this file in the repo grab it from the master branch of eFiction
164+
url = "https://raw.githubusercontent.com/otwcode/open-doors-eFiction/refs/heads/master/opendoors/open-doors-tables-working.sql"
165+
with urllib.request.urlopen(url) as response:
166+
script = response.read().decode()
137167

138-
sql.run_script_from_file(
139-
codepath + "/shared_python/create-open-doors-tables.sql", database=DATABASE_NAME
140-
)
168+
sql = Sql(args, log)
169+
sql.run_sql_file(script, database=DATABASE_NAME)
141170
db.commit()
142171

143172
authors = [
@@ -164,26 +193,17 @@ def _create_mysql(args, FILES, log):
164193
FILES[i].get("Summary", "").replace("'", "\\'"),
165194
_extract_tags(args, FILES[i]),
166195
_extract_characters(args, FILES[i]),
167-
datetime.datetime.strptime(
168-
FILES[i].get(
169-
"PrintTime",
170-
FILES[i].get(
171-
"DatePrint",
172-
FILES[i].get(
173-
"Date", str(datetime.datetime.now().strftime("%m/%d/%y"))
174-
),
175-
),
176-
),
177-
"%m/%d/%y",
178-
).strftime("%Y-%m-%d"),
196+
_extract_date(args, FILES[i], log),
179197
FILES[i].get("Location", "").replace("'", "\\'"),
180198
FILES[i]
181199
.get("LocationURL", FILES[i].get("StoryURL", ""))
182200
.replace("'", "\\'"),
183201
FILES[i].get("Notes", "").replace("'", "\\'"),
184202
_extract_relationships(args, FILES[i]),
185203
FILES[i].get("Rating", ""),
186-
FILES[i].get("Warnings", "").replace("'", "\\'"),
204+
FILES[i]
205+
.get("Warnings", FILES[i].get("OptionalWarnings", ""))
206+
.replace("'", "\\'"),
187207
FILES[i].get("Author", "").strip(),
188208
FILES[i].get("Email", FILES[i].get("EmailAuthor", "")).lower().strip(),
189209
FILES[i].get("FileType", args.chapters_file_extensions)
@@ -196,6 +216,7 @@ def _create_mysql(args, FILES, log):
196216

197217
cur = 0
198218
total = len(FILES)
219+
item_dict = {}
199220
for (
200221
original_id,
201222
title,
@@ -225,7 +246,7 @@ def _create_mysql(args, FILES, log):
225246
table_name = "stories"
226247
else:
227248
filename = url
228-
table_name = "bookmarks"
249+
table_name = "story_links"
229250

230251
# Clean up fandoms and add default fandom if it exists
231252
final_fandoms = fandoms.replace("'", r"\'")
@@ -241,10 +262,14 @@ def _create_mysql(args, FILES, log):
241262
if element[1] == author and element[2] == email
242263
]
243264
authorid = result[0][0]
265+
item_dict[original_id] = {
266+
"authorid": authorid,
267+
"itemtype": "story_link" if table_name == "story_links" else "story",
268+
}
244269

245270
stor = """
246-
INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings, author_id)
247-
VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}');\n""".format(
271+
INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings)
272+
VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}');\n""".format(
248273
table_name,
249274
original_id,
250275
final_fandoms.replace(r"\\", "\\"),
@@ -258,7 +283,6 @@ def _create_mysql(args, FILES, log):
258283
pairings,
259284
rating,
260285
warnings,
261-
authorid,
262286
)
263287
cursor.execute(stor)
264288
except:
@@ -285,6 +309,21 @@ def _create_mysql(args, FILES, log):
285309
raise
286310
db.commit()
287311

312+
for itemid, item_info in item_dict.items():
313+
try:
314+
item_auth = """
315+
INSERT INTO item_authors (author_id, item_id, item_type)
316+
VALUES({0}, {1}, '{2}');\n""".format(
317+
item_info["authorid"], itemid, item_info["itemtype"]
318+
)
319+
cursor.execute(item_auth)
320+
except:
321+
log.error(
322+
f"Failed to insert item_authors for {item_info['itemtype']} {itemid} with author {item_info['authorid']}"
323+
)
324+
raise
325+
db.commit()
326+
288327

289328
def clean_and_load_data(args, log):
290329
data = _clean_file(args.db_input_file, log)

shared_python/Chapters.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,8 @@ def _gather_and_dedupe(self, chapters_path, extensions, has_ids=False):
8181
for cid, duplicate in duplicate_chapters.items():
8282
# look up the author id and add that one to the file_names list
8383
sql_author_id = self.sql.execute_and_fetchall(
84-
"SELECT author_id FROM chapters WHERE id = {0}".format(cid)
84+
self.sql.database,
85+
"SELECT author_id FROM chapters WHERE id = {0}".format(cid),
8586
)
8687
if len(sql_author_id) > 0:
8788
author_id = sql_author_id[0][0]
@@ -142,6 +143,8 @@ def populate_chapters(self, folder=None, extensions=None):
142143
else:
143144
for _, chapter_path in file_paths.items():
144145
path = chapter_path.replace(self.args.chapters_path, "")[1:]
146+
if os.sep == "\\": # if this script is run on windows
147+
path = path.replace("\\", "/")
145148
with codecs.open(chapter_path, "r", encoding=char_encoding) as c:
146149
try:
147150
cur = Common.print_progress(cur, total)

shared_python/Sql.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ def run_script_from_file(self, filename, database, initial_load=False):
5858
fd = open(filename, "r")
5959
sqlFile = fd.read()
6060
fd.close()
61+
self.run_sql_file(sqlFile, database, initial_load)
6162

63+
def run_sql_file(self, sqlFile, database, initial_load=False):
6264
# replace placeholders and return all SQL commands (split on ';')
6365
sqlCommands = sqlFile.replace("$DATABASE$", database).split(";\n")
6466

shared_python/Tags.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
from collections import defaultdict
23
from html.parser import HTMLParser
34
from logging import Logger
45

@@ -83,8 +84,9 @@ def populate_tag_table(
8384
)
8485
)
8586

87+
tags_to_insert = {}
88+
tags_to_story_ids = defaultdict(list)
8689
for story_tags_row in data:
87-
values = []
8890
for col in tag_columns:
8991
needs_fandom = col in tags_with_fandoms
9092
if story_tags_row[col] is not None:
@@ -93,27 +95,42 @@ def populate_tag_table(
9395
if isinstance(
9496
tag_col_lookup[col], str
9597
): # Probably AA or a custom archive
96-
cleaned_tag = (
97-
val.encode("utf-8").replace("'", "'").strip()
98+
cleaned_tag = re.sub(
99+
r'(?<!\\)"',
100+
'\\"',
101+
val.replace("'", "'").strip(),
98102
)
99-
100-
values.append(
101-
'({0}, "{1}", "{2}", "{3}")'.format(
102-
story_tags_row[story_id_col_name],
103-
re.sub(r'(?<!\\)"', '\\"', cleaned_tag),
104-
tag_col_lookup[col],
105-
story_tags_row["fandoms"]
106-
if needs_fandom
107-
else "",
108-
)
103+
tags_to_story_ids[cleaned_tag].append(
104+
story_tags_row[story_id_col_name]
105+
)
106+
tags_to_insert[
107+
cleaned_tag
108+
] = '("{0}", "{1}", "{2}")'.format(
109+
cleaned_tag,
110+
tag_col_lookup[col],
111+
story_tags_row["fandoms"] if needs_fandom else "",
109112
)
110113

111-
if len(values) > 0:
112-
self.sql.execute(
113-
"""
114-
INSERT INTO tags (storyid, original_tag, original_table, ao3_tag_fandom) VALUES {0}
115-
""".format(", ".join(values))
116-
)
114+
if len(tags_to_insert) > 0:
115+
self.sql.execute(
116+
"""
117+
INSERT INTO tags (original_tag, original_type, ao3_tag_fandom) VALUES {0}
118+
""".format(", ".join(tags_to_insert.values()))
119+
)
120+
121+
tag_data = self.sql.execute_dict("SELECT id, original_tag FROM tags")
122+
for tag_row in tag_data:
123+
story_ids = set(tags_to_story_ids[tag_row["original_tag"]])
124+
for story_id in story_ids:
125+
self.sql.execute(
126+
"""
127+
INSERT INTO item_tags (item_id, item_type, tag_id) VALUES ({0}, "{1}", {2})
128+
""".format(
129+
story_id,
130+
"story_link" if table_name == "story_links" else "story",
131+
tag_row["id"],
132+
)
133+
)
117134

118135
def distinct_tags(self, database):
119136
"""

0 commit comments

Comments (0)