Skip to content

Commit cdb6bd6

Browse files
brianna-dardin, potatoeggy, and ariana-paris
authored
OD-1836 Convert Automated Archive output to working schema (#85)
Co-authored-by: eggy <[email protected]>
Co-authored-by: Ariana <[email protected]>
1 parent dd5b73c commit cdb6bd6

File tree

6 files changed

+111
-51
lines changed

6 files changed

+111
-51
lines changed

.github/workflows/python-app-macos-windows.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
runs-on: ${{ matrix.os }}
1616
strategy:
1717
matrix:
18-
os: [ macos-latest, windows-latest ]
18+
os: [ macos-13, windows-latest ] # Using macos-13 since macos-latest no longer supports 3.8
1919

2020
steps:
2121
- uses: actions/checkout@v2

02b-Extract-Tags-From-Stories.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
args.temp_db_database
2323
)
2424
)
25-
tags.create_tags_table()
2625

2726
tag_col_list = {}
2827
stories_id_name = ""

automated_archive/aa.py

Lines changed: 68 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# -- coding: utf-8 --
22

3-
import datetime
3+
from datetime import datetime
44
import codecs
55
import re
6-
import os
7-
from html.parser import HTMLParser
6+
import html
7+
import urllib.request
88

99
from pymysql import connect
1010

@@ -22,11 +22,15 @@ def _clean_file(filepath, log):
2222
:param filepath: Path to ARCHIVE_DB.pl
2323
:return: Python dictionary keyed by original story id
2424
"""
25-
h = HTMLParser()
26-
archive_db = codecs.open(filepath, "r", encoding="utf-8").read()
25+
encoding = input(
26+
'Encoding for the ARCHIVE_DB.pl file, e.g. "utf-8", "latin_1", "cp1252" (default: "utf-8"): '
27+
)
28+
if encoding is None or encoding == "":
29+
encoding = "utf-8"
30+
archive_db = codecs.open(filepath, "r", encoding=encoding).read()
2731

2832
# Manually escape single quote entity and reformat file as a Python dictionary
29-
step1 = h.unescape(archive_db.replace("&#39;", "\\&#39;"))
33+
step1 = html.unescape(archive_db.replace("&#39;", "\\&#39;"))
3034

3135
# Indent the file with a single tab instead of whatever is currently used
3236
step15 = re.sub(r"^\s+", "\t", step1)
@@ -122,8 +126,32 @@ def _extract_fandoms(args, record):
122126
return tags.strip(", ")
123127

124128

129+
def _extract_date(args, record, log):
130+
date_string = record.get(
131+
"PrintTime",
132+
record.get(
133+
"DatePrint",
134+
record.get("Date", str(datetime.now().strftime("%m/%d/%y"))),
135+
),
136+
)
137+
138+
dt = None
139+
try:
140+
# If the date is in the form of a Unix timestamp
141+
if date_string.isdigit():
142+
dt = datetime.fromtimestamp(int(date_string))
143+
else:
144+
dt = datetime.strptime(date_string, "%m/%d/%y")
145+
except Exception as e:
146+
log.error(
147+
f"Failed to parse date value '{date_string}' due to exception: {str(e)}"
148+
)
149+
150+
return dt.strftime("%Y-%m-%d") if dt else ""
151+
152+
125153
def _create_mysql(args, FILES, log):
126-
db = connect(args.db_host, args.db_user, args.db_password, "")
154+
db = connect(host=args.db_host, user=args.db_user, password=args.db_password, db="")
127155
cursor = db.cursor()
128156
DATABASE_NAME = args.temp_db_database
129157

@@ -132,12 +160,13 @@ def _create_mysql(args, FILES, log):
132160
cursor.execute("create database {0};".format(DATABASE_NAME))
133161
cursor.execute("use {0}".format(DATABASE_NAME))
134162

135-
sql = Sql(args)
136-
codepath = os.path.dirname(os.path.realpath(__file__))
163+
# Instead of duplicating this file in the repo grab it from the master branch of eFiction
164+
url = "https://raw.githubusercontent.com/otwcode/open-doors-eFiction/refs/heads/master/opendoors/open-doors-tables-working.sql"
165+
with urllib.request.urlopen(url) as response:
166+
script = response.read().decode()
137167

138-
sql.run_script_from_file(
139-
codepath + "/shared_python/create-open-doors-tables.sql", database=DATABASE_NAME
140-
)
168+
sql = Sql(args, log)
169+
sql.run_sql_file(script, database=DATABASE_NAME)
141170
db.commit()
142171

143172
authors = [
@@ -164,26 +193,17 @@ def _create_mysql(args, FILES, log):
164193
FILES[i].get("Summary", "").replace("'", "\\'"),
165194
_extract_tags(args, FILES[i]),
166195
_extract_characters(args, FILES[i]),
167-
datetime.datetime.strptime(
168-
FILES[i].get(
169-
"PrintTime",
170-
FILES[i].get(
171-
"DatePrint",
172-
FILES[i].get(
173-
"Date", str(datetime.datetime.now().strftime("%m/%d/%y"))
174-
),
175-
),
176-
),
177-
"%m/%d/%y",
178-
).strftime("%Y-%m-%d"),
196+
_extract_date(args, FILES[i], log),
179197
FILES[i].get("Location", "").replace("'", "\\'"),
180198
FILES[i]
181199
.get("LocationURL", FILES[i].get("StoryURL", ""))
182200
.replace("'", "\\'"),
183201
FILES[i].get("Notes", "").replace("'", "\\'"),
184202
_extract_relationships(args, FILES[i]),
185203
FILES[i].get("Rating", ""),
186-
FILES[i].get("Warnings", "").replace("'", "\\'"),
204+
FILES[i]
205+
.get("Warnings", FILES[i].get("OptionalWarnings", ""))
206+
.replace("'", "\\'"),
187207
FILES[i].get("Author", "").strip(),
188208
FILES[i].get("Email", FILES[i].get("EmailAuthor", "")).lower().strip(),
189209
FILES[i].get("FileType", args.chapters_file_extensions)
@@ -196,6 +216,7 @@ def _create_mysql(args, FILES, log):
196216

197217
cur = 0
198218
total = len(FILES)
219+
item_dict = {}
199220
for (
200221
original_id,
201222
title,
@@ -225,7 +246,7 @@ def _create_mysql(args, FILES, log):
225246
table_name = "stories"
226247
else:
227248
filename = url
228-
table_name = "bookmarks"
249+
table_name = "story_links"
229250

230251
# Clean up fandoms and add default fandom if it exists
231252
final_fandoms = fandoms.replace("'", r"\'")
@@ -241,10 +262,14 @@ def _create_mysql(args, FILES, log):
241262
if element[1] == author and element[2] == email
242263
]
243264
authorid = result[0][0]
265+
item_dict[original_id] = {
266+
"authorid": authorid,
267+
"itemtype": "story_link" if table_name == "story_links" else "story",
268+
}
244269

245270
stor = """
246-
INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings, author_id)
247-
VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}');\n""".format(
271+
INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings)
272+
VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}');\n""".format(
248273
table_name,
249274
original_id,
250275
final_fandoms.replace(r"\\", "\\"),
@@ -258,7 +283,6 @@ def _create_mysql(args, FILES, log):
258283
pairings,
259284
rating,
260285
warnings,
261-
authorid,
262286
)
263287
cursor.execute(stor)
264288
except:
@@ -285,6 +309,21 @@ def _create_mysql(args, FILES, log):
285309
raise
286310
db.commit()
287311

312+
for itemid, item_info in item_dict.items():
313+
try:
314+
item_auth = """
315+
INSERT INTO item_authors (author_id, item_id, item_type)
316+
VALUES({0}, {1}, '{2}');\n""".format(
317+
item_info["authorid"], itemid, item_info["itemtype"]
318+
)
319+
cursor.execute(item_auth)
320+
except:
321+
log.error(
322+
f"Failed to insert item_authors for {item_info['itemtype']} {itemid} with author {item_info['authorid']}"
323+
)
324+
raise
325+
db.commit()
326+
288327

289328
def clean_and_load_data(args, log):
290329
data = _clean_file(args.db_input_file, log)

shared_python/Chapters.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,8 @@ def _gather_and_dedupe(self, chapters_path, extensions, has_ids=False):
8181
for cid, duplicate in duplicate_chapters.items():
8282
# look up the author id and add that one to the file_names list
8383
sql_author_id = self.sql.execute_and_fetchall(
84-
"SELECT author_id FROM chapters WHERE id = {0}".format(cid)
84+
self.sql.database,
85+
"SELECT author_id FROM chapters WHERE id = {0}".format(cid),
8586
)
8687
if len(sql_author_id) > 0:
8788
author_id = sql_author_id[0][0]
@@ -142,6 +143,8 @@ def populate_chapters(self, folder=None, extensions=None):
142143
else:
143144
for _, chapter_path in file_paths.items():
144145
path = chapter_path.replace(self.args.chapters_path, "")[1:]
146+
if os.sep == "\\": # if this script is run on windows
147+
path = path.replace("\\", "/")
145148
with codecs.open(chapter_path, "r", encoding=char_encoding) as c:
146149
try:
147150
cur = Common.print_progress(cur, total)

shared_python/Sql.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ def run_script_from_file(self, filename, database, initial_load=False):
5858
fd = open(filename, "r")
5959
sqlFile = fd.read()
6060
fd.close()
61+
self.run_sql_file(sqlFile, database, initial_load)
6162

63+
def run_sql_file(self, sqlFile, database, initial_load=False):
6264
# replace placeholders and return all SQL commands (split on ';')
6365
sqlCommands = sqlFile.replace("$DATABASE$", database).split(";\n")
6466

shared_python/Tags.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
from collections import defaultdict
23
from html.parser import HTMLParser
34
from logging import Logger
45

@@ -83,8 +84,9 @@ def populate_tag_table(
8384
)
8485
)
8586

87+
tags_to_insert = {}
88+
tags_to_story_ids = defaultdict(list)
8689
for story_tags_row in data:
87-
values = []
8890
for col in tag_columns:
8991
needs_fandom = col in tags_with_fandoms
9092
if story_tags_row[col] is not None:
@@ -93,27 +95,42 @@ def populate_tag_table(
9395
if isinstance(
9496
tag_col_lookup[col], str
9597
): # Probably AA or a custom archive
96-
cleaned_tag = (
97-
val.encode("utf-8").replace("'", "'").strip()
98+
cleaned_tag = re.sub(
99+
r'(?<!\\)"',
100+
'\\"',
101+
val.replace("'", "'").strip(),
98102
)
99-
100-
values.append(
101-
'({0}, "{1}", "{2}", "{3}")'.format(
102-
story_tags_row[story_id_col_name],
103-
re.sub(r'(?<!\\)"', '\\"', cleaned_tag),
104-
tag_col_lookup[col],
105-
story_tags_row["fandoms"]
106-
if needs_fandom
107-
else "",
108-
)
103+
tags_to_story_ids[cleaned_tag].append(
104+
story_tags_row[story_id_col_name]
105+
)
106+
tags_to_insert[
107+
cleaned_tag
108+
] = '("{0}", "{1}", "{2}")'.format(
109+
cleaned_tag,
110+
tag_col_lookup[col],
111+
story_tags_row["fandoms"] if needs_fandom else "",
109112
)
110113

111-
if len(values) > 0:
112-
self.sql.execute(
113-
"""
114-
INSERT INTO tags (storyid, original_tag, original_table, ao3_tag_fandom) VALUES {0}
115-
""".format(", ".join(values))
116-
)
114+
if len(tags_to_insert) > 0:
115+
self.sql.execute(
116+
"""
117+
INSERT INTO tags (original_tag, original_type, ao3_tag_fandom) VALUES {0}
118+
""".format(", ".join(tags_to_insert.values()))
119+
)
120+
121+
tag_data = self.sql.execute_dict("SELECT id, original_tag FROM tags")
122+
for tag_row in tag_data:
123+
story_ids = set(tags_to_story_ids[tag_row["original_tag"]])
124+
for story_id in story_ids:
125+
self.sql.execute(
126+
"""
127+
INSERT INTO item_tags (item_id, item_type, tag_id) VALUES ({0}, "{1}", {2})
128+
""".format(
129+
story_id,
130+
"story_link" if table_name == "story_links" else "story",
131+
tag_row["id"],
132+
)
133+
)
117134

118135
def distinct_tags(self, database):
119136
"""

0 commit comments

Comments (0)