-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpages.py
More file actions
193 lines (151 loc) · 7.13 KB
/
pages.py
File metadata and controls
193 lines (151 loc) · 7.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import string
import os
from datetime import datetime
from shutil import copytree
from bs4 import BeautifulSoup
class Page:
    '''
    Store and process a single exported Confluence wiki page.

    An instance is built from one exported HTML file. It keeps the page title,
    its position in the page hierarchy, its main content and its old metadata,
    and can write itself out as a Wiki.js-style HTML file (with attachments)
    inside a destination folder.

    Attributes set on construction:
        path      -- directory containing the source file
        filename  -- name of the source file within ``path``
        title     -- page title with the wiki-name prefix removed
        location  -- breadcrumb link targets (extension removed), root first
        id        -- last nine characters of the source file's stem
    '''

    # Whitelist of characters allowed to survive in an exported filename.
    _VALID_FILENAME_CHARS = frozenset(
        f"-_() {string.ascii_letters}{string.digits}"
    )

    def __init__(self, path: str, wiki_name: str):
        '''
        Read the page stored at ``path``.

        ``wiki_name`` is the name of the wiki; it is used to strip the
        "<wiki name> : " prefix from the HTML title.
        '''
        self.path, self.filename = os.path.split(path)
        self._import_from_file(path, wiki_name)

    def convert_filename(self) -> str:
        '''
        Generate the file name used for export from the page title.

        Returns the new filename (without any extension).
        '''
        return self._format_filename(str(self.title))

    def convert_location(self, filename_dict: dict) -> list:
        '''
        Translate the folder names in ``self.location`` to their new names.

        ``filename_dict`` maps old names to converted names. Entries of the
        location that have no converted counterpart are kept unchanged.
        The first entry of the location is skipped, as it is the root.
        '''
        return [
            filename_dict.get(folder, folder) for folder in self.location[1:]
        ]

    def export(self, new_location: str, destination_folder: str, path_dict: dict):
        '''
        Write the page, in its converted form, into the destination folder.

        Arguments:
            new_location        -- path of the new file relative to
                                   ``destination_folder``
            destination_folder  -- root folder of the export
            path_dict           -- mapping between old and new file paths,
                                   used to update links between pages

        Folders are created as necessary so that the previous hierarchy is
        reproduced. If the page has an attachment folder next to its source
        file, it is copied next to the exported page.
        '''
        # Function-scope import: the module only imports ``datetime`` itself.
        from datetime import timezone

        page_path = os.path.join(destination_folder, new_location)
        new_dir = os.path.dirname(page_path)
        os.makedirs(new_dir, exist_ok=True)

        # The "Z" suffix denotes UTC, so the timestamp must be taken in UTC
        # rather than in local time.
        dt_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
        today = datetime.now().strftime("%Y-%m-%d")

        # Metadata block and title.
        header = f"""<!--
title: {self.title}
description:
published: true
date: {dt_str}
tags:
editor: ckeditor
dateCreated: {dt_str}
-->
<h1><strong>{self.title}</strong></h1>
"""

        # Links are made relative to the directory the page ends up in. The
        # trailing newline belongs to the written content, not to the
        # directory name handed to the link conversion.
        curr_dir = os.path.split(new_location)[0]
        content = self._convert_contents(path_dict, curr_dir) + "\n"

        footer = f"""<p><span class="text-tiny"><i>Autogenerated on {today} using <a href="https://github.com/icl-rocketry/PyConfluenceToWikijs">1337 h4xx0r 5k111z</a>, from ye olde Confluence Wiki</i></span></p>
<p><span class="text-tiny"><i>Old Metadata: {self._metadata}</i></span></p>
<p><span class="text-tiny"><i>ID: {self.id}</i></span></p>
"""

        # The context manager closes the file even if a write fails.
        with open(page_path, "w", encoding="utf-8") as page_file:
            page_file.write(header)
            page_file.write(content)
            page_file.write(footer)

        # Copy media files to a new attachments folder, if the page has any.
        media_origin = os.path.join(self.path, "attachments", self.id)
        if os.path.isdir(media_origin):
            media_folder = os.path.join(new_dir, "attachments", self.id)
            copytree(media_origin, media_folder, dirs_exist_ok=True)

        print(f"Exported {self.title} to {new_location}")

    def _import_from_file(self, filename: str, wiki_name: str):
        '''
        Load the page data from its HTML file into this instance.
        '''
        with open(filename, "r", encoding="utf-8") as source_file:
            file_data = source_file.read()
        html_data = BeautifulSoup(file_data, "html.parser")

        # Title is prepended with the name of the wiki, plus a " : ";
        # strip that many characters from the front.
        title_chars_to_strip = len(wiki_name) + 3
        self.title = html_data.title.string[title_chars_to_strip:]

        # Location within the hierarchy, stored as a list of names going from
        # the root down, parsed from the breadcrumbs section.
        hierarchy_section = html_data.find("ol", id="breadcrumbs")
        self.location = [
            # Strip the extension (.html) from each breadcrumb link target.
            os.path.splitext(item.find("a").get("href"))[0]
            for item in hierarchy_section.find_all("li")
        ]

        self._content = html_data.find(
            "div", class_="wiki-content group", id="main-content"
        )
        self._metadata = html_data.find(
            "div", class_="page-metadata"
        ).get_text().strip()

        # Keep only the 9 characters at the end of the filename before the
        # extension; the exporter puts the page ID there.
        self.id = os.path.splitext(filename)[0][-9:]

    def _convert_contents(self, link_dictionary: dict, curr_dir: str) -> str:
        '''
        Fix contents and links such that they work within the new wiki.

        Embedded-file links are rewritten in place on ``self._content`` to
        point into the page's attachments folder. The returned string is the
        prettified content in which every key of ``link_dictionary`` has been
        replaced by its new path, expressed relative to ``curr_dir``.
        '''
        rel_media_folder = os.path.join("attachments", self.id)

        # Find all the hyperlinks with class "confluence-embedded-file".
        for embedded_file_link in self._content.find_all(
            "a", class_="confluence-embedded-file"
        ):
            file_id = embedded_file_link.get("data-linked-resource-id")
            file_alias = embedded_file_link.get(
                "data-linked-resource-default-alias"
            )
            if file_id is None or file_alias is None:
                # Without both attributes the new target cannot be built;
                # leave the link untouched instead of failing the whole page.
                continue

            file_ext = os.path.splitext(file_alias)[1]
            new_location = os.path.join(rel_media_folder, file_id + file_ext)

            # Replace the link body with the file's alias and retarget it.
            embedded_file_link.clear()
            embedded_file_link.string = file_alias
            embedded_file_link["href"] = new_location
            print(f"Updated link to {file_alias}")

        # Convert links between pages.
        converted_string = str(self._content.prettify())
        for old_path, new_path in link_dictionary.items():
            converted_string = converted_string.replace(
                old_path, os.path.relpath(new_path, curr_dir)
            )

        # Remove the styling from unordered lists such that it plays nicely
        # with wiki.js.
        return converted_string.replace(
            '<ul style="list-style-type: square;">', '<ul>'
        )

    @staticmethod
    def _format_filename(unformatted_filename: str) -> str:
        '''
        Take a string and return a valid filename constructed from the string.

        Uses a whitelist approach: any character that is not an ASCII letter,
        a digit, or one of "-_() " is removed. Remaining spaces are then
        replaced with underscores.
        Based on this gist: https://gist.github.com/seanh/93666
        '''
        kept = "".join(
            c for c in unformatted_filename if c in Page._VALID_FILENAME_CHARS
        )
        return kept.replace(" ", "_")