PyConfluenceToWikijs/pages.py at main · icl-rocketry/PyConfluenceToWikijs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import string
import os
from datetime import datetime
from shutil import copytree
from bs4 import BeautifulSoup

class Page:
    '''
    Class to store and process a single wiki page.

    Contains all its data, and operations on the page.
    This includes reading a page into the instance, and exporting it to the
    correct files.

    Attributes set on construction:
        path      -- directory part of the source file path
        filename  -- file name part of the source file path
        title     -- page title with the wiki-name prefix stripped
        location  -- breadcrumb link targets (extension stripped), root first
        id        -- last 9 characters of the source file name (sans extension)
    '''

    def __init__(self, path, wiki_name):
        '''
        Read the exported page at `path`.

        `wiki_name` is the name of the wiki; it is used to strip the
        "<wiki name> : " prefix from the page title.
        '''
        self.path = os.path.split(path)[0]
        self.filename = os.path.split(path)[1]
        self._import_from_file(path, wiki_name)

    def convert_filename(self):
        '''
        Function used to generate the new file name used for export

        Returns the new filename, built from the page title.
        '''
        # Generate a nice file name from the title
        return self._format_filename(str(self.title))

    def convert_location(self, filename_dict):
        '''
        Convert the folder names within the location list to the new names,
        to keep it consistent.

        `filename_dict` maps old folder names to converted ones. Folders
        without an entry are kept under their old name. The first entry of
        `self.location` is skipped, as it is the root.

        Returns the converted location as a new list.
        '''
        return [filename_dict.get(folder, folder) for folder in self.location[1:]]

    def export(self, new_location, destination_folder, path_dict):
        '''
        Function to get the content of the page, and export it in the correct
        form and location within the destination folder.

        Takes three arguments: the local path within the hierarchy, the
        destination folder, and the dictionary containing the mapping between
        old and new files. The latter is used to update links.
        Creates folders as necessary within the destination folder such as to
        emulate the previous hierarchy, and copies the page's attachments
        (if any) next to the exported page.
        '''
        # Function-scope import: the module only imports `datetime` itself.
        from datetime import timezone

        page_path = os.path.join(destination_folder, new_location)
        new_dir = os.path.split(page_path)[0]

        # Create all necessary folders. os.makedirs("") raises, so skip it
        # when the page lands directly in the current directory.
        if new_dir:
            os.makedirs(new_dir, exist_ok=True)

        # The trailing "Z" in the format denotes UTC, so the timestamp has to
        # be taken in UTC rather than in local time.
        dt_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
        today = datetime.now().strftime("%Y-%m-%d")

        # Convert the content before touching the output file, correcting
        # links and attachments for the new placement. Links are made relative
        # to the directory the page is exported into.
        contents = self._convert_contents(path_dict, os.path.split(new_location)[0])

        # The context manager closes the file even if a write fails.
        with open(page_path, "w", encoding="utf-8") as page_file:
            # Write new metadata and title
            page_file.write(
f"""<!--
title: {self.title}
description:
published: true
date: {dt_str}
tags:
editor: ckeditor
dateCreated: {dt_str}
-->


<h1><strong>{self.title}</strong></h1>

""")

            # Write content, separated from the footer by a newline
            page_file.write(contents + "\n")

            # Write footer
            page_file.write(
f"""<p><span class=\"text-tiny\"><i>Autogenerated on {today} using <a href=\"https://github.com/icl-rocketry/PyConfluenceToWikijs\">1337 h4xx0r 5k111z</a>, from ye olde Confluence Wiki</i></span></p>
<p><span class=\"text-tiny\"><i>Old Metadata: {self._metadata}</i></span></p>
<p><span class=\"text-tiny\"><i>ID: {self.id}</i></span></p>
""")

        # Copy media files to a new media folder
        media_origin = os.path.join(self.path, "attachments", self.id)
        if os.path.isdir(media_origin): # Check if there are any attachments
            media_folder = os.path.join(new_dir, "attachments", self.id)
            copytree(media_origin, media_folder, dirs_exist_ok=True)

        print(f"Exported {self.title} to {new_location}")

    def _import_from_file(self, filename, wiki_name):
        '''
        Function that imports the data from the page's file into the instance,
        by parsing the html.

        Sets `title`, `location`, `_content`, `_metadata` and `id`.
        '''
        with open(filename, "r", encoding="utf-8") as file:
            file_data = file.read()

        html_data = BeautifulSoup(file_data, "html.parser")

        # Title is prepended with the name of the wiki, plus a " : "; need to strip it
        title_chars_to_strip = len(wiki_name)+3
        self.title = html_data.title.string[title_chars_to_strip:]

        # Location within the hierarchy stored as list of names, going from root down
        self.location = []

        # Parse the breadcrumbs section for the correct hierarchy
        hierarchy_section = html_data.find("ol", id="breadcrumbs")
        for item in hierarchy_section.find_all("li"):
            link_text = item.find("a").get("href")
            self.location.append(os.path.splitext(link_text)[0]) # Strip the .html extension

        self._content = html_data.find(
            "div", class_="wiki-content group",
            id="main-content"
            )

        self._metadata = html_data.find("div", class_ = "page-metadata").get_text().strip()

        # Get only the 9 digit ID at the end of the filename before the extension
        self.id = os.path.splitext(filename)[0][-9:]

    def _convert_contents(self, link_dictionary, curr_dir):
        '''
        Fix contents and links such that they work within the new wiki.

        Rewrites the embedded-file links inside `self._content` in place,
        and returns the content as a string in which every occurrence of a
        key of `link_dictionary` is replaced by the path of the corresponding
        value relative to `curr_dir` (the directory the page is exported to).
        '''

        rel_media_folder = os.path.join("attachments", self.id)

        # Find all the hyperlinks with class "confluence-embedded-file"
        for embedded_file_link in self._content.find_all("a", class_ = "confluence-embedded-file"):
            # Get information about the file from the link
            file_id = embedded_file_link.get("data-linked-resource-id")
            file_alias = embedded_file_link.get("data-linked-resource-default-alias")

            # Without both attributes the new target cannot be derived;
            # leave such a link untouched instead of aborting the export.
            if file_id is None or file_alias is None:
                continue

            file_ext = os.path.splitext(file_alias)[1]
            new_location = os.path.join(rel_media_folder, file_id+file_ext)

            # Update the link
            embedded_file_link.clear()
            embedded_file_link.string = file_alias
            embedded_file_link["href"] = new_location

            print(f"Updated link to {file_alias}")

        # An empty directory means "the export root"; spell it out for relpath.
        start_dir = curr_dir or os.curdir

        # Convert links between pages. This is a plain substring replacement,
        # so every occurrence of a key in the markup is rewritten.
        converted_string = str(self._content.prettify())
        for old_link, new_link in link_dictionary.items():
            converted_string = converted_string.replace(
                old_link, os.path.relpath(new_link, start_dir)
                )

        # Remove the styling from unordered lists such that it plays nicely with wiki.js
        converted_string = converted_string.replace(
            '<ul style="list-style-type: square;">', '<ul>'
            )

        return converted_string

    @staticmethod
    def _format_filename(unformatted_filename):
        '''
        Take a string and return a valid filename constructed from the string.
        Uses a whitelist approach: any characters not present in valid_chars are
        removed. Also spaces are replaced with underscores.

        Blatantly stolen from this gist: https://gist.github.com/seanh/93666
        '''
        valid_chars = f"-_() {string.ascii_letters}{string.digits}"
        filename = "".join(c for c in unformatted_filename if c in valid_chars)
        filename = filename.replace(" ","_") # I don't like spaces in filenames.
        return filename