-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathepad-saver.py
More file actions
143 lines (114 loc) · 4.71 KB
/
epad-saver.py
File metadata and controls
143 lines (114 loc) · 4.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/python3
"""
This script downloads the content of etherpads.
Padfile layout definition:
"[TITLE](BASEURL)"
"""
import argparse # Parse CLI arguments
import re # extract pad title from baseurl
from requests import get # download pad content
from datetime import datetime # get timestamp for filename
import os # get filenames for duplicity check
# Default messages for new documents; Used to detect empty pads.
# Matched case-sensitively with str.startswith() in contentIsEmpty(),
# so entries must reproduce the provider's default text exactly.
welcomeMessages = [
    "WELCOME TO RISEUP'S ETHERPAD!"  # Riseup
]
def getEntryTitle(entry):
    """Extract the pad title from a padfile line of the form "[TITLE](BASEURL)".

    Raises:
        Exception: if the line contains no markdown-style link.
    """
    # Raw string: '\[' in a plain string is an invalid escape sequence
    # (SyntaxWarning since Python 3.12, error in the future).
    result = re.search(r'\[(.*)\]\(.*\)', entry)
    if result:
        return result.group(1)
    raise Exception("Unable to extract pad title from padfile")
def getEntryURL(entry):
    """Extract the pad base URL from a padfile line of the form "[TITLE](BASEURL)".

    Raises:
        Exception: if the line contains no markdown-style link.
    """
    # Raw string: '\[' in a plain string is an invalid escape sequence
    # (SyntaxWarning since Python 3.12, error in the future).
    result = re.search(r'\[.*\]\((.*)\)', entry)
    if result:
        return result.group(1)
    raise Exception("Unable to extract pad url from padfile")
def getTitleFromBaseUrl(baseurl):
    """Extract the pad title (the last path segment) from a base URL.

    Raises:
        Exception: if the URL has no path segment to use as a title.
    """
    # 's?' instead of '[s]*' (which would also accept 'httpss://');
    # '([^/]+)/?$' pins the last NON-EMPTY path segment, so a trailing
    # slash no longer yields an empty title as the old '(.*)/*' did.
    result = re.search(r'https?://.*/([^/]+)/?$', baseurl)
    if result:
        return result.group(1)
    raise Exception("Unable to extract pad title from baseurl")
def getFileName(workingdir, title):
    """Generate the file name for a downloaded pad dump.

    A timestamp is appended so repeated dumps never overwrite an
    existing file. strftime zero-pads every field (the old manual
    str() concatenation produced e.g. '2024-1-5', which does not
    sort chronologically as a string).
    """
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    return os.path.join(workingdir, title + "-" + timestamp + ".txt")
def baseURLcleanup(baseurl):
    """Normalize a pad base URL by dropping every trailing slash."""
    while baseurl.endswith("/"):
        baseurl = baseurl[:-1]
    return baseurl
def contentIsEmpty(content, welcomeMessages):
    """Return True if the downloaded pad is empty or still holds a
    provider's default welcome text, False otherwise.

    Args:
        content: raw response body (UTF-8 encoded bytes).
        welcomeMessages: known default texts that mark an unused pad.
    """
    text = content.decode('utf-8')
    if text == "":
        return True
    for message in welcomeMessages:
        if text.startswith(message):
            return True
    # Explicit False instead of the old implicit None fall-through.
    return False
def contentChanged(content, workingdir):
    """Return True unless an existing dump in workingdir matches content exactly.

    Args:
        content: raw response body (UTF-8 encoded bytes).
        workingdir: directory holding previously saved pad dumps.
    """
    text = content.decode('utf-8')  # decode once, not once per file
    for filename in os.listdir(workingdir):
        path = os.path.join(workingdir, filename)
        # listdir also yields subdirectories; open() on them would crash
        if not os.path.isfile(path):
            continue
        # 'with' guarantees the handle is closed (the original leaked it)
        with open(path, encoding='utf-8') as existingFile:
            if text == existingFile.read():
                return False
    return True
# ---- Command-line interface ----
parser = argparse.ArgumentParser(description='Save your Etherpads')
parser.add_argument('-b', '--baseurl', dest='baseurl', help='BaseURL of the pad you like to save')
parser.add_argument('-t', '--title', dest='title', help='Title of the pad')
parser.add_argument('-f', '--padfile', dest='padfile', help='Path to a file containing a list of etherpads')
parser.add_argument('-w', '--workingdir', dest='workingdir', default=".", help='Path to the directory the pad(s) are saved into; Default: .')
parser.add_argument('-s', '--no-duplicate-check', dest='duplicateCheck', action='store_false', help="turn off the duplicate check")
# Sample file generation not implemented yet
# parser.add_argument('-g', '--generate-padfile', dest='genPadFile', help='Generate sample pad file to given destination')
args = parser.parse_args()
workingdir = os.path.abspath(args.workingdir)
duplicateCheck = args.duplicateCheck

# ---- Collect [baseurl, title] pairs to download ----
urls = []
# A single pad given directly on the command line
if args.baseurl:
    baseurl = baseURLcleanup(args.baseurl)
    title = args.title if args.title else getTitleFromBaseUrl(baseurl)
    urls.append([baseurl, title])
# Additional pads listed in a padfile, one "[TITLE](BASEURL)" per line
if args.padfile:
    with open(args.padfile) as f:
        padfileContent = f.readlines()
    for entry in padfileContent:
        # skip lines which contain no links
        # (raw string: '\[' is an invalid escape in a plain string,
        # a SyntaxWarning since Python 3.12)
        if not re.search(r'\[.*\]\(.*\)', entry):
            continue
        title = getEntryTitle(entry)
        baseurl = baseURLcleanup(getEntryURL(entry))
        urls.append([baseurl, title])

# ---- Download and save each pad ----
for url, title in urls:
    print("Working on " + url)
    # Etherpad exposes a plain-text export under /export/txt
    url = url + "/export/txt"
    # timeout so one unreachable pad cannot hang the whole run forever
    content = get(url, timeout=30).content
    # check if downloaded pad is empty
    if contentIsEmpty(content, welcomeMessages):
        print(" Empty pad detected! Skipping...")
        continue
    # check if content has not changed since an earlier dump
    if duplicateCheck and not contentChanged(content, workingdir):
        print(" Content has not changed! Skipping...")
        continue
    # save downloaded content ("wb": bytes are written as received)
    filename = getFileName(workingdir, title)
    with open(filename, "wb") as saveFile:
        saveFile.write(content)
    print(" Pad saved under " + filename)