-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcanvasfs.py
More file actions
executable file
·499 lines (419 loc) · 19.8 KB
/
canvasfs.py
File metadata and controls
executable file
·499 lines (419 loc) · 19.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
#!/usr/bin/env python3
"""
Fuse filesystem for exploring assignments and handins.
Metadata for a given level in a hierarchy is included as .meta files (json format).
Files are downloaded from Canvas when opened in the filesystem. Cached files are stored in CACHE_DIR (default .cache).
Zip files / archives
------
ZipFiles will be automatically mounted as '<pathname>.unp' once they are cached locally. This means that
once you try to read a zip file, it will be available as an unpacked directory locally.
The reason for not providing any 'unzip' directory before downloading the file is that this could cause
accidental download of all zip files if a 'find' or another tool tried to traverse the unzip directories.
NB: canvasfs now uses libarchive-c to unpack various archive formats.
- https://github.com/Changaco/python-libarchive-c
- ubuntu: sudo apt install python3-libarchive-c
TODO
----
- clean up class hierarchy a bit to make the zip files less kludgy.
- safer handling of file types for detecting zip files.
- possibility of using 'touch' or some other method for downloading an assignment without reading the files?
- configurable cache directory.
- rm on a file: remove from cache.
We may not need to remove zip files that are already mounted from memory even if we remove the file from the cache directory.
- or maybe just "unload" ?
- files may not need to be loaded or scanned until somebody descends into the .unp directory.
- could also use rmdir on .unp directory to remove unpacked directory. Might need a functools.cache variant with
weakref in that case.
- archives inside archives. (students that submit a tarball inside a zip because canvas refuses to accept tarballs)
"""
import logging
from errno import ENOENT
from stat import S_IFDIR, S_IFREG
from time import time
from pathlib import Path
from collections import defaultdict
import functools
import datetime
import os
import io
import json
import urllib.request
from fuse import FUSE, FuseOSError, Operations, LoggingMixIn
import zipfile
import libarchive
# Directory where downloaded Canvas files are stored (overridable with -c/--cache at startup).
CACHE_DIR = ".cache"
DEBUG = False
# LOG_LEVEL = logging.DEBUG
LOG_LEVEL = logging.ERROR
# Get running user's uid/gid and reuse it for the filesystem
# (all entries in the mounted fs appear owned by the mounting user).
fs_uid = os.getuid()
fs_gid = os.getgid()
def filter_dict(d, remove_keys):
    """Return a shallow copy of *d* without the keys listed in *remove_keys*."""
    excluded = set(remove_keys)
    return {key: value for key, value in d.items() if key not in excluded}
@functools.cache
def ddmcache(file):
    """Used to dedup (cache and re-use) files from archives.
    Sometimes students submit as groups, submit the same archives multiple files (resubmissions)
    or many students include the same files (pre-code, documentation, assignment information etc).
    """
    # Identity function: functools.cache keys on the (hashable) argument value,
    # so equal payloads passed in repeatedly all resolve to the first stored
    # object, letting duplicate archive contents share a single bytes object.
    # NOTE: the cache is unbounded and keeps every unique payload alive for the
    # lifetime of the process (intentional memory-vs-speed trade-off here).
    return file
def merge_paths(p1, p2):
    """Join *p1* and *p2* into the single path string "p1/p2".

    Guards against the pathlib behaviour where Path("a") / Path("/b") yields
    Path("/b") (documented, but not what we want here).  Mainly protects
    against tar/zip members whose names start with "./" or "/".
    """
    base = Path(p1)
    member = Path(p2)  # Path() normalisation already drops "./" prefixes.
    # Strip leading root or parent-directory components from the member path.
    while member.parts and member.parts[0] in ("/", ".."):
        member = Path(*member.parts[1:])
    if not member.parts:
        print(f"WARNING: merge of '{p1}' and '{p2}' stripped down second part to nothing")
    return str(base / member)
class Entry:
    """A regular-file node in the mounted filesystem.

    Wraps a Canvas metadata dict (`cont`) and lazily downloads the file
    contents into CACHE_DIR on first read.  Subclasses specialise directories,
    in-memory files, metadata files and archives.
    """
    def __init__(self, pathname, cont, time_entry=None):
        """cont : dict with
        - timestamp in cont[time_entry],
        - necessary for files:
          - id,
          - url
          - size.
        time_entry = entry name in cont for picking up the entry timestamp
        """
        self.cont = cont
        self.pathname = pathname
        p = Path(pathname)
        self.parent = str(p.parent)
        self.fname = str(p.name)
        # Prefer the ISO-8601 "Z" timestamp named by time_entry; otherwise fall
        # back to a pre-computed epoch value in '_time', and finally to 0.
        if (dts := cont.get(time_entry, None)) is not None:
            dt = datetime.datetime.strptime(dts, '%Y-%m-%dT%H:%M:%SZ')
            self.time = dt.timestamp()
        else:
            self.time = cont.get('_time', 0)
        self.size = self.cont.get('size', 0)

    # Not all entries (like ZipEntry files) will have fid and url, so compute them at runtime
    @property
    def fid(self):
        """Canvas file id; raises KeyError for entries without one."""
        return self.cont['id']

    @property
    def url(self):
        """Download URL; raises KeyError for entries without one."""
        return self.cont['url']

    def _cache_path(self):
        """Path of the locally cached copy of this file."""
        return f"{CACHE_DIR}/{self.fid}"

    def _is_cached(self):
        return os.path.exists(self._cache_path())

    def _open_file(self):
        """Opens the locally cached file. Downloads the file first if it is not in the cache already.
        Returns the opened file.

        Raises RuntimeError if the download does not return HTTP 200.
        """
        cpath = self._cache_path()
        if not self._is_cached():
            # Need to fetch the file first.  Use a context manager so the HTTP
            # response is always closed (the original leaked the response
            # object, and never closed it on the error path).
            with urllib.request.urlopen(self.url) as r:
                if r.status != 200:
                    # Log at ERROR so the failure is visible at the default log level.
                    logging.error(f"TODO: check results from reading file {self.fid} {self.url} {r.status}")
                    raise RuntimeError("Could not get file")
                data = r.read()
            with open(cpath, 'wb') as f:
                f.write(data)
        return open(cpath, 'rb')

    def read(self, size, offset):
        """Reads a chunk from a file (potentially downloading and cacheing the file if necessary)."""
        with self._open_file() as f:
            f.seek(offset)
            return f.read(size)

    def getattr(self):
        """Return the stat dict for a read-only regular file."""
        return dict(st_mode=(S_IFREG | 0o444),
                    st_size=self.size,
                    st_blocks=(self.size + 511) // 512,  # For du etc.
                    st_uid=fs_uid,
                    st_gid=fs_gid,
                    st_ctime=self.time,
                    st_mtime=self.time,
                    st_atime=self.time)

    def __repr__(self):
        return f'<{self.__class__.__name__}: {self.pathname}>'
class DirEntry(Entry):
    """A directory node.  Reuses Entry's timestamp/path handling.

    Note: the original defined a redundant __init__ that only forwarded its
    arguments to super(); it is simply inherited now.
    """
    def getattr(self):
        """Return the stat dict for a read-only directory."""
        return dict(st_mode=(S_IFDIR | 0o555),
                    st_nlink=2,
                    st_uid=fs_uid,
                    st_gid=fs_gid,
                    st_ctime=self.time,
                    st_mtime=self.time,
                    st_atime=self.time)
class MetaEntry(Entry):
    """Human-readable metadata: a prettified .json snapshot exposed as the
    '.meta' file inside a directory."""
    def __init__(self, pathname, cont, time_entry=None, filter_entries=None):
        if filter_entries is None:
            stored = cont
        else:
            stored = filter_dict(cont, filter_entries)
        super().__init__(pathname + "/.meta", stored, time_entry=time_entry)
        # The full (unfiltered) dict is what gets rendered into the file body.
        pretty = json.dumps(cont, sort_keys=True, indent=4) + "\n"
        self.meta_str = pretty.encode('utf-8')
        self.time = time()
        self.size = len(self.meta_str)

    def read(self, size, offset):
        """Return a slice of the in-memory pretty-printed json (no download involved)."""
        return self.meta_str[offset:offset + size]
class DebugEntry(Entry):
    """A debug file that provides json data about the current mounted filesystem"""
    # Fix: the docstring above was previously placed *after* DEBUG_FILE, which
    # made it a stray string statement instead of the class docstring.
    DEBUG_FILE = "/.debuginfo.json"

    def __init__(self, pathname=None, cont=None, time_entry=None, filter_entries=None):
        # pathname/cont/filter_entries are accepted for signature compatibility
        # with the other Entry subclasses but are ignored.
        d = {}
        super().__init__(self.DEBUG_FILE, d, time_entry=time_entry)
        self._update_str()
        self.time = time()
        self.size = len(self.meta_str)

    def _update_str(self):
        # Rebuild the json payload from the class-level list of unpacked archive files.
        self.meta_str = (json.dumps({'unzipped_files' : ZipEntry.debuglst}, sort_keys=True, indent=4) + "\n").encode('utf-8')

    def read(self, size, offset):
        """Return a slice of the in-memory debug json (no download involved)."""
        start = offset
        end = offset + size
        return self.meta_str[start:end]
# ###### Zip Files / archives ######################
# TODO: kludgy, but let's figure out how to do this before cleaning it up.
class ZipFileEntry(Entry):
    """A file extracted from an archive, held fully in memory.

    Reading from archives (especially tar files) can be very slow, so this
    trades memory for speed.  Identical payloads are shared via ddmcache.
    """
    def __init__(self, path, info, data):
        # A little bit of band-aid; should really be modelled further up the
        # class hierarchy.  Only 'info' timestamps are used from the dict.
        super().__init__(path, info)
        self._data = ddmcache(data)
        self.size = len(self._data)

    def read(self, size, offset):
        end = offset + size
        return self._data[offset:end]
class ZipDirEntry(DirEntry):
    """A directory created from an archive member."""
    def __init__(self, path, info):
        # Archive directory members usually carry a trailing slash; drop it so
        # the path matches the rest of the tree.
        path = path.removesuffix("/")
        super().__init__(path, info)
        logging.log(logging.DEBUG, f"ZipDirEntry {path}")
class ZipEntry(Entry):
    """An archive file (zip/tar/rar/...).  Behaves like a regular Entry, but
    once its bytes are cached locally it unpacks itself into in-memory
    ZipDirEntry/ZipFileEntry nodes under '<pathname>.unp'.
    """
    # Class-level list of every unpacked file path; exposed via DebugEntry.
    debuglst = []

    def __init__(self, pathname, cont, ctx, time_entry=None):
        # ctx: the Context (filesystem) the unpacked entries are added to.
        super().__init__(pathname, cont, time_entry=time_entry)
        self.is_unpacked = False
        self.ctx = ctx
        # Unpack immediately if the archive is already in the local cache.
        self.check_unpack()

    def read_entry(self, entry):
        """Reads the file contents from the entry"""
        # 'entry' is a libarchive entry; drain its blocks into one bytes object.
        bio = io.BytesIO()
        for block in entry.get_blocks():
            bio.write(block)
        bio.seek(0)
        return bio.read()

    def check_unpack(self):
        # Unpack the cached archive into '<pathname>.unp' entries, once.
        if self.is_unpacked or (not auto_unpack) or (not self._is_cached()):
            # not ready for auto_unpack, already unpacked, or can't unpack if not cached
            return
        try:
            cpath = self._cache_path()
            with libarchive.file_reader(cpath) as zf:
                # Some zipfiles don't include subdirectory entries (only direct paths to files).
                # This will be handled in add_entry.
                dir_prefix = self.pathname + ".unp"  # the pathname of the unpack directory
                print("Unpacking ", dir_prefix)
                # add the root/mount point
                self.ctx.add_entry(ZipDirEntry(dir_prefix, {'_time': self.time}))
                # add each of the directories and files listed in the zip file.
                for entry in zf:
                    path = merge_paths(dir_prefix, entry.pathname)  # f"{dir_prefix}/{entry.pathname}"
                    # NOTE(review): max() raises ValueError if both ctime and
                    # mtime are None — assumed not to happen in practice; verify.
                    info = {"_time": max(t for t in (entry.ctime, entry.mtime) if t is not None)}
                    if entry.isdir:
                        self.ctx.add_entry(ZipDirEntry(path, info))
                    elif entry.isreg:
                        # Regular file
                        self.debuglst.append(path)
                        self.ctx.add_entry(ZipFileEntry(path, info, self.read_entry(entry)))
                    else:
                        if entry.issym:
                            print(f"NB (ZipEntry): skipping symbolic link: {path}")
                        else:
                            print(f"WARNING: ZipEntry: {path} is of unhandled file type {entry.filetype} {entry.issym=}")
            self.is_unpacked = True
        except zipfile.BadZipFile:
            # TODO: this exception is from zipFile and will probably never be thrown by libarchive.
            # NOTE(review): libarchive failures likely surface as a different
            # exception type and would propagate uncaught — confirm and widen.
            print(f"Failed to open {self.pathname} ({cpath}) - bad zipfile")

    def read(self, size, offset):
        # TODO: some larger files are very slow to read using this. Consider using an lru_cache for the file contents?
        # Could be a side effect of buffer size in 'wc'...
        #
        # Need a bit of extra magic to trigger downloading and unpacking archive files.
        # Reading data first ensures that the file is cached locally, it can then be unpacked.
        data = super().read(size, offset)
        self.check_unpack()
        return data

    @classmethod
    def possible_archive(self, fpath):
        """Name-based heuristic: does fpath look like an archive we can unpack?"""
        # NOTE(review): the first parameter of a classmethod is conventionally named 'cls'.
        lfpath = fpath.lower()
        # TODO: a safer option could be to try to open the file with libarchive.
        # libarchive does not appear to provide a function that checks if it may be an archive (like magic keys)
        return any(lfpath.endswith(end) for end in
                   ['.zip', '.rar', '.tar.gz', '.tgz', '.tar', '.7z'])
# Some of this class is based on the Context example from the fusepy distribution.
class Context(LoggingMixIn, Operations):
    'Provides the main filesystem functionality and keeps tracks of files and directories.'
    # Disable unused operations:
    access = None
    flush = None
    getxattr = None
    listxattr = None
    open = None
    opendir = None
    release = None
    releasedir = None
    statfs = None

    def __init__(self):
        # dirs is used to keep track of files and subdirectories in each directory.
        # files are each file/directory in the filesystem with an Entry object for each file (key = path).
        super().__init__()
        self.dirs = defaultdict(list)   # dir path -> list of child Entry objects
        self.files = {}                 # path -> Entry (files AND directories)

    def getattr(self, path, fh=None):
        """FUSE getattr: stat attributes for path, ENOENT if unknown."""
        # uid, gid, pid = fuse_get_context()
        if (entry := self.files.get(path, None)):
            return entry.getattr()
        raise FuseOSError(ENOENT)

    def read(self, path, size, offset, fh):
        """FUSE read: delegate to the Entry for path.

        NOTE(review): an unknown path raises RuntimeError rather than
        FuseOSError(ENOENT); presumably unreachable since getattr gates access.
        """
        # logging.log(logging.DEBUG, f"**read**({path}, {size}, {offset}, {fh})")
        if path in self.files:
            e = self.files[path]
            return e.read(size, offset)
        raise RuntimeError('unexpected path: %r' % path)

    def readdir(self, path, fh):
        """FUSE readdir: names of the entries in directory 'path'."""
        # logging.log(logging.DEBUG, f"readdir: {path} {[d.fname for d in dirs.get(path, [])]}")
        return [d.fname for d in self.dirs.get(path, [])]

    def _add_file(self, fn, entry):
        """Adds a file and make sure it's seen in the parent/directory."""
        if fn in self.files:
            # This typically happens if zip or rar files have an entry for a subdirectory after
            # files contained in that subdirectory.
            if not isinstance(entry, DirEntry):
                print(f"WARNING: {fn} already exists in the file list. {type(entry)}.")
            return
        self.files[fn] = entry
        # Make sure the file is also seen in the parent directory
        # (mutually recursive with _add_dirent, which creates any missing parents).
        self._add_dirent(entry.parent, entry)

    def _add_dirent(self, dpath, entry):
        """Adds an entry to the directory it's contained in.
        Also ensures that there is an entry for the directory in "files".
        """
        if dpath == entry.pathname:
            # TODO: too sleepy now, but it looks like "/" is added for every file or subdirectory of '/', which makes sense.
            # Probably, this should be considered a special case where the root is updated with the timestamp of the most recent
            # of the child nodes.
            if dpath != "/":
                print(f"WARNING: trying to add directory to itself {dpath} {entry}")
                print(self.dirs.get("/"))
                print(self.files.get("/"))
            return
        self.dirs[dpath].append(entry)
        if dpath not in self.files:
            # The parent directory needs a directory entry
            cont = {'_time': entry.time}
            ne = DirEntry(dpath, cont)
            self._add_file(dpath, ne)

    def add_entry(self, entry):
        """Add entry to file/pathnames and directories.
        Will add necessary entries for parent files/directories that lead up to this file if
        they are missing.
        """
        self._add_file(entry.pathname, entry)
        if isinstance(entry, (ZipDirEntry, ZipFileEntry)):
            logging.log(logging.DEBUG, f"add_entry zip file/dir entry for path {entry.pathname} in dir {entry.parent}")
def make_sub_path(a_path, sub):
    """Make the submission path prefix as "assignment_name"/<..>/student_name".
    Optionally injects (in the following path order):
    - submission status (submitted or not)
    - grade status
    - group name

    Controlled by the module globals by_group / by_grade / by_submitted.
    """
    parts = [a_path, sub['student_name']]
    group = sub.get("group", None)
    if by_group and group is not None and group['name'] is not None:
        parts.insert(1, group['name'])
    if by_grade:
        # TODO: consider setting ungraded as something else than None
        # (str(None) yields the literal directory name "None").
        parts.insert(1, str(sub['entered_grade']))
    if by_submitted:
        state = "unsubmitted" if sub['workflow_state'] == 'unsubmitted' else "submitted"
        parts.insert(1, state)
    return "/".join(parts)  # TODO: str(Path(*parts)) ?
def mount_fs():
    """Build the filesystem tree from the cached assignment metadata and mount it.

    Reads <CACHE_DIR>/assignments.json, creates entries for assignments,
    submissions, attempts and attachments, then blocks in FUSE until unmount.
    """
    # Make sure the cache directory exists
    os.makedirs(CACHE_DIR, exist_ok=True)
    ctx = Context()
    # The json file contains a list of assignments.
    # Fix: use a context manager + json.load (the original leaked the open file object).
    with open(f"{CACHE_DIR}/assignments.json") as jf:
        assignments = json.load(jf)
    # For each level in the hiearchy, a .meta file is added with json encoded metadata for that level in the directory.
    # The information is filtered to avoid replicating everything from a further in at the root level.
    for a in assignments:
        # Top level directory for each assignment.
        a_path = '/' + a['name']
        ctx.add_entry(DirEntry(a_path, a, time_entry='created_at'))
        ctx.add_entry(MetaEntry(a_path, a, time_entry='updated_at', filter_entries={'f_studs', 'f_submissions'}))
        # logging.log(logging.DEBUG, f"{dirs}")
        for sub in a['f_submissions']:
            # Each submission is in a subdirectory with the name of the student
            # (optionally grouped by status/grade/group — see make_sub_path).
            sub_path = make_sub_path(a_path, sub)
            # Students that haven't submitted still show up, but submitted_at is non-existing. This gives us a 0 epoch time.
            ctx.add_entry(DirEntry(sub_path, sub, time_entry='submitted_at'))
            ctx.add_entry(MetaEntry(sub_path, sub, time_entry='submitted_at', filter_entries={'submission_history'}))
            for s in sub['submission_history']:
                # Each version of the submission is listed in a separate subdirectory
                if s['attempt'] is None:
                    # Student hasn't submitted anything.
                    continue
                attempt_path = f"{sub_path}/{s['attempt']}"
                ctx.add_entry(DirEntry(attempt_path, s, time_entry='submitted_at'))
                ctx.add_entry(MetaEntry(attempt_path, s, time_entry='submitted_at'))
                for att in s.get('attachments', []):
                    # Each file in the submission
                    fpath = f"{attempt_path}/{att['filename']}"
                    if ZipEntry.possible_archive(fpath):
                        # Note: the 'unp' directory is not added until the zip file is downloaded (by reading it)
                        # The reason for this is to avoid triggering downloads of all zip files using "find", file managers etc.
                        # TODO: option to turn this mount time unpacking off.
                        ctx.add_entry(ZipEntry(fpath, att, ctx, time_entry='modified_at'))
                    else:
                        ctx.add_entry(Entry(fpath, att, time_entry='modified_at'))
    ctx.add_entry(DebugEntry())
    # After the initial scan, always allow lazy unpacking on read — the
    # -nu/--noautounpack flag only suppresses unpacking during the scan above.
    global auto_unpack
    auto_unpack = True
    print("dedup cache info: ", ddmcache.cache_info())
    print("Ready")
    # Blocks here until the filesystem is unmounted.
    FUSE(ctx, args.mount, foreground=True, ro=True, allow_other=True)
    auto_unpack = False
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--cache', help="Cache directory")  # cache directory
    parser.add_argument('-bsub', '--by_submitted', action="store_true", help="Organize by submission_status")
    parser.add_argument('-bgrade', '--by_grade', action="store_true", help="Organize by entered grade")
    parser.add_argument('-bg', '--by_group', action="store_true", help="Organize by submission group")
    parser.add_argument('-nu', '--noautounpack', action="store_true", help="Do not unpack archives automatically on boot")
    parser.add_argument('mount')  # mount point for the FUSE filesystem
    args = parser.parse_args()
    # These module-level globals configure make_sub_path and ZipEntry.check_unpack.
    auto_unpack = not args.noautounpack
    by_group = args.by_group
    by_submitted = args.by_submitted
    by_grade = args.by_grade
    logging.basicConfig(level=LOG_LEVEL)
    if args.cache:
        # Override the default cache directory.
        CACHE_DIR = args.cache
    mount_fs()