Commit 7034f2c

Author: James (ODSC)
Merge pull request #12 from OpenDataServices/2024-10-04: docs content
2 parents bf90717 + f083ca2

16 files changed: +336 / −21 lines

docs/conf.py

Lines changed: 22 additions & 0 deletions
import os
import pathlib
import sys

import django

import libcoveweb2.settings

project = "LibCoveWeb2"

master_doc = "index"

html_theme = "odsc_default_sphinx_theme"

extensions = [
    "sphinx.ext.autodoc",
]

###### Make Sphinx able to document our python code

# We need to be able to see our code, so add this directory to path
sys.path.insert(0, str(pathlib.Path("..").resolve()))

# We need to set up Django enough that it won't complain
libcoveweb2.settings.INSTALLED_APPS += ("libcoveweb2",)
os.environ["DJANGO_SETTINGS_MODULE"] = "libcoveweb2.settings"
django.setup()

docs/django-settings.rst

Lines changed: 48 additions & 0 deletions
Django Settings
===============

To use this app you'll need to define several settings.


Process Tasks
-------------

You need to define a `PROCESS_TASKS` setting. This lists all the tasks that will be run, in order, for each piece of uploaded data.

It should be a list of tuples, where each tuple is `('Python module', 'Python class name')`.
Each class should extend `libcoveweb2.process.base.ProcessDataTask`.

Example:

.. code-block:: python

    PROCESS_TASKS = [
        # Get data if not already on disk
        ("libcoveweb2.process.common_tasks.download_data_task", "DownloadDataTask"),
        ...
    ]

Celery Message Queue
--------------------

Any Celery settings needed must be set up.

At a minimum this will include `CELERY_BROKER_URL`.


Settings to copy from library which have sensible defaults
------------------------------------------------------------

This application also needs a number of configuration values that already have defaults set. In most cases you can simply reuse these variables.

:doc:`For a list of these settings see here. <python-api/settings>`

To do so, you can do something like this in your Django project's main settings.py file:

.. code-block:: python

    from libcoveweb2 import settings

    ALLOWED_JSON_CONTENT_TYPES = settings.ALLOWED_JSON_CONTENT_TYPES
    ALLOWED_JSON_EXTENSIONS = settings.ALLOWED_JSON_EXTENSIONS
    ...

docs/hosting/requirements.rst

Lines changed: 30 additions & 0 deletions
Hosting Requirements
====================

Python server for the Django app
----------------------------------

Normal options.

Database
----------

This is tested with PostgreSQL.

Message queue compatible with Celery
--------------------------------------

Normal options.

File Storage
------------

TODO

Cron tasks
----------

Some Django management commands should be run on a cron task.

* `expire_files` should be run daily
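As a sketch of the daily task above, a crontab entry might look like this. The project path, virtualenv location, and run time are assumptions, not part of this library:

```shell
# Hypothetical crontab entry: run the expire_files management command
# daily at 03:00. /srv/app and the .venv path are assumed locations.
0 3 * * * cd /srv/app && /srv/app/.venv/bin/python manage.py expire_files
```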

docs/index.rst

Lines changed: 28 additions & 0 deletions
LibCoveWeb2
===========

LibCoveWeb2 is a Django application to use as a library in your own Django apps.

It lets you create CoVE apps. CoVE exists to help people:

* Convert data between common formats (e.g. CSV to JSON)
* Validate data against rules
* Explore data that machines find easy to read, but humans find harder

The application consists of:

* Database models to save details of user-submitted data
* File storage space to save the user-submitted data and cache the results of processing
* A message queue
* Workers to process the data according to tasks you provide (there is a library of common tasks in this application)
* A view to show users output information from the cache of results


.. toctree::
   :maxdepth: 2

   processing-pipeline.rst
   django-settings.rst
   python-api/index.rst
   migration-from-lib-cove-web.rst
   hosting/requirements.rst
   used-by.rst
docs/migration-from-lib-cove-web.rst

Lines changed: 13 additions & 0 deletions

Migration from lib-cove-web
===========================

This library is an updated version of the previous library ( https://github.com/OpenDataServices/lib-cove-web ).

However, the structure of the library and how it works have changed substantially.
For this reason the upgrade isn't a simple version bump, but requires rewriting work in the software that uses this library.

This is why this library is a separate git repository and a separate PyPI package,
so that development of the previous library can also continue and users are not forced to upgrade before they are ready.

docs/processing-pipeline.rst

Lines changed: 84 additions & 0 deletions
Processing Pipeline
===================

Why?
----

The application lets you define a series of tasks that will be checked, in order, for each piece of uploaded data.

Tasks need to be defined by each app, but there is a library of common tasks to make this easier.

This allows for maximum flexibility: each app can define the tasks it needs, including non-standard tasks that are not used by other CoVEs.
(For example, BODS CoVE has a sample mode. When the user uploads big data, they can choose to run sample mode and only check some of it.
This is accomplished by a special task towards the start of the pipeline that generates a smaller file from the uploaded file.)

What happens when the user uploads data?
----------------------------------------

The background worker will start processing the data and the user will be redirected to the results page.

What happens when the user looks at a results page?
---------------------------------------------------

Every time a user views a results page, the system will check the state of that data.

If it's currently being processed, the user will see a progress page with a wait message.

If it's not currently being processed, the system will call the `is_processing_applicable` and `is_processing_needed` functions on each task to see if any work is needed.

If there is work to do, it will start the work and the user will see a progress page with a wait message.
This means that even after a task first finishes, a task can change its mind and request to do more work.
(The most common use case for this is if the software is upgraded and how the processing is done is changed.)

If there is no work to do, the system will show a results page to the user.
`get_context` will be called on every task, so the task can load results from its cache and present them to the user.

Other pages that may be shown to the user include:

* An error page if a Python error occurred
* An expired page, if the data is so old that it has been expired and removed from the system

How is the data actually processed?
-----------------------------------

To process the task, the background worker will call `process`.
This can take as long as it needs, and the results should be cached for speedy loading later.

Early tasks can also return data that will be passed to later tasks.
This means any information or work that is needed in multiple tasks does not need to be done multiple times, but can be done once and then reused.


How should I define my tasks?
-----------------------------

Each task should be defined by extending a class. :doc:`For more information on the base class, see here. <python-api/process/base>`

Your tasks should then be defined in settings. :doc:`For more information on settings, see here. <django-settings>`

An example task pipeline
------------------------

.. code-block:: python

    PROCESS_TASKS = [
        # Get data if not already on disk - if the user provided a URL
        ("libcoveweb2.process.common_tasks.download_data_task", "DownloadDataTask"),
        # BODS has a special sample mode.
        # If that's activated, we'll make the sample data now for later tasks to use.
        ("cove_bods.process", "Sample"),
        # Make sure uploads are in the primary format - for BODS that is JSON,
        # so any spreadsheets uploaded should be converted
        ("cove_bods.process", "WasJSONUploaded"),
        ("cove_bods.process", "ConvertSpreadsheetIntoJSON"),
        # Some information is reused in multiple tasks to come,
        # so we'll process it once now and later tasks can reuse it.
        ("cove_bods.process", "GetDataReaderAndConfigAndSchema"),
        # Convert from the primary JSON format into other output formats
        ("cove_bods.process", "ConvertJSONIntoSpreadsheets"),
        # Check and generate statistics from the JSON data
        ("cove_bods.process", "AdditionalFieldsChecksTask"),
        ("cove_bods.process", "PythonValidateTask"),
        ("cove_bods.process", "JsonSchemaValidateTask"),
    ]
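The hooks described on this page can be sketched as follows. This is a minimal, self-contained illustration: the `ProcessDataTask` below is a stand-in base class written from the hook names mentioned above (the real signatures in `libcoveweb2.process.base` may differ), and `WordCountTask` and its in-memory cache are hypothetical.

```python
# Minimal sketch of the task hooks described on this page.
# ProcessDataTask here is a STAND-IN so the snippet is self-contained;
# a real app would extend libcoveweb2.process.base.ProcessDataTask,
# whose actual signatures may differ.

class ProcessDataTask:
    """Stand-in base class with the hooks named on this page."""

    def is_processing_applicable(self):
        # Does this task apply to this uploaded data at all?
        return True

    def is_processing_needed(self):
        # Is there (still) work to do? Can become True again, e.g. after an upgrade.
        return False

    def process(self, process_data):
        # Do the work; cache results. Returned data is passed to later tasks.
        return process_data

    def get_context(self):
        # Load cached results to show on the results page.
        return {}


class WordCountTask(ProcessDataTask):
    """Hypothetical task: count words in uploaded text and cache the result."""

    def __init__(self):
        self.cache = {}  # stand-in for an on-disk results cache

    def is_processing_needed(self):
        # Work is needed until a result has been cached.
        return "word_count" not in self.cache

    def process(self, process_data):
        self.cache["word_count"] = len(process_data.get("text", "").split())
        # Pass the result on so later tasks can reuse it without recomputing.
        process_data["word_count"] = self.cache["word_count"]
        return process_data

    def get_context(self):
        return {"word_count": self.cache.get("word_count")}


task = WordCountTask()
data = task.process({"text": "one two three"})
print(task.is_processing_needed())  # False: result is now cached
print(task.get_context())  # {'word_count': 3}
```

After `process` runs, `is_processing_needed` returns False, which mirrors the behaviour described above: the system only restarts work when a task reports that work is needed.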

docs/python-api/index.rst

Lines changed: 15 additions & 0 deletions
Python API
==========

This section documents the Python API that software using this library should consider.

It does not document Python code that is not intended for reuse by others (you can read the source code for that).


.. toctree::
   :maxdepth: 2

   settings.rst
   process/base.rst
   process/common_tasks/download_data_task.rst
   process/common_tasks/task_with_state.rst

docs/python-api/process/base.rst

Lines changed: 11 additions & 0 deletions
Process Task Base
=================

:doc:`For more information on how to use this class, see here. <../../processing-pipeline>`

You probably want to process some data, get a result and cache it.
There is another class, `TaskWithState`, that helps you do this, and so may be more useful for you.
:doc:`For more information on this helper class, see here. <common_tasks/task_with_state>`

.. autoclass:: libcoveweb2.process.base.ProcessDataTask
   :members:
docs/python-api/process/common_tasks/download_data_task.rst

Lines changed: 5 additions & 0 deletions

Common Process Task: Download Data Task
=======================================

.. autoclass:: libcoveweb2.process.common_tasks.download_data_task.DownloadDataTask
docs/python-api/process/common_tasks/task_with_state.rst

Lines changed: 7 additions & 0 deletions

Common Process Task: Task With State
====================================

.. autoclass:: libcoveweb2.process.common_tasks.task_with_state.TaskWithState
   :members: state_filename, process_get_state
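Given the two documented members, `state_filename` and `process_get_state`, a subclass might look roughly like this. The `TaskWithState` below is a stand-in sketched from those names so the example runs on its own; the real class's behaviour and signatures may differ, and `CountRecordsTask` is hypothetical.

```python
import json
import pathlib
import tempfile


class TaskWithState:
    """STAND-IN for libcoveweb2...task_with_state.TaskWithState, sketched
    from the documented members; the real API may differ. It caches
    whatever process_get_state() returns in the file state_filename."""

    state_filename = "state.json"

    def __init__(self, directory):
        self._path = pathlib.Path(directory) / self.state_filename

    def process(self, process_data):
        # Only do the work if no cached state exists yet.
        if not self._path.exists():
            state, process_data = self.process_get_state(process_data)
            self._path.write_text(json.dumps(state))
        return process_data

    def get_state(self):
        return json.loads(self._path.read_text())


class CountRecordsTask(TaskWithState):
    """Hypothetical task: compute a record count once and cache it."""

    state_filename = "count_records.json"

    def process_get_state(self, process_data):
        state = {"record_count": len(process_data.get("records", []))}
        return state, process_data


with tempfile.TemporaryDirectory() as d:
    task = CountRecordsTask(d)
    task.process({"records": [1, 2, 3]})
    state = task.get_state()
    print(state)  # {'record_count': 3}
```

The point of the pattern is that the subclass only supplies the state computation; persisting and reloading the cached result is handled by the base class.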
