Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9c69ee8
Datasette-parquet plugin implementation
ssadhu-sl Oct 28, 2024
89faf5f
uses dir structure
CarlosCoelhoSL Nov 14, 2024
1061d88
point to digital-land-datasette
CarlosCoelhoSL Nov 14, 2024
44e1f84
use env variables for metadata.json
CarlosCoelhoSL Nov 15, 2024
1bd997e
adds missing ! to environment check
CarlosCoelhoSL Nov 15, 2024
7ef2c72
removes unnecessary config in metadata.json
CarlosCoelhoSL Nov 19, 2024
8bf7b75
updates env variable names to match existing names
CarlosCoelhoSL Nov 19, 2024
d3016f7
changes for dev deployment
CarlosCoelhoSL Nov 19, 2024
52b6c88
adds metadata_template.json to dockerfile
CarlosCoelhoSL Nov 19, 2024
fd901e8
adds make init to Dockerfile
CarlosCoelhoSL Nov 19, 2024
7be17f5
adjusts requirements.txt installation in Dockerfile
CarlosCoelhoSL Nov 19, 2024
5d30c98
adds old startup.sh content back in
CarlosCoelhoSL Nov 19, 2024
f90a57c
adds apt-get for envsubst and spacing in startup.sh
CarlosCoelhoSL Nov 22, 2024
30fbc4a
removes old startup.sh content
CarlosCoelhoSL Nov 22, 2024
9e27f3d
make a lot of changes
eveleighoj Nov 25, 2024
808fdb3
final changes to ensure its working with COLLECTION_DATA_BUCKET
eveleighoj Nov 25, 2024
7a01884
add more ocmmments
eveleighoj Nov 25, 2024
f71ede9
remove metadata.json to see if it starts without it
eveleighoj Nov 25, 2024
75df667
add metadata.json back in and add debug
eveleighoj Nov 25, 2024
9e15fd7
remove debug
eveleighoj Nov 25, 2024
3b42bd3
remove plugin details
eveleighoj Nov 25, 2024
d98da8f
add metadata back in
eveleighoj Nov 26, 2024
4a5080b
add code to check bucket exists
eveleighoj Nov 26, 2024
463d461
add aws cli
eveleighoj Nov 26, 2024
657cb57
try og method
eveleighoj Nov 26, 2024
d7368ac
print error message
eveleighoj Nov 26, 2024
27ca11c
ty without installing cli
eveleighoj Nov 26, 2024
1904a44
run a ping
eveleighoj Nov 26, 2024
37743f4
change to curl
eveleighoj Nov 26, 2024
1a0fbfb
add verbose
eveleighoj Nov 26, 2024
ef3bf1e
remove additional echos
eveleighoj Nov 27, 2024
ef39884
add changes to startup and compose files
eveleighoj Nov 28, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,7 @@ view_model.db

# ignore act secrets
.secrets

# ignore localstack volume and the data that's downloaded
localstack/volume/
localstack/bootstrap/local-collection-data/
6 changes: 5 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ RUN mkdir -p app
WORKDIR /app

RUN apt-get update && \
apt-get install -y python3-dev gcc libsqlite3-mod-spatialite jq && \
apt-get install -y python3-dev gcc libsqlite3-mod-spatialite jq gettext curl && \
rm -rf /var/lib/apt/lists/*

ENV SQLITE_EXTENSIONS '/usr/lib/x86_64-linux-gnu/mod_spatialite.so'
Expand All @@ -13,10 +13,14 @@ RUN pip uninstall -y uvicorn
RUN pip install uvicorn[standard] gunicorn
RUN pip install csvkit

COPY requirements.txt .
RUN pip install --upgrade -r requirements.txt

EXPOSE 5000
ENV PORT=5000

COPY startup.sh .
COPY metadata_template.json .

ADD templates /app/templates

Expand Down
25 changes: 21 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
.PHONY: init start clean

init: ./files
init::
pip install --upgrade pip
ifneq (,$(wildcard requirements.txt))
pip3 install --upgrade -r requirements.txt
endif


./files:
@bash download-files.sh $$BUCKET
@bash bin/download-files.sh $$BUCKET

./localstack/bootstrap/local-collection-data:
@bash bin/download-s3-files.sh $$BUCKET

start: ./files ./localstack/bootstrap/local-collection-data
docker-compose up -d

start-no-cache:
docker-compose up -d --no-cache

start: ./files
docker-compose up
restart:
docker-compose restart datasette

stop:
docker-compose down --rmi local

clean:
@rm -rf ./files
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,25 @@ To test changes locally you will need the following requirements:
You will also need AWS credentials in your environment, the preference is to use
[aws-vault](https://github.com/99designs/aws-vault) for this.

You can either provide the collection data bucket, which contains all the files, or leave it blank to download a sample set of files. The sample set is downloaded using our CDN
and is suitable for most purposes; use the bucket if you want to get everything.

### With bucket and AWS Access
You will also need the name of an S3 bucket that has the required sqlite files.

`aws-vault exec dl-prod -- make start BUCKET=<environment>-collection-data`

### Without Bucket

`make start`

This will download a sample set of files.

# Licence

The software in this project is open source and covered by the [LICENSE](LICENSE) file.

Individual datasets copied into this repository may have specific copyright and licensing, otherwise all content and
data in this repository is [© Crown copyright](http://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/copyright-and-re-use/crown-copyright/)
and available under the terms of the [Open Government 3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/)
licence.
licence.
24 changes: 24 additions & 0 deletions bin/download-files.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

set -euo pipefail

mkdir -p files

# Optional first argument: name of an S3 bucket holding the sqlite files.
# Guarded with ${1:-} so 'set -u' doesn't abort when no bucket is given.
BUCKET=${1:-}

# If a bucket is provided then all files will be downloaded from the bucket to
# give a more accurate representation, but the advice is to leave it blank and
# use the smaller sample set instead.
if [ -n "$BUCKET" ]; then
  echo "The variable is not empty: $BUCKET"
  FILES=$(aws s3api list-objects --bucket "$BUCKET" --output json --query "Contents[?ends_with(Key, 'sqlite3')]" | jq -rc '.[].Key')
  for FILE in $FILES; do
    echo "Downloading $FILE"
    aws s3api get-object --bucket "$BUCKET" --key "$FILE" "./files/$(basename "$FILE")" > /dev/null
    # The inspect .json files are optional; don't fail the whole run if one is missing.
    aws s3api get-object --bucket "$BUCKET" --key "$FILE.json" "./files/$(basename "$FILE").json" > /dev/null || echo "no inspect files"
  done
else
  echo "The bucket variable is empty will download data from sample datasets using python"
  python bin/download_files.py
fi

echo "All sqlite files downloaded successfully."

# Merge the per-dataset inspect files into one. Match only *.sqlite3.json so a
# previously generated inspect-data-all.json is never folded into itself on re-runs.
cat ./files/*.sqlite3.json | jq -s add > ./files/inspect-data-all.json
18 changes: 18 additions & 0 deletions bin/download-s3-files.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

set -euo pipefail

mkdir -p localstack/bootstrap/local-collection-data

# Optional first argument: name of an S3 bucket holding the log directory.
# Guarded with ${1:-} so 'set -u' doesn't abort when no bucket is given.
BUCKET=${1:-}

# If a bucket is provided then the log directory is synced from it to give a
# more accurate representation, but the advice is to leave it blank and use
# the smaller sample set instead.
if [ -n "$BUCKET" ]; then
  echo "Downloading log directory from: $BUCKET"
  # Fixed: the original referenced the undefined $BUCKET_NAME instead of
  # $BUCKET, and had a stray 'done' with no matching 'for', which made this
  # script a bash syntax error.
  aws s3 sync "s3://$BUCKET/log" ./localstack/bootstrap/local-collection-data/log
else
  echo "The bucket variable is empty will download data from sample datasets using python"
  python bin/download_s3_files.py
fi

# Fixed message: this script downloads parquet log files, not sqlite files.
echo "All log files downloaded successfully."
75 changes: 75 additions & 0 deletions bin/download_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import click
import sqlite3
# Explicit import: the original relied on urllib.request implicitly making
# urllib.error available, which is not a documented guarantee.
import urllib.error
import urllib.request

from pathlib import Path

SQLITE_FILES = {
    # collection : dataset
    'central-activities-zone': 'central-activities-zone',
    'article-4-direction': 'article-4-direction-area'
}

PARQUET_ISSUES = {
    # dataset : resource
    'article-4-direction-area': '936033805ce03700457da34ff3761ef7c305385ce3584e31dbc72c8a84298a6e'
}


@click.command()
def download_files():
    """Download sample data for local development.

    Fetches the sqlite datasets (and their inspect .json files) listed in
    SQLITE_FILES into ./files, one sample parquet issue log per entry in
    PARQUET_ISSUES into ./s3_files, and finally creates empty digital-land /
    performance sqlite databases that datasette expects to exist.
    """
    # Download sqlite files; update SQLITE_FILES above to fetch other datasets.
    for collection, dataset in SQLITE_FILES.items():
        # URLs of the dataset and its inspect metadata
        url = f"https://files.planning.data.gov.uk/{collection}-collection/dataset/{dataset}.sqlite3"
        json_url = f"https://files.planning.data.gov.uk/{collection}-collection/dataset/{dataset}.sqlite3.json"

        # Local paths to save as
        file_name = f"{dataset}.sqlite3"
        file_path = Path('files') / file_name
        json_file_name = f"{dataset}.sqlite3.json"
        json_file_path = Path('files') / json_file_name

        try:
            # Ensure ./files exists, then download both artefacts.
            file_path.parent.mkdir(parents=True, exist_ok=True)
            urllib.request.urlretrieve(url, file_path)
            print(f"File downloaded successfully as '{file_name}'")
            urllib.request.urlretrieve(json_url, json_file_path)
            print(f"File downloaded successfully as '{json_file_name}'")

        except urllib.error.URLError as e:
            # Best-effort: report and carry on with the remaining datasets.
            print(f"Failed to download file: {str(file_path)} error: {e}")

    # Download some sample issue logs (parquet) under ./s3_files.
    for dataset, resource in PARQUET_ISSUES.items():
        url = f"https://files.planning.data.gov.uk/log/issue/dataset={dataset}/resource={resource}/{resource}.parquet"

        # Local filename to save as, mirroring the S3 key layout.
        file_name = f"{resource}.parquet"
        file_path = Path(f's3_files/log/issue/dataset={dataset}/resource={resource}') / file_name

        try:
            file_path.parent.mkdir(parents=True, exist_ok=True)
            urllib.request.urlretrieve(url, file_path)
            print(f"File downloaded successfully as '{file_name}'")

        except urllib.error.URLError as e:
            print(f"Failed to download file: {str(file_path)} error: {e}")

    # Finally create empty digital-land and performance databases; connecting
    # to a missing sqlite file creates it.
    for db in ['digital-land.sqlite3', 'performance.sqlite3']:
        db_path = Path('files') / db
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(db_path)
        conn.close()


if __name__ == "__main__":
    download_files()
45 changes: 45 additions & 0 deletions bin/download_s3_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import click
import sqlite3
# Explicit import: the original relied on urllib.request implicitly making
# urllib.error available, which is not a documented guarantee.
import urllib.error
import urllib.request

from pathlib import Path

PARQUET_ISSUES = {
    # dataset : resource
    'article-4-direction-area': '936033805ce03700457da34ff3761ef7c305385ce3584e31dbc72c8a84298a6e'
}


@click.command()
def download_s3_files():
    """Download sample parquet issue logs into ./s3_files and create the empty
    placeholder sqlite databases that datasette expects under ./files.
    """
    # Download one sample issue log per entry in PARQUET_ISSUES.
    for dataset, resource in PARQUET_ISSUES.items():
        url = f"https://files.planning.data.gov.uk/log/issue/dataset={dataset}/resource={resource}/{resource}.parquet"

        # Local filename to save as, mirroring the S3 key layout.
        file_name = f"{resource}.parquet"
        file_path = Path(f's3_files/log/issue/dataset={dataset}/resource={resource}') / file_name

        try:
            file_path.parent.mkdir(parents=True, exist_ok=True)
            urllib.request.urlretrieve(url, file_path)
            print(f"File downloaded successfully as '{file_name}'")

        except urllib.error.URLError as e:
            # Best-effort: report and carry on with the remaining resources.
            print(f"Failed to download file: {str(file_path)} error: {e}")

    # NOTE(review): this duplicates the placeholder-db step in
    # bin/download_files.py — confirm both entry points really need it.
    for db in ['digital-land.sqlite3', 'performance.sqlite3']:
        # Connecting to a missing sqlite file creates it. Unlike the original,
        # ensure ./files exists first: this script's shell caller only creates
        # the localstack directory, so sqlite3.connect would otherwise fail.
        db_path = Path('files') / db
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(db_path)
        conn.close()


if __name__ == "__main__":
    download_s3_files()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
48 changes: 47 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,54 @@
version: "3.9"
version: "3.8"

services:
localstack:
image: localstack/localstack:3.8
container_name: "localstack"
ports:
- "127.0.0.1:4566:4566"
- "127.0.0.1:4510-4559:4510-4559"
environment:
DEBUG: ${DEBUG:-0}
AWS_DEFAULT_REGION: eu-west-2
SERVICES: s3
networks:
ls:
# Set the container IP address in the 10.0.2.0/24 subnet
ipv4_address: 10.0.2.20
volumes:
# Optional, uncomment if you need persistence or bootstrap scripts
- "${LOCALSTACK_VOLUME_DIR:-./localstack/volume}:/var/lib/localstack"
- "/var/run/docker.sock:/var/run/docker.sock"
- "./localstack/bootstrap:/etc/localstack/init/ready.d/"

datasette:
build: .
ports:
- "5000:5000"
environment:
COLLECTION_DATA_BUCKET: local-collection-data
AWS_ACCESS_KEY_ID: dummyaccess
AWS_SECRET_ACCESS_KEY: dummysecret
AWS_DEFAULT_REGION: eu-west-2
AWS_ENDPOINT_URL: http://localstack:4566
DUCKDB_S3_ENDPOINT: s3.localhost.localstack.cloud:4566
DUCKDB_S3_USE_SSL: false
USE_AWS_CREDENTIAL_CHAIN: false
dns:
# Set the DNS server to be the LocalStack container
- 10.0.2.20
networks:
- ls
volumes:
- ./files:/mnt/datasets
depends_on:
- localstack

networks:
# localstack network is used so that requests to localstack resolve from containers
# See https://docs.localstack.cloud/references/network-troubleshooting/endpoint-url/
ls:
ipam:
config:
# Specify the subnet range for IP address allocation
- subnet: 10.0.2.0/24
16 changes: 0 additions & 16 deletions download-files.sh

This file was deleted.

28 changes: 28 additions & 0 deletions localstack/bootstrap/s3_bootstrap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

# Localstack ready.d bootstrap: create the local collection-data bucket and
# seed it from the files mounted into the container.

set -euo pipefail

# enable debug
# set -x

echo "configuring s3"
echo "==================="
readonly LOCALSTACK_HOST=localhost
readonly AWS_REGION=eu-west-2

# No trailing slash here, so path joins below don't produce '//'.
readonly base_local_path="/etc/localstack/init/ready.d"

# Create an S3 bucket in localstack.
# Arguments: $1 - bucket name
create_bucket() {
  local bucket_name_to_create=$1
  awslocal --endpoint-url="http://${LOCALSTACK_HOST}:4566" s3api create-bucket \
    --bucket "${bucket_name_to_create}" \
    --region "${AWS_REGION}" \
    --create-bucket-configuration "LocationConstraint=${AWS_REGION}"
  # awslocal --endpoint-url=http://${LOCALSTACK_HOST}:4566 s3api put-bucket-cors --bucket ${bucket_name_to_create} --cors-configuration file:///etc/localstack/init/ready.d/cors-config.json
}

# Recursively upload a local directory to an S3 URI.
# Arguments: $1 - local directory, $2 - destination s3:// URI
upload_dir_to_bucket() {
  local local_path=$1
  local s3_uri=$2
  awslocal s3 sync "${local_path}" "${s3_uri}"
}

create_bucket "local-collection-data"
upload_dir_to_bucket "${base_local_path}/local-collection-data" "s3://local-collection-data"
16 changes: 16 additions & 0 deletions metadata_template.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"title": "Planning Data",
"description": "Datasette instance for querying sqlite and parquet files produced from our pipeline",
"settings": {
"suggest_facets": false
},
"plugins": {
"digital-land-datasette": {
"log": {
"directory": "s3://$COLLECTION_DATA_BUCKET/log/",
"httpfs": true
}
}
}
}

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-e git+https://github.com/digital-land/digital-land-datasette.git@datasette_parquet_spike#egg=digital_land_datasette
Loading