Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9c69ee8
Datasette-parquet plugin implementation
ssadhu-sl Oct 28, 2024
89faf5f
uses dir structure
CarlosCoelhoSL Nov 14, 2024
1061d88
point to digital-land-datasette
CarlosCoelhoSL Nov 14, 2024
44e1f84
use env variables for metadata.json
CarlosCoelhoSL Nov 15, 2024
1bd997e
adds missing ! to environment check
CarlosCoelhoSL Nov 15, 2024
7ef2c72
removes unnecessary config in metadata.json
CarlosCoelhoSL Nov 19, 2024
8bf7b75
updates env variable names to match existing names
CarlosCoelhoSL Nov 19, 2024
d3016f7
changes for dev deployment
CarlosCoelhoSL Nov 19, 2024
52b6c88
adds metadata_template.json to dockerfile
CarlosCoelhoSL Nov 19, 2024
fd901e8
adds make init to Dockerfile
CarlosCoelhoSL Nov 19, 2024
7be17f5
adjusts requirements.txt installation in Dockerfile
CarlosCoelhoSL Nov 19, 2024
5d30c98
adds old startup.sh content back in
CarlosCoelhoSL Nov 19, 2024
f90a57c
adds apt-get for envsubst and spacing in startup.sh
CarlosCoelhoSL Nov 22, 2024
30fbc4a
removes old startup.sh content
CarlosCoelhoSL Nov 22, 2024
9e27f3d
make a lot of changes
eveleighoj Nov 25, 2024
808fdb3
final changes to ensure its working with COLLECTION_DATA_BUCKET
eveleighoj Nov 25, 2024
7a01884
add more ocmmments
eveleighoj Nov 25, 2024
f71ede9
remove metadata.json to see if it starts without it
eveleighoj Nov 25, 2024
75df667
add metadata.json back in and add debug
eveleighoj Nov 25, 2024
9e15fd7
remove debug
eveleighoj Nov 25, 2024
3b42bd3
remove plugin details
eveleighoj Nov 25, 2024
d98da8f
add metadata back in
eveleighoj Nov 26, 2024
4a5080b
add code to check bucket exists
eveleighoj Nov 26, 2024
463d461
add aws cli
eveleighoj Nov 26, 2024
657cb57
try og method
eveleighoj Nov 26, 2024
d7368ac
print error message
eveleighoj Nov 26, 2024
27ca11c
ty without installing cli
eveleighoj Nov 26, 2024
1904a44
run a ping
eveleighoj Nov 26, 2024
37743f4
change to curl
eveleighoj Nov 26, 2024
1a0fbfb
add verbose
eveleighoj Nov 26, 2024
ef3bf1e
remove additional echos
eveleighoj Nov 27, 2024
ef39884
add changes to startup and compose files
eveleighoj Nov 28, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,7 @@ view_model.db

# ignore act secrets
.secrets

# ignore localstack volume and the data that's downloaded
localstack/volume/
localstack/bootstrap/local-collection-data/
6 changes: 5 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ RUN mkdir -p app
WORKDIR /app

RUN apt-get update && \
apt-get install -y python3-dev gcc libsqlite3-mod-spatialite jq && \
apt-get install -y python3-dev gcc libsqlite3-mod-spatialite jq gettext curl && \
rm -rf /var/lib/apt/lists/*

ENV SQLITE_EXTENSIONS '/usr/lib/x86_64-linux-gnu/mod_spatialite.so'
Expand All @@ -13,10 +13,14 @@ RUN pip uninstall -y uvicorn
RUN pip install uvicorn[standard] gunicorn
RUN pip install csvkit

COPY requirements.txt .
RUN pip install --upgrade -r requirements.txt

EXPOSE 5000
ENV PORT=5000

COPY startup.sh .
COPY metadata_template.json .

ADD templates /app/templates

Expand Down
25 changes: 21 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
.PHONY: init start clean

init: ./files
init::
pip install --upgrade pip
ifneq (,$(wildcard requirements.txt))
pip3 install --upgrade -r requirements.txt
endif


./files:
@bash download-files.sh $$BUCKET
@bash bin/download-files.sh $$BUCKET

./localstack/bootstrap/local-collection-data:
@bash bin/download-s3-files.sh $$BUCKET

start: ./files ./localstack/bootstrap/local-collection-data
docker-compose up -d

start-no-cache:
docker-compose up -d --no-cache

start: ./files
docker-compose up
restart:
docker-compose restart datasette

stop:
docker-compose down --rmi local

clean:
@rm -rf ./files
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,25 @@ To test changes locally you will need the following requirements:
You will also need AWS credentials in your environment, the preference is to use
[aws-vault](https://github.com/99designs/aws-vault) for this.

You can either provide the collection data bucket, which contains all the files, or leave it blank to download a sample set of files. The sample set is downloaded using our CDN
and is suitable for most purposes; use the bucket if you want to get everything.

### With bucket and AWS Access
You will also need the name of an S3 bucket that has the required sqlite files.

`aws-vault exec dl-prod -- make start BUCKET=<environment>-collection-data`

### Without Bucket

`make start`

This will download a sample set of files.

# Licence

The software in this project is open source and covered by the [LICENSE](LICENSE) file.

Individual datasets copied into this repository may have specific copyright and licensing, otherwise all content and
data in this repository is [© Crown copyright](http://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/copyright-and-re-use/crown-copyright/)
and available under the terms of the [Open Government 3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/)
licence.
licence.
24 changes: 24 additions & 0 deletions bin/download-files.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

set -euo pipefail

mkdir -p files

# Optional first argument: name of an S3 bucket holding the sqlite files.
# Guarded with ${1:-} so 'set -u' doesn't abort when no bucket is given.
BUCKET=${1:-}

# If a bucket is provided then all files will be downloaded from the bucket to
# give a more accurate representation, but the advice is to leave it blank and
# use the smaller sample set instead.
if [ -n "$BUCKET" ]; then
  echo "The variable is not empty: $BUCKET"
  FILES=$(aws s3api list-objects --bucket "$BUCKET" --output json --query "Contents[?ends_with(Key, 'sqlite3')]" | jq -rc '.[].Key')
  for FILE in $FILES; do
    echo "Downloading $FILE"
    aws s3api get-object --bucket "$BUCKET" --key "$FILE" "./files/$(basename "$FILE")" > /dev/null
    # The inspect .json files are optional; don't fail the whole run if one is missing.
    aws s3api get-object --bucket "$BUCKET" --key "$FILE.json" "./files/$(basename "$FILE").json" > /dev/null || echo "no inspect files"
  done
else
  echo "The bucket variable is empty will download data from sample datasets using python"
  python bin/download_files.py
fi

echo "All sqlite files downloaded successfully."

# Merge the per-dataset inspect files into one. Match only *.sqlite3.json so a
# previously generated inspect-data-all.json is never folded into itself on re-runs.
cat ./files/*.sqlite3.json | jq -s add > ./files/inspect-data-all.json
18 changes: 18 additions & 0 deletions bin/download-s3-files.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

set -euo pipefail

mkdir -p localstack/bootstrap/local-collection-data

# Optional first argument: name of an S3 bucket holding the log directory.
# Guarded with ${1:-} so 'set -u' doesn't abort when no bucket is given.
BUCKET=${1:-}

# If a bucket is provided then the log directory is synced from it to give a
# more accurate representation, but the advice is to leave it blank and use
# the smaller sample set instead.
if [ -n "$BUCKET" ]; then
  echo "Downloading log directory from: $BUCKET"
  # Fixed: the original referenced the undefined $BUCKET_NAME instead of
  # $BUCKET, and had a stray 'done' with no matching 'for', which made this
  # script a bash syntax error.
  aws s3 sync "s3://$BUCKET/log" ./localstack/bootstrap/local-collection-data/log
else
  echo "The bucket variable is empty will download data from sample datasets using python"
  python bin/download_s3_files.py
fi

# Fixed message: this script downloads parquet log files, not sqlite files.
echo "All log files downloaded successfully."
75 changes: 75 additions & 0 deletions bin/download_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import click
import sqlite3
# Explicit import: the original relied on urllib.request implicitly making
# urllib.error available, which is not a documented guarantee.
import urllib.error
import urllib.request

from pathlib import Path

SQLITE_FILES = {
    # collection : dataset
    'central-activities-zone': 'central-activities-zone',
    'article-4-direction': 'article-4-direction-area'
}

PARQUET_ISSUES = {
    # dataset : resource
    'article-4-direction-area': '936033805ce03700457da34ff3761ef7c305385ce3584e31dbc72c8a84298a6e'
}


@click.command()
def download_files():
    """Download sample data for local development.

    Fetches the sqlite datasets (and their inspect .json files) listed in
    SQLITE_FILES into ./files, one sample parquet issue log per entry in
    PARQUET_ISSUES into ./s3_files, and finally creates empty digital-land /
    performance sqlite databases that datasette expects to exist.
    """
    # Download sqlite files; update SQLITE_FILES above to fetch other datasets.
    for collection, dataset in SQLITE_FILES.items():
        # URLs of the dataset and its inspect metadata
        url = f"https://files.planning.data.gov.uk/{collection}-collection/dataset/{dataset}.sqlite3"
        json_url = f"https://files.planning.data.gov.uk/{collection}-collection/dataset/{dataset}.sqlite3.json"

        # Local paths to save as
        file_name = f"{dataset}.sqlite3"
        file_path = Path('files') / file_name
        json_file_name = f"{dataset}.sqlite3.json"
        json_file_path = Path('files') / json_file_name

        try:
            # Ensure ./files exists, then download both artefacts.
            file_path.parent.mkdir(parents=True, exist_ok=True)
            urllib.request.urlretrieve(url, file_path)
            print(f"File downloaded successfully as '{file_name}'")
            urllib.request.urlretrieve(json_url, json_file_path)
            print(f"File downloaded successfully as '{json_file_name}'")

        except urllib.error.URLError as e:
            # Best-effort: report and carry on with the remaining datasets.
            print(f"Failed to download file: {str(file_path)} error: {e}")

    # Download some sample issue logs (parquet) under ./s3_files.
    for dataset, resource in PARQUET_ISSUES.items():
        url = f"https://files.planning.data.gov.uk/log/issue/dataset={dataset}/resource={resource}/{resource}.parquet"

        # Local filename to save as, mirroring the S3 key layout.
        file_name = f"{resource}.parquet"
        file_path = Path(f's3_files/log/issue/dataset={dataset}/resource={resource}') / file_name

        try:
            file_path.parent.mkdir(parents=True, exist_ok=True)
            urllib.request.urlretrieve(url, file_path)
            print(f"File downloaded successfully as '{file_name}'")

        except urllib.error.URLError as e:
            print(f"Failed to download file: {str(file_path)} error: {e}")

    # Finally create empty digital-land and performance databases; connecting
    # to a missing sqlite file creates it.
    for db in ['digital-land.sqlite3', 'performance.sqlite3']:
        db_path = Path('files') / db
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(db_path)
        conn.close()


if __name__ == "__main__":
    download_files()
45 changes: 45 additions & 0 deletions bin/download_s3_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import click
import sqlite3
# Explicit import: the original relied on urllib.request implicitly making
# urllib.error available, which is not a documented guarantee.
import urllib.error
import urllib.request

from pathlib import Path

PARQUET_ISSUES = {
    # dataset : resource
    'article-4-direction-area': '936033805ce03700457da34ff3761ef7c305385ce3584e31dbc72c8a84298a6e'
}


@click.command()
def download_s3_files():
    """Download sample parquet issue logs into ./s3_files and create the empty
    placeholder sqlite databases that datasette expects under ./files.
    """
    # Download one sample issue log per entry in PARQUET_ISSUES.
    for dataset, resource in PARQUET_ISSUES.items():
        url = f"https://files.planning.data.gov.uk/log/issue/dataset={dataset}/resource={resource}/{resource}.parquet"

        # Local filename to save as, mirroring the S3 key layout.
        file_name = f"{resource}.parquet"
        file_path = Path(f's3_files/log/issue/dataset={dataset}/resource={resource}') / file_name

        try:
            file_path.parent.mkdir(parents=True, exist_ok=True)
            urllib.request.urlretrieve(url, file_path)
            print(f"File downloaded successfully as '{file_name}'")

        except urllib.error.URLError as e:
            # Best-effort: report and carry on with the remaining resources.
            print(f"Failed to download file: {str(file_path)} error: {e}")

    # NOTE(review): this duplicates the placeholder-db step in
    # bin/download_files.py — confirm both entry points really need it.
    for db in ['digital-land.sqlite3', 'performance.sqlite3']:
        # Connecting to a missing sqlite file creates it. Unlike the original,
        # ensure ./files exists first: this script's shell caller only creates
        # the localstack directory, so sqlite3.connect would otherwise fail.
        db_path = Path('files') / db
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(db_path)
        conn.close()


if __name__ == "__main__":
    download_s3_files()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
48 changes: 47 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,54 @@
version: "3.9"
version: "3.8"

services:
localstack:
image: localstack/localstack:3.8
container_name: "localstack"
ports:
- "127.0.0.1:4566:4566"
- "127.0.0.1:4510-4559:4510-4559"
environment:
DEBUG: ${DEBUG:-0}
AWS_DEFAULT_REGION: eu-west-2
SERVICES: s3
networks:
ls:
# Set the container IP address in the 10.0.2.0/24 subnet
ipv4_address: 10.0.2.20
volumes:
# Optional, uncomment if you need persistence or bootstrap scripts
- "${LOCALSTACK_VOLUME_DIR:-./localstack/volume}:/var/lib/localstack"
- "/var/run/docker.sock:/var/run/docker.sock"
- "./localstack/bootstrap:/etc/localstack/init/ready.d/"

datasette:
build: .
ports:
- "5000:5000"
environment:
COLLECTION_DATA_BUCKET: local-collection-data
AWS_ACCESS_KEY_ID: dummyaccess
AWS_SECRET_ACCESS_KEY: dummysecret
AWS_DEFAULT_REGION: eu-west-2
AWS_ENDPOINT_URL: http://localstack:4566
DUCKDB_S3_ENDPOINT: s3.localhost.localstack.cloud:4566
DUCKDB_S3_USE_SSL: false
USE_AWS_CREDENTIAL_CHAIN: false
dns:
# Set the DNS server to be the LocalStack container
- 10.0.2.20
networks:
- ls
volumes:
- ./files:/mnt/datasets
depends_on:
- localstack

networks:
# localstack network is used so that requests to localstack resolve from containers
# See https://docs.localstack.cloud/references/network-troubleshooting/endpoint-url/
ls:
ipam:
config:
# Specify the subnet range for IP address allocation
- subnet: 10.0.2.0/24
16 changes: 0 additions & 16 deletions download-files.sh

This file was deleted.

28 changes: 28 additions & 0 deletions localstack/bootstrap/s3_bootstrap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

# Localstack ready.d bootstrap: create the local collection-data bucket and
# seed it from the files mounted into the container.

set -euo pipefail

# enable debug
# set -x

echo "configuring s3"
echo "==================="
readonly LOCALSTACK_HOST=localhost
readonly AWS_REGION=eu-west-2

# No trailing slash here, so path joins below don't produce '//'.
readonly base_local_path="/etc/localstack/init/ready.d"

# Create an S3 bucket in localstack.
# Arguments: $1 - bucket name
create_bucket() {
  local bucket_name_to_create=$1
  awslocal --endpoint-url="http://${LOCALSTACK_HOST}:4566" s3api create-bucket \
    --bucket "${bucket_name_to_create}" \
    --region "${AWS_REGION}" \
    --create-bucket-configuration "LocationConstraint=${AWS_REGION}"
  # awslocal --endpoint-url=http://${LOCALSTACK_HOST}:4566 s3api put-bucket-cors --bucket ${bucket_name_to_create} --cors-configuration file:///etc/localstack/init/ready.d/cors-config.json
}

# Recursively upload a local directory to an S3 URI.
# Arguments: $1 - local directory, $2 - destination s3:// URI
upload_dir_to_bucket() {
  local local_path=$1
  local s3_uri=$2
  awslocal s3 sync "${local_path}" "${s3_uri}"
}

create_bucket "local-collection-data"
upload_dir_to_bucket "${base_local_path}/local-collection-data" "s3://local-collection-data"
16 changes: 16 additions & 0 deletions metadata_template.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"title": "Planning Data",
"description": "Datasette instance for querying sqlite and parquet files produced from our pipeline",
"settings": {
"suggest_facets": false
},
"plugins": {
"digital-land-datasette": {
"log": {
"directory": "s3://$COLLECTION_DATA_BUCKET/log/",
"httpfs": true
}
}
}
}

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-e git+https://github.com/digital-land/digital-land-datasette.git@datasette_parquet_spike#egg=digital_land_datasette
Loading