Skip to content

Commit 87843f5

Browse files
committed
First commit
0 parents  commit 87843f5

14 files changed

+758
-0
lines changed

.github/workflows/pr.yml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Review workflow for the 'ai-stuff' branch: checks that the changelog has been
# updated and runs markdownlint and ShellCheck.
on:
  push:
    branches:
      - 'ai-stuff'
  pull_request:
    branches:
      - 'ai-stuff'

name: Review

jobs:
  changelog:
    # github.base_ref is only set for pull request events. On push events the
    # comparison ref would be "origin/", git would fail, and the check would
    # pass without having compared anything, so only run it for pull requests.
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    name: Changelog should be updated
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 2

      - name: Git fetch
        run: git fetch

      - name: Check that changelog has been updated.
        env:
          # Passed through the environment instead of being interpolated into
          # the script, so the branch name is never parsed as shell code.
          BASE_REF: ${{ github.base_ref }}
        run: |
          # git diff --exit-code exits with 0 when there are no differences,
          # 1 when there are differences and another status when git fails.
          status=0
          git diff --exit-code "origin/${BASE_REF}" -- CHANGELOG.md || status=$?
          case "$status" in
            0)
              echo "::error file=CHANGELOG.md::CHANGELOG.md has not been updated"
              exit 1
              ;;
            1)
              # CHANGELOG.md differs from the base branch.
              exit 0
              ;;
            *)
              # git itself failed (e.g. unknown ref); do not report success.
              exit "$status"
              ;;
          esac

  coding-standards-markdown:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Coding standards
        run: |
          docker run --rm --volume "$PWD:/md" peterdavehello/markdownlint markdownlint '**/*.md'

  coding-standards-shellcheck:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Coding standards
        # The glob is unquoted on purpose: it is expanded by the runner's shell
        # before the paths are handed to the container.
        run: |
          docker run --rm --volume "$PWD:/mnt" koalaman/shellcheck:stable */*.sh

.markdownlint.jsonc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"default": true,
3+
// https://github.com/DavidAnson/markdownlint/blob/main/doc/md013.md
4+
"line-length": {
5+
"line_length": 120,
6+
"code_blocks": false,
7+
"tables": false
8+
},
9+
// https://github.com/DavidAnson/markdownlint/blob/main/doc/md024.md
10+
"no-duplicate-heading": {
11+
"siblings_only": true
12+
},
13+
// https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-collapsed-sections#creating-a-collapsed-section
14+
// https://github.com/DavidAnson/markdownlint/blob/main/doc/md033.md
15+
"no-inline-html": {
16+
"allowed_elements": ["details", "summary"]
17+
}
18+
}

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7+
8+
## [Unreleased]
9+
10+
### Added
11+
12+
- Data export script
13+
14+
[Unreleased]: https://github.com/itk-dev/os2loop/tree/ai-stuff

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# OS2Loop AI stuff
2+
3+
``` shell
4+
git clone --branch ai-stuff https://github.com/itk-dev/os2loop os2loop-ai-stuff
5+
```
6+
7+
## Scripts
8+
9+
``` shell
10+
./os2loop-ai-stuff/data-export/export.sh
11+
```
12+
13+
## Development
14+
15+
``` shell
16+
task
17+
```

Taskfile.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
# Task definitions (https://taskfile.dev) for applying and checking the
# project's coding standards. Running "task" without arguments lists the tasks.
version: '3'

tasks:
  default:
    cmds:
      - task --list
    silent: true

  # Aggregate task: first apply the automatic fixes, then run every check,
  # including the markdownlint check that is also run in the review workflow.
  coding-standards:check:
    desc: "Apply coding standards and run checks"
    cmds:
      - task: coding-standards:apply
      - task: coding-standards:check:markdownlint
      - task: coding-standards:check:shellcheck

  coding-standards:apply:
    desc: "Apply coding standards"
    cmds:
      - task: coding-standards:apply:markdownlint

  # Runs markdownlint with --fix, i.e. rewrites the Markdown files in place.
  coding-standards:apply:markdownlint:
    desc: "Run markdownlint-cli (https://github.com/igorshubovych/markdownlint-cli)"
    cmds:
      - docker run --rm --volume "$PWD:/md" peterdavehello/markdownlint markdownlint '**/*.md' --fix

  # Runs markdownlint without --fix, i.e. only reports problems.
  coding-standards:check:markdownlint:
    desc: "Run markdownlint-cli (https://github.com/igorshubovych/markdownlint-cli)"
    cmds:
      - docker run --rm --volume "$PWD:/md" peterdavehello/markdownlint markdownlint '**/*.md'

  coding-standards:check:shellcheck:
    desc: "Run ShellCheck (https://github.com/koalaman/shellcheck)"
    cmds:
      - docker run --rm --volume "$PWD:/mnt" koalaman/shellcheck:stable */*.sh

data-export/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*.csv
2+
*.json

data-export/export.sh

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env bash
#
# Runs every export_*.sql file located next to this script against a Drupal
# site through drush and writes the result twice, as <name>.json and as
# <name>.csv, next to the SQL file.
#
# Arguments:
#   $1  project-dir  Directory containing vendor/bin/drush.
#   $2  site-uri     Value passed to drush as --uri.
#
# A failing export is reported on stderr and the script continues with the
# next export.
set -o errexit -o errtrace -o noclobber -o nounset -o pipefail
IFS=$'\n\t'

script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# Prints an optional error message ($1) followed by the usage text on stderr
# and exits with status 1.
function usage() {
  if [ -n "${1:-}" ]; then
    >&2 cat <<EOF
$1

EOF
  fi

  >&2 cat <<EOF
Usage: ${BASH_SOURCE[0]} project-dir site-uri

EOF
  exit 1
}

if (( $# < 2 )); then
  usage "Too few arguments"
fi

project_dir="$1"
uri="$2"

if [ -z "$project_dir" ]; then
  usage "Invalid project directory"
fi

if [ ! -d "$project_dir" ] ; then
  (>&2 echo 'Project directory "'"$project_dir"'" does not exist')
  exit 1
fi

if [ -z "$uri" ]; then
  usage "Invalid site-uri"
fi

cd "$project_dir"

# Without drush every export would fail; stop early with a clear message.
if [ ! -x vendor/bin/drush ]; then
  (>&2 echo 'vendor/bin/drush not found or not executable in "'"$project_dir"'"')
  exit 1
fi

# Use nullglob so an unmatched pattern yields an empty list instead of the
# literal string "export_*.sql".
shopt -s nullglob
filenames=("$script_dir"/export_*.sql)
shopt -u nullglob

if (( ${#filenames[@]} == 0 )); then
  (>&2 echo 'No export_*.sql files found in "'"$script_dir"'"')
  exit 1
fi

for filename in "${filenames[@]}"; do
  echo "$filename"

  # JSON

  # https://tldp.org/LDP/abs/html/string-manipulation.html
  output_filename=${filename/%.sql/.json}
  # The path is embedded in a single-quoted PHP string literal below, so
  # backslashes and single quotes must be escaped first.
  php_filename=${filename//\\/\\\\}
  php_filename=${php_filename//\'/\\\'}
  # https://github.com/drush-ops/drush/issues/3071#issuecomment-347929777
  # ">|" overwrites an existing output file despite noclobber.
  vendor/bin/drush --uri="$uri" php:eval "return \Drupal::database()->query(file_get_contents('$php_filename'))->fetchAll()" --format=json >| "$output_filename" \
    || >&2 echo 'Warning: JSON export of "'"$filename"'" failed'
  echo "$output_filename"

  # CSV

  output_filename=${filename/%.sql/.csv}
  # Convert the tab-separated drush output to CSV. A field containing a comma
  # or a double quote is wrapped in double quotes with embedded double quotes
  # doubled; "$1=$1" forces awk to rebuild untouched records with OFS.
  # https://stackoverflow.com/a/22421445/2502647
  vendor/bin/drush --uri="$uri" sql:cli < "$filename" | awk 'BEGIN { FS="\t"; OFS="," } {
    rebuilt=0
    for(i=1; i<=NF; ++i) {
      if ($i ~ /[,"]/) {
        gsub("\"", "\"\"", $i)
        $i = "\"" $i "\""
        rebuilt=1
      }
    }
    if (!rebuilt) { $1=$1 }
    print
  }' >| "$output_filename" \
    || >&2 echo 'Warning: CSV export of "'"$filename"'" failed'
  echo "$output_filename"
done
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
-- Exports nodes of type os2loop_documents_collection together with their
-- path alias, the documents assigned to each collection and the shared
-- metadata fields (approval date, subject, tags, owner, review date, version).
SELECT
  n_fd.nid,
  n_fd.type,
  n_fd.title,
  pa.`alias` AS relative_url,
  DATE_FORMAT(FROM_UNIXTIME(n_fd.created), '%Y-%m-%dT%H:%i:%s') AS created,
  DATE_FORMAT(FROM_UNIXTIME(n_fd.changed), '%Y-%m-%dT%H:%i:%s') AS `changed`,
  dci.document_node_ids,
  dci.document_relative_urls,
  -- All content values are rich text (HTML and div encoded).
  docscol_content.os2loop_documents_dc_content_value AS content,
  docs_ib.os2loop_documents_info_box_value AS info_box,
  approval_date.os2loop_shared_approval_date_value AS approval_date,
  `subject`.`name` AS `subject`,
  tags.tags,
  `owner`.os2loop_shared_owner_value AS `owner`,
  rev_date.os2loop_shared_rev_date_value AS review_date,
  `version`.os2loop_shared_version_value AS `version`
FROM (
  -- Base set, all document collection nodes.
  --
  -- The table os2loop_documents_collection_item links document collections
  -- (collection_id = nid of the collection) to documents (document_id = nid
  -- of the document), except for 20 document collections. For example
  -- /rammedelegation (nid 3807) is a collection of links to SharePoint
  -- documents, and /medicinhaandtering (nid 3827) is a link to the collection
  -- /instruks-korrekt-haandtering-af-medicin-i-sundhed-og-omsorg-mso (nid 4188).
  -- Of 805 documents, 164 are not assigned to a document collection.
  SELECT nid, vid, type, uid, title, created, changed
  FROM node_field_data
  WHERE type = 'os2loop_documents_collection'
) AS n_fd
LEFT JOIN path_alias AS pa
  ON CONCAT('/node/', n_fd.nid) = pa.path
LEFT JOIN (
  -- Per collection, the document node ids and their path aliases as JSON arrays.
  SELECT
    item.collection_id,
    JSON_ARRAYAGG(item.document_id) AS document_node_ids,
    JSON_ARRAYAGG(item_alias.`alias`) AS document_relative_urls
  FROM os2loop_documents_collection_item AS item
  LEFT JOIN path_alias AS item_alias
    ON CONCAT('/node/', item.document_id) = item_alias.path
  GROUP BY item.collection_id
) AS dci
  ON n_fd.nid = dci.collection_id
-- Contains only records from the bundle documents_collection
-- (all delta 0, so top placement).
LEFT JOIN node__os2loop_documents_dc_content AS docscol_content
  ON n_fd.nid = docscol_content.entity_id
LEFT JOIN (
  -- Info boxes, only from the bundle document_collection.
  SELECT
    entity_id,
    os2loop_documents_info_box_value
  FROM node__os2loop_documents_info_box
  WHERE bundle = 'os2loop_documents_collection'
) AS docs_ib
  ON n_fd.nid = docs_ib.entity_id
LEFT JOIN node__os2loop_shared_approval_date AS approval_date
  ON n_fd.nid = approval_date.entity_id
LEFT JOIN (
  -- Subject term name for each document collection.
  SELECT
    ss.entity_id,
    term.name
  FROM node__os2loop_shared_subject AS ss
  LEFT JOIN taxonomy_term_field_data AS term
    ON ss.os2loop_shared_subject_target_id = term.tid
  WHERE ss.bundle = 'os2loop_documents_collection'
) AS `subject`
  ON n_fd.nid = `subject`.entity_id
LEFT JOIN node__os2loop_shared_owner AS `owner`
  ON n_fd.nid = `owner`.entity_id
LEFT JOIN node__os2loop_shared_rev_date AS rev_date
  ON n_fd.nid = rev_date.entity_id
LEFT JOIN (
  -- Tag term names for each document collection as a JSON array.
  SELECT
    st.entity_id,
    JSON_ARRAYAGG(tag_term.name) AS tags
  FROM node__os2loop_shared_tags AS st
  LEFT JOIN taxonomy_term_field_data AS tag_term
    ON st.os2loop_shared_tags_target_id = tag_term.tid
  WHERE st.bundle = 'os2loop_documents_collection'
  GROUP BY st.entity_id
) AS tags
  ON n_fd.nid = tags.entity_id
LEFT JOIN node__os2loop_shared_version AS `version`
  ON n_fd.nid = `version`.entity_id

0 commit comments

Comments
 (0)