Commit 6bdc3ff

Archiving sensitive collections (#299)
* archiving sensitive collections
* sensitive collections data
* censoring sensitive collections documentation

1 parent 1370d5f

File tree

9 files changed: +5807 -0 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -34,3 +34,4 @@ research/avatars/*.xlsx
 .env
 data/w2v/
 data/wiki2v/
+
Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@

from __future__ import annotations

from argparse import ArgumentParser
import os

from elasticsearch import Elasticsearch


def connect_to_elasticsearch(
    scheme: str,
    host: str,
    port: int,
    username: str,
    password: str,
) -> Elasticsearch:
    return Elasticsearch(
        hosts=[{
            'scheme': scheme,
            'host': host,
            'port': port,
        }],
        http_auth=(username, password),
        timeout=60,
        http_compress=True,
    )


def archive_collections(es: Elasticsearch, index: str, collection_ids: list[str]) -> None:
    # Mark each collection as archived with a partial update of its document.
    for collection_id in collection_ids:
        res = es.update(index=index, id=collection_id, body={
            'doc': {
                'data': {
                    'archived': True
                }
            }
        })
        print(res)


if __name__ == '__main__':
    parser = ArgumentParser(description='Takes a TXT file where each line is a collection ID, and archives them in '
                                        'Elasticsearch by setting the "archived" field to True')
    parser.add_argument('--input', type=str, required=True, help='TXT file with collection IDs to archive')
    args = parser.parse_args()

    # Connection parameters come from the environment, with local defaults.
    host = os.getenv('ES_HOST', 'localhost')
    port = int(os.getenv('ES_PORT', '9200'))
    username = os.getenv('ES_USERNAME', 'elastic')
    password = os.getenv('ES_PASSWORD', 'espass')
    index = os.getenv('ES_INDEX', 'collection-templates-1')

    es = connect_to_elasticsearch(
        scheme='http' if host in ['localhost', '127.0.0.1'] else 'https',
        host=host, port=port, username=username, password=password,
    )

    with open(args.input, 'r', encoding='utf-8') as f:
        collection_ids = [line.strip() for line in f if line.strip()]

    archive_collections(es, index, collection_ids)
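
For long ID lists, the per-document update loop above could be batched into a single request. A minimal sketch, not part of this commit, assuming the standard elasticsearch-py bulk helper and the same index and document layout as the script:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk


def archive_collections_bulk(es: Elasticsearch, index: str, collection_ids: list[str]) -> None:
    # One bulk request of partial-update actions instead of one round trip per ID.
    actions = (
        {
            '_op_type': 'update',
            '_index': index,
            '_id': collection_id,
            'doc': {'data': {'archived': True}},
        }
        for collection_id in collection_ids
    )
    success, errors = bulk(es, actions, raise_on_error=False)
    print(f'archived {success} collections, {len(errors)} errors')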
Lines changed: 129 additions & 0 deletions

@@ -0,0 +1,129 @@

from __future__ import annotations

from argparse import ArgumentParser
from dataclasses import dataclass
import json
import csv
import os

from elasticsearch import Elasticsearch


def connect_to_elasticsearch(
    scheme: str,
    host: str,
    port: int,
    username: str,
    password: str,
) -> Elasticsearch:
    return Elasticsearch(
        hosts=[{
            'scheme': scheme,
            'host': host,
            'port': port,
        }],
        http_auth=(username, password),
        timeout=60,
        http_compress=True,
    )


@dataclass
class Collection:
    id: str
    name: str
    related_collections: list[Collection]


def search_by_keyword(es: Elasticsearch, index: str, keywords: list[str]) -> list[Collection]:
    # One query_string clause per keyword; a document matches if any clause matches.
    query = {
        'query': {
            'bool': {
                'should': [
                    {
                        'query_string': {
                            'query': keyword,
                            'fields': ['data.collection_name', 'data.collection_name.exact'],
                            'type': 'cross_fields',
                            'default_operator': 'AND',
                        }
                    }
                    for keyword in keywords
                ]
            }
        },
        '_source': ['data.collection_name', 'name_generator.related_collections']
    }
    res = es.search(index=index, body=query, size=3000)

    collections = []
    for hit in res['hits']['hits']:
        # Related collections are optional; not every document carries name_generator data.
        related_collections = []
        if 'name_generator' in hit['_source']:
            for related_collection in hit['_source']['name_generator']['related_collections']:
                related_collections.append(Collection(
                    id=related_collection['collection_id'],
                    name=related_collection['collection_name'],
                    related_collections=[]
                ))

        collections.append(Collection(
            id=hit['_id'],
            name=hit['_source']['data']['collection_name'],
            related_collections=related_collections
        ))
    return collections


if __name__ == '__main__':
    parser = ArgumentParser(description='This script takes a list of keywords and searches for collections in '
                                        'Elasticsearch. It then writes the resulting collections, and all '
                                        'collections related to them, to a CSV file, plus a JSON file with a '
                                        'name-to-ID mapping.')
    parser.add_argument('--input', type=str, required=True, help='TXT file with keywords')
    parser.add_argument('--output', type=str, required=True, help='output CSV file')
    parser.add_argument('--mapping-output', type=str, required=True, help='output JSON name to ID mapping file')
    parser.add_argument('--filter-duplicates', action='store_true', help='filter out duplicate collections')
    args = parser.parse_args()

    # Connection parameters come from the environment, with local defaults.
    host = os.getenv('ES_HOST', 'localhost')
    port = int(os.getenv('ES_PORT', '9200'))
    username = os.getenv('ES_USERNAME', 'elastic')
    password = os.getenv('ES_PASSWORD', 'espass')
    index = os.getenv('ES_INDEX', 'collection-templates-1')

    es = connect_to_elasticsearch(
        scheme='http' if host in ['localhost', '127.0.0.1'] else 'https',
        host=host, port=port, username=username, password=password,
    )

    with open(args.input, 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]

    collections = search_by_keyword(es, index, keywords)

    # The name-to-ID mapping covers both top-level and related collections.
    mapping = {}
    for collection in collections:
        mapping[collection.name] = collection.id
        for related_collection in collection.related_collections:
            mapping[related_collection.name] = related_collection.id

    with open(args.mapping_output, 'w', encoding='utf-8') as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)

    # One CSV row per matched collection: its name first, then its related collections.
    # With --filter-duplicates, collections already written earlier are blanked or skipped.
    used_collection_ids = set()
    with open(args.output, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for collection in collections:
            row = []
            if collection.id not in used_collection_ids or not args.filter_duplicates:
                row.append(collection.name)
            else:
                row.append('')
            used_collection_ids.add(collection.id)

            for related_collection in collection.related_collections:
                if related_collection.id not in used_collection_ids or not args.filter_duplicates:
                    row.append(related_collection.name)
                    used_collection_ids.add(related_collection.id)

            writer.writerow(row)
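
Note that es.search is called with size=3000, so any hits beyond that cap are silently dropped. If an index could match more, one option is the scroll-based scan helper. A minimal sketch, not part of this commit, assuming the standard elasticsearch.helpers.scan API and the same query body built in search_by_keyword:

from elasticsearch.helpers import scan

def iter_all_hits(es, index, query):
    # scan() drives the scroll API under the hood and yields every matching hit,
    # so results are not capped at a single page size.
    yield from scan(es, index=index, query=query)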
