Skip to content

Commit 63599c4

Browse files
Merge pull request #6 from ral-facilities/revert-5-revert-4-3_search_files
Revert "Revert "Add search_files.py, refactor common functionality into common.py"" Re-reverting this now that we are ready for this functionality to be on main
2 parents 5999c87 + 9643cd9 commit 63599c4

File tree

4 files changed

+313
-63
lines changed

4 files changed

+313
-63
lines changed

README.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,63 @@ options:
6868
complete with an interval of this many minutes. Non-
6969
positive values will disable monitoring.
7070
```
71+
72+
## search_files
73+
74+
Performs DataGateway searches for Datafiles matching the provided query.
75+
```bash
76+
python3 search_files.py output-file.txt 'visitId:"AB1234-1"' --username=abc12345
77+
```
78+
79+
Full help text and description of arguments is available with the `--help` command:
80+
```bash
81+
python3 search_files.py --help
82+
```
83+
```
84+
usage: search_files [-h] [--url URL] [-a AUTHENTICATOR] -u USERNAME
85+
[-p PASSWORD_FILE] [-m MAX_RESULTS]
86+
output_file query
87+
88+
Performs DataGateway searches for Datafiles matching the provided query. These paths will be written to file in
89+
batches, and can then be inspected and filtered further if needed before submitting using queue_file_downloads.
90+
91+
positional arguments:
92+
output_file File to append newline separated paths to.
93+
This can then be provided as an input to queue_file_downloads.
94+
query Lucene syntax formatted search query. Full help text and examples can be found
95+
in the DataGateway UI. Note that wildcards can significantly increase the time
96+
taken to perform a search, and the more specific the search query is the more
97+
efficient it will be. Some example searches are:
98+
'visitId:AB1234'
99+
Search for all Datafiles in all parts of proposal
100+
'visitId:"AB1234-1"'
101+
Search for all Datafiles in a (part) visit
102+
'location.fileName:"config.txt"'
103+
Search for Datafiles with a specific file name and extension (both required)
104+
'location.fileName:config'
105+
Search for Datafiles with a specific name but any extension
106+
'location.fileName:txt'
107+
Search for Datafiles with the extension 'txt', but no requirement on the name
108+
'location:raw'
109+
Search for Datafiles with the directory 'raw' somewhere in their path
110+
'location:(raw processed)'
111+
Search for Datafiles with either of two directories somewhere in their path
112+
'location.exact:/dls/i0/data/2000'
113+
Search for Datafiles in any subdirectory of the provided path (case sensitive)
114+
'location.exact:/dls/i0/data/202?/*/raw/config.txt'
115+
Search for a full path with wildcards (case sensitive)
116+
'+location.exact:/dls/i0/data/202? +location:(raw processed) +location.fileName:txt'
117+
Search for multiple criteria (all of which are required to match)
118+
119+
optional arguments:
120+
-h, --help show this help message and exit
121+
--url URL The url address of the DataGateway instance to submit requests to.
122+
-a AUTHENTICATOR, --authenticator AUTHENTICATOR
123+
The authentication mechanism to use for DataGateway login.
124+
-u USERNAME, --username USERNAME
125+
The username used for DataGateway login.
126+
-p PASSWORD_FILE, --password-file PASSWORD_FILE
127+
Location of file containing password for DataGateway login. If not provided, the password will need to be provided by prompt.
128+
-m MAX_RESULTS, --max-results MAX_RESULTS
129+
The maximum number of results to request in a single batch. If unset, the server default value will be used.
130+
```

common.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
from argparse import ArgumentParser
2+
from getpass import getpass
3+
import json
4+
from urllib.parse import quote
5+
6+
import requests
7+
8+
9+
VERIFY = True # If False then ssl certs won't be checked, useful for development
10+
11+
12+
def add_common_args(parser: ArgumentParser) -> None:
13+
"""
14+
Add the common arguments url, authenticator, username and password-file to `parser`.
15+
16+
Args:
17+
parser (ArgumentParser): ArgumentParser for the program.
18+
"""
19+
parser.add_argument(
20+
"--url",
21+
type=str,
22+
default="https://datagateway.diamond.ac.uk",
23+
help="The url address of the DataGateway instance to submit requests to.",
24+
)
25+
parser.add_argument(
26+
"-a",
27+
"--authenticator",
28+
type=str,
29+
default="ldap",
30+
help="The authentication mechanism to use for DataGateway login.",
31+
)
32+
parser.add_argument(
33+
"-u",
34+
"--username",
35+
type=str,
36+
required=True,
37+
help="The username used for DataGateway login.",
38+
)
39+
parser.add_argument(
40+
"-p",
41+
"--password-file",
42+
type=str,
43+
help=(
44+
"Location of file containing password for DataGateway login. If not "
45+
"provided, the password will need to be provided by prompt."
46+
),
47+
)
48+
49+
50+
def get_password(password_file: "str | None") -> str:
51+
"""
52+
Load the user's password from `password_file` if provided, otherwise prompt for it.
53+
54+
Args:
55+
password_file (str): Optional path of file containing the user's password.
56+
57+
Returns:
58+
str: The user's password
59+
"""
60+
if password_file is None:
61+
return getpass()
62+
else:
63+
with open(password_file) as f:
64+
return f.readline().strip()
65+
66+
67+
def login(base_url: str, authenticator: str, username: str, password: str) -> str:
68+
"""
69+
Args:
70+
base_url (str): URL for DataGateway without path.
71+
authenticator (str): Authentication mechanism to use.
72+
username (str): Username to use.
73+
password (str): Password to use.
74+
75+
Raises:
76+
RuntimeError: If a status code other than 200 is returned.
77+
78+
Returns:
79+
str: ICAT session id.
80+
"""
81+
url = f"{base_url}/topcat/user/session"
82+
encoded_password = quote(json.dumps(password)[1:-1])
83+
data = {"plugin": authenticator, "username": username, "password": encoded_password}
84+
response = requests.post(url=url, data=data, verify=VERIFY)
85+
if response.status_code != 200:
86+
raise RuntimeError(response.text)
87+
88+
return json.loads(response.content)["sessionId"]

queue_file_downloads.py

Lines changed: 9 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -2,34 +2,11 @@
22

33
import argparse
44
from datetime import datetime
5-
from getpass import getpass
65
import json
76
from time import sleep
87
import requests
9-
from urllib.parse import quote
108

11-
def login(base_url: str, authenticator: str, username: str, password: str) -> str:
12-
"""
13-
Args:
14-
base_url (str): URL for DataGateway without path.
15-
authenticator (str): Authentication mechanism to use.
16-
username (str): Username to use.
17-
password (str): Password to use.
18-
19-
Raises:
20-
RuntimeError: If a status code other than 200 is returned.
21-
22-
Returns:
23-
str: ICAT session id.
24-
"""
25-
url = f"{base_url}/topcat/user/session"
26-
encoded_password = quote(json.dumps(password)[1:-1])
27-
data = {"plugin": authenticator, "username": username, "password": encoded_password}
28-
response = requests.post(url=url, data=data)
29-
if response.status_code != 200:
30-
raise RuntimeError(response.text)
31-
32-
return json.loads(response.content)["sessionId"]
9+
from common import VERIFY, add_common_args, get_password, login
3310

3411

3512
def queue_all_files(
@@ -119,14 +96,15 @@ def queue_files(
11996
Returns:
12097
int: The Download id.
12198
"""
99+
url = f"{base_url}/topcat/user/queue/files"
122100
data = {
123101
"sessionId": session_id,
124102
"transport": transport,
125103
"fileName": file_name,
126104
"email": email,
127105
"files": files,
128106
}
129-
response = requests.post(url=base_url + "/topcat/user/queue/files", data=data)
107+
response = requests.post(url=url, data=data, verify=VERIFY)
130108
if response.status_code != 200:
131109
raise RuntimeError(response.text)
132110

@@ -160,20 +138,21 @@ def monitor(
160138
"""
161139
url = base_url + "/topcat/user/downloads/status"
162140
params = {"sessionId": session_id, "downloadIds": downloads}
163-
response = requests.get(url=url, params=params)
141+
response = requests.get(url=url, params=params, verify=VERIFY)
164142
if response.status_code != 200:
165143
raise RuntimeError(response.text)
166144
content = json.loads(response.content)
167145
print(content)
168146

169147
while any([s in {"QUEUED", "PAUSED", "PREPARING", "RESTORING"} for s in content]):
148+
sessions_url = f"{base_url}/datagateway-api/sessions"
170149
headers = {"Authorization": f"Bearer {session_id}"}
171-
requests.put(url=base_url + "/datagateway-api/sessions", headers=headers)
150+
# Capture the PUT response so the status check that follows inspects this
# request rather than the earlier (already verified) GET response.
response = requests.put(url=sessions_url, headers=headers, verify=VERIFY)
172151
if response.status_code != 200:
173152
raise RuntimeError(response.text)
174153

175154
sleep(monitor_sleep * 60)
176-
response = requests.get(url=url, params=params)
155+
response = requests.get(url=url, params=params, verify=VERIFY)
177156
if response.status_code != 200:
178157
raise RuntimeError(response.text)
179158

@@ -205,35 +184,7 @@ def monitor(
205184
"displayed in the DataGateway UI."
206185
),
207186
)
208-
parser.add_argument(
209-
"--url",
210-
type=str,
211-
default="https://datagateway.diamond.ac.uk",
212-
help="The url address of the DataGateway instance to submit requests to.",
213-
)
214-
parser.add_argument(
215-
"-a",
216-
"--authenticator",
217-
type=str,
218-
default="ldap",
219-
help="The authentication mechanism to use for DataGateway login.",
220-
)
221-
parser.add_argument(
222-
"-u",
223-
"--username",
224-
type=str,
225-
required=True,
226-
help="The username used for DataGateway login.",
227-
)
228-
parser.add_argument(
229-
"-p",
230-
"--password-file",
231-
type=str,
232-
help=(
233-
"Location of file containing password for DataGateway login. If not "
234-
"provided, the password will need to be provided by prompt."
235-
),
236-
)
187+
add_common_args(parser)
237188
parser.add_argument(
238189
"--download-name",
239190
type=str,
@@ -276,12 +227,7 @@ def monitor(
276227
)
277228
args = parser.parse_args()
278229

279-
if args.password_file is None:
280-
password = getpass()
281-
else:
282-
with open(args.password_file) as f:
283-
password = f.readline().strip()
284-
230+
# get_password requires the (optional) password file path; passing it keeps
# the --password-file option working and falls back to a prompt when it is None.
password = get_password(args.password_file)
285231
session_id = login(
286232
base_url=args.url,
287233
authenticator=args.authenticator,

0 commit comments

Comments
 (0)