Skip to content

Commit 4a07cbd

Browse files
jsmilkstein authored and yoonhyejin committed
feat(ingest): add ability to read other method types than GET for OAS ingest recipes (#8303)
1 parent bd636fc commit 4a07cbd

File tree

4 files changed

+265
-78
lines changed

4 files changed

+265
-78
lines changed

metadata-ingestion/docs/sources/openapi/openapi.md

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ The dataset metadata should be defined directly in the Swagger file, section `["
22

33
## Capabilities
44

5-
The plugin read the swagger file where the endopints are defined and searches for the ones which accept
6-
a `GET` call: those are the ones supposed to give back the datasets.
5+
This plugin reads the swagger file where the endpoints are defined, reads example data if provided (for any method), or searches for
6+
data for the endpoints which do not have example data and accept a `GET` call.
77

88
For every selected endpoint defined in the `paths` section,
9-
the tool searches whether the medatada are already defined in there.
9+
the tool searches whether the metadata are already defined.
1010
As an example, if your swagger file defines the `/api/users/` endpoint as follows:
1111

1212
```yaml
@@ -27,7 +27,7 @@ paths:
2727

2828
then this plugin has all the information needed to create the dataset in DataHub.
2929

30-
In case there is no example defined, the plugin will try to get the metadata directly from the endpoint.
30+
In case there is no example defined, the plugin will try to get the metadata directly from the endpoint, if it is a `GET` method.
3131
So, if in your swagger file you have
3232

3333
```yaml
@@ -42,7 +42,7 @@ paths:
4242
description: Return the list of colors
4343
```
4444
45-
the tool will make a `GET` call to `https:///test_endpoint.com/colors`
45+
the tool will make a `GET` call to `https://test_endpoint.com/colors`
4646
and parse the response obtained.
4747

4848
### Automatically recorded examples
@@ -53,7 +53,7 @@ Sometimes you can have an endpoint which wants a parameter to work, like
5353
Since in the OpenApi specifications the listing endpoints are specified
5454
just before the detailed ones, in the list of the paths, you will find
5555

56-
https:///test_endpoint.com/colors
56+
https://test_endpoint.com/colors
5757

5858
defined before
5959

@@ -80,7 +80,7 @@ and this last URL will be called to get back the needed metadata.
8080
If no useful example is found, a second procedure will try to guess a numerical ID.
8181
So if we have:
8282
83-
https:///test_endpoint.com/colors/{colorID}
83+
https://test_endpoint.com/colors/{colorID}
8484
8585
and there is no `colorID` example already found by the plugin,
8686
it will try to put a number one (1) at the parameter place
@@ -120,8 +120,8 @@ paths:
120120
description: Return details about the group
121121
```
122122

123-
and the plugin did not found an example in its previous calls,
124-
so the tool have no idea about what substitute to the `{name}` part.
123+
and the plugin did not find an example in its previous calls,
124+
the tool has no idea about what to substitute for the `{name}` part.
125125

126126
By specifying in the configuration file
127127

metadata-ingestion/src/datahub/ingestion/source/openapi.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,12 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901
246246
schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
247247
dataset_snapshot.aspects.append(schema_metadata)
248248
yield self.build_wu(dataset_snapshot, dataset_name)
249+
elif endpoint_dets["method"] != "get":
250+
self.report.report_warning(
251+
key=endpoint_k,
252+
reason=f"No example provided for {endpoint_dets['method']}",
253+
)
254+
continue # Only test endpoints if they're GETs
249255
elif (
250256
"{" not in endpoint_k
251257
): # if the API does not explicitly require parameters

metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py

Lines changed: 72 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,13 @@ def get_swag_json(
8989

9090

9191
def get_url_basepath(sw_dict: dict) -> str:
    """
    Return the prefix that has to be put in front of every path of the spec.

    The Swagger 2.0 ``basePath`` field wins when it is present. Otherwise the
    ``url`` of the first entry of the OpenAPI 3 ``servers`` list is used, which
    covers the case where the API path doesn't match the OAS path.

    An empty string is returned when neither is usable. That includes an
    empty ``servers`` list and a first server entry without a ``url``, both of
    which used to raise ``IndexError`` / ``KeyError``.
    """
    if "basePath" in sw_dict:
        return sw_dict["basePath"]

    # `servers` may be missing, null or an empty array; all mean "no prefix".
    servers = sw_dict.get("servers")
    if servers:
        return servers[0].get("url", "")

    return ""
9699

97100

98101
def check_sw_version(sw_dict: dict) -> None:
@@ -111,74 +114,80 @@ def check_sw_version(sw_dict: dict) -> None:
111114

112115
def get_endpoints(sw_dict: dict) -> dict:  # noqa: C901
    """
    Get all the URLs, together with their description, tags and HTTP method.

    For every entry of ``sw_dict["paths"]`` the first HTTP operation of the
    path item is inspected. Keys of the path item that are not operations
    (``summary``, ``description``, ``parameters``, ``servers``, ``$ref``) are
    ignored instead of being mistaken for a method. Paths without any
    operation, without a ``responses`` section or without a ``200`` response
    are left out of the result.

    Returns a dict sorted by path. Every value holds the keys ``description``,
    ``tags`` and ``method``, plus ``data`` when example data is defined in the
    spec and ``parameters`` when the operation declares parameters.
    """
    # Operation names allowed as keys of a Path Item Object.
    http_methods = (
        "get",
        "put",
        "post",
        "delete",
        "options",
        "head",
        "patch",
        "trace",
    )
    url_details = {}

    check_sw_version(sw_dict)

    for p_k, p_o in sw_dict["paths"].items():
        # An empty path item is parsed as None from YAML.
        method = next(
            (
                candidate
                for candidate in (p_o or {})
                if isinstance(candidate, str) and candidate.lower() in http_methods
            ),
            None,
        )
        if method is None:
            # nothing callable is defined for this path
            continue

        operation = p_o[method]
        responses = operation.get("responses") or {}

        if "200" in responses:
            base_res = responses["200"]
        elif 200 in responses:
            # if you read a plain yml file the 200 will be an integer
            base_res = responses[200]
        else:
            # the endpoint does not have a 200 response
            continue

        if "description" in operation:
            desc = operation["description"]
        elif "summary" in operation:
            desc = operation["summary"]
        else:  # still testing
            desc = ""

        tags = operation.get("tags", [])

        url_details[p_k] = {"description": desc, "tags": tags, "method": method}

        example_data = check_for_api_example_data(base_res, p_k)
        if example_data:
            url_details[p_k]["data"] = example_data

        # checking whether there are defined parameters to execute the call...
        if "parameters" in operation:
            url_details[p_k]["parameters"] = operation["parameters"]

    return dict(sorted(url_details.items()))
180157

181158

159+
def check_for_api_example_data(base_res: dict, key: str) -> dict:
    """
    Try to determine if example data is defined for the endpoint, and return it.

    ``base_res`` is the response object of the endpoint and ``key`` is the
    endpoint path, used only in the warning message.

    Lookup order:
      1. ``content -> application/json -> example`` (or ``examples`` when
         ``example`` is absent): a dict is returned as is, a list yields its
         first element.
      2. ``content -> text/csv -> schema``.
      3. ``examples -> application/json`` directly on the response, used when
         the response has no ``content`` key.

    An empty dict is returned whenever nothing usable is found. That includes
    an empty examples list, a ``text/csv`` entry without ``schema`` and an
    ``examples`` mapping without ``application/json``, which used to raise
    ``IndexError`` / ``KeyError``.
    """
    if "content" not in base_res:
        if "examples" in base_res:
            return (base_res["examples"] or {}).get("application/json", {})
        return {}

    res_cont = base_res["content"] or {}

    if "application/json" in res_cont:
        json_cont = res_cont["application/json"] or {}
        # `example` takes precedence; `examples` is only read when it is absent
        if "example" in json_cont:
            example = json_cont["example"]
        elif "examples" in json_cont:
            example = json_cont["examples"]
        else:
            return {}

        if isinstance(example, dict):
            return example
        if isinstance(example, list):
            # taking the first example; an empty list means none was given
            return example[0] if example else {}

        logger.warning(
            f"Field in swagger file does not give consistent data --- {key}"
        )
        return {}

    if "text/csv" in res_cont:
        return (res_cont["text/csv"] or {}).get("schema", {})

    return {}
189+
190+
182191
def guessing_url_name(url: str, examples: dict) -> str:
183192
"""
184193
given a url and dict of extracted data, we try to guess a working URL. Example:
@@ -314,12 +323,10 @@ def extract_fields(
314323
return ["contains_a_string"], {"contains_a_string": dict_data[0]}
315324
else:
316325
raise ValueError("unknown format")
317-
if len(dict_data.keys()) > 1:
326+
if len(dict_data) > 1:
318327
# the elements are directly inside the dict
319328
return flatten2list(dict_data), dict_data
320-
dst_key = list(dict_data.keys())[
321-
0
322-
] # the first and unique key is the dataset's name
329+
dst_key = list(dict_data)[0] # the first and unique key is the dataset's name
323330

324331
try:
325332
return flatten2list(dict_data[dst_key]), dict_data[dst_key]

0 commit comments

Comments
 (0)