diff --git a/metadata-ingestion/docs/sources/openapi/openapi.md b/metadata-ingestion/docs/sources/openapi/openapi.md index b3231b018bdde..b661d5e14568d 100644 --- a/metadata-ingestion/docs/sources/openapi/openapi.md +++ b/metadata-ingestion/docs/sources/openapi/openapi.md @@ -2,11 +2,11 @@ The dataset metadata should be defined directly in the Swagger file, section `[" ## Capabilities -The plugin read the swagger file where the endopints are defined and searches for the ones which accept -a `GET` call: those are the ones supposed to give back the datasets. +This plugin reads the swagger file where the endpoints are defined, reads example data if provided (for any method), or searches for +data for the endpoints which do not have example data and accept a `GET` call. For every selected endpoint defined in the `paths` section, -the tool searches whether the medatada are already defined in there. +the tool searches whether the metadata are already defined. As example, if in your swagger file there is the `/api/users/` defined as follows: ```yaml @@ -27,7 +27,7 @@ paths: then this plugin has all the information needed to create the dataset in DataHub. -In case there is no example defined, the plugin will try to get the metadata directly from the endpoint. +In case there is no example defined, the plugin will try to get the metadata directly from the endpoint, if it is a `GET` method. So, if in your swagger file you have ```yaml @@ -42,7 +42,7 @@ paths: description: Return the list of colors ``` -the tool will make a `GET` call to `https:///test_endpoint.com/colors` +the tool will make a `GET` call to `https://test_endpoint.com/colors` and parse the response obtained. ### Automatically recorded examples @@ -53,7 +53,7 @@ Sometimes you can have an endpoint which wants a parameter to work, like Since in the OpenApi specifications the listing endpoints are specified just before the detailed ones, in the list of the paths, you will find - https:///test_endpoint.com/colors + https://test_endpoint.com/colors defined before @@ -80,7 +80,7 @@ and this last URL will be called to get back the needed metadata. If no useful example is found, a second procedure will try to guess a numerical ID. So if we have: - https:///test_endpoint.com/colors/{colorID} + https://test_endpoint.com/colors/{colorID} and there is no `colorID` example already found by the plugin, it will try to put a number one (1) at the parameter place @@ -120,8 +120,8 @@ paths: description: Return details about the group ``` -and the plugin did not found an example in its previous calls, -so the tool have no idea about what substitute to the `{name}` part. +and the plugin did not find an example in its previous calls, +the tool has no idea about what to substitute for the `{name}` part. By specifying in the configuration file diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py index 78570a2a4ceca..42924a09a39e9 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py @@ -246,6 +246,12 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"]) dataset_snapshot.aspects.append(schema_metadata) yield self.build_wu(dataset_snapshot, dataset_name) + elif endpoint_dets["method"] != "get": + self.report.report_warning( + key=endpoint_k, + reason=f"No example provided for {endpoint_dets['method']}", + ) + continue # Only test endpoints if they're GETs elif ( "{" not in endpoint_k ): # if the API does not explicitly require parameters diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py index 14597618a9d51..1ab40bc8be73d 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py @@ -89,10 +89,13 @@ def get_swag_json( def get_url_basepath(sw_dict: dict) -> str: - try: + if "basePath" in sw_dict: return sw_dict["basePath"] - except KeyError: # no base path defined - return "" + if "servers" in sw_dict: + # When the API path doesn't match the OAS path + return sw_dict["servers"][0]["url"] + + return "" def check_sw_version(sw_dict: dict) -> None: @@ -111,74 +114,80 @@ def check_sw_version(sw_dict: dict) -> None: def get_endpoints(sw_dict: dict) -> dict: # noqa: C901 """ - Get all the URLs accepting the "GET" method, together with their description and the tags + Get all the URLs, together with their description and the tags """ url_details = {} check_sw_version(sw_dict) for p_k, p_o in sw_dict["paths"].items(): - # will track only the "get" methods, which are the ones that give us data - if "get" in p_o.keys(): - if "200" in p_o["get"]["responses"].keys(): - base_res = p_o["get"]["responses"]["200"] - elif 200 in p_o["get"]["responses"].keys(): - # if you read a plain yml file the 200 will be an integer - base_res = p_o["get"]["responses"][200] - else: - # the endpoint does not have a 200 response - continue - - if "description" in p_o["get"].keys(): - desc = p_o["get"]["description"] - elif "summary" in p_o["get"].keys(): - desc = p_o["get"]["summary"] - else: # still testing - desc = "" - - try: - tags = p_o["get"]["tags"] - except KeyError: - tags = [] - - url_details[p_k] = {"description": desc, "tags": tags} - - # trying if dataset is defined in swagger... - if "content" in base_res.keys(): - res_cont = base_res["content"] - if "application/json" in res_cont.keys(): - ex_field = None - if "example" in res_cont["application/json"]: - ex_field = "example" - elif "examples" in res_cont["application/json"]: - ex_field = "examples" - - if ex_field: - if isinstance(res_cont["application/json"][ex_field], dict): - url_details[p_k]["data"] = res_cont["application/json"][ - ex_field - ] - elif isinstance(res_cont["application/json"][ex_field], list): - # taking the first example - url_details[p_k]["data"] = res_cont["application/json"][ - ex_field - ][0] - else: - logger.warning( - f"Field in swagger file does not give consistent data --- {p_k}" - ) - elif "text/csv" in res_cont.keys(): - url_details[p_k]["data"] = res_cont["text/csv"]["schema"] - elif "examples" in base_res.keys(): - url_details[p_k]["data"] = base_res["examples"]["application/json"] - - # checking whether there are defined parameters to execute the call... - if "parameters" in p_o["get"].keys(): - url_details[p_k]["parameters"] = p_o["get"]["parameters"] + method = list(p_o)[0] + if "200" in p_o[method]["responses"].keys(): + base_res = p_o[method]["responses"]["200"] + elif 200 in p_o[method]["responses"].keys(): + # if you read a plain yml file the 200 will be an integer + base_res = p_o[method]["responses"][200] + else: + # the endpoint does not have a 200 response + continue + + if "description" in p_o[method].keys(): + desc = p_o[method]["description"] + elif "summary" in p_o[method].keys(): + desc = p_o[method]["summary"] + else: # still testing + desc = "" + + try: + tags = p_o[method]["tags"] + except KeyError: + tags = [] + + url_details[p_k] = {"description": desc, "tags": tags, "method": method} + + example_data = check_for_api_example_data(base_res, p_k) + if example_data: + url_details[p_k]["data"] = example_data + + # checking whether there are defined parameters to execute the call... + if "parameters" in p_o[method].keys(): + url_details[p_k]["parameters"] = p_o[method]["parameters"] return dict(sorted(url_details.items())) +def check_for_api_example_data(base_res: dict, key: str) -> dict: + """ + Try to determine if example data is defined for the endpoint, and return it + """ + data = {} + if "content" in base_res.keys(): + res_cont = base_res["content"] + if "application/json" in res_cont.keys(): + ex_field = None + if "example" in res_cont["application/json"]: + ex_field = "example" + elif "examples" in res_cont["application/json"]: + ex_field = "examples" + + if ex_field: + if isinstance(res_cont["application/json"][ex_field], dict): + data = res_cont["application/json"][ex_field] + elif isinstance(res_cont["application/json"][ex_field], list): + # taking the first example + data = res_cont["application/json"][ex_field][0] + else: + logger.warning( + f"Field in swagger file does not give consistent data --- {key}" + ) + elif "text/csv" in res_cont.keys(): + data = res_cont["text/csv"]["schema"] + elif "examples" in base_res.keys(): + data = base_res["examples"]["application/json"] + + return data + + def guessing_url_name(url: str, examples: dict) -> str: """ given a url and dict of extracted data, we try to guess a working URL. Example: @@ -314,12 +323,10 @@ def extract_fields( return ["contains_a_string"], {"contains_a_string": dict_data[0]} else: raise ValueError("unknown format") - if len(dict_data.keys()) > 1: + if len(dict_data) > 1: # the elements are directly inside the dict return flatten2list(dict_data), dict_data - dst_key = list(dict_data.keys())[ - 0 - ] # the first and unique key is the dataset's name + dst_key = list(dict_data)[0] # the first and unique key is the dataset's name try: return flatten2list(dict_data[dst_key]), dict_data[dst_key] diff --git a/metadata-ingestion/tests/unit/test_openapi.py b/metadata-ingestion/tests/unit/test_openapi.py index 64edd7fab2158..42a8a13c1943d 100644 --- a/metadata-ingestion/tests/unit/test_openapi.py +++ b/metadata-ingestion/tests/unit/test_openapi.py @@ -176,6 +176,115 @@ class TestGetEndpoints(unittest.TestCase): ] } } + post: + operationId: updateVersionDetailsv2 + summary: Update API version details + produces: + - application/json + responses: + "200": + description: |- + 200 203 response + examples: + application/json: |- + { + "version": { + "status": "CURRENT", + "updated": "2011-01-21T11:33:21Z", + "media-types": [ + { + "base": "application/xml", + "type": "application/vnd.openstack.compute+xml;version=2" + }, + { + "base": "application/json", + "type": "application/vnd.openstack.compute+json;version=2" + } + ], + "id": "v2.0", + "links": [ + { + "href": "http://127.0.0.1:8774/v2/", + "rel": "self" + }, + { + "href": "http://docs.openstack.org/api/openstack-compute/2/os-compute-devguide-2.pdf", + "type": "application/pdf", + "rel": "describedby" + }, + { + "href": "http://docs.openstack.org/api/openstack-compute/2/wadl/os-compute-2.wadl", + "type": "application/vnd.sun.wadl+xml", + "rel": "describedby" + }, + { + "href": "http://docs.openstack.org/api/openstack-compute/2/wadl/os-compute-2.wadl", + "type": "application/vnd.sun.wadl+xml", + "rel": "describedby" + } + ] + } + } + /v2/updateNoExample: + post: + operationId: updateVersionDetailsNoExample + summary: Show API version details no example output + produces: + - application/json + responses: + "200": + description: |- + 200 203 response + /v2/update: + post: + operationId: updateVersionDetailsv2 + summary: Show API version details + produces: + - application/json + responses: + "200": + description: |- + 200 203 response + examples: + application/json: |- + { + "version": { + "status": "CURRENT", + "updated": "2011-01-21T11:33:21Z", + "media-types": [ + { + "base": "application/xml", + "type": "application/vnd.openstack.compute+xml;version=2" + }, + { + "base": "application/json", + "type": "application/vnd.openstack.compute+json;version=2" + } + ], + "id": "v2.0", + "links": [ + { + "href": "http://127.0.0.1:8774/v2/", + "rel": "self" + }, + { + "href": "http://docs.openstack.org/api/openstack-compute/2/os-compute-devguide-2.pdf", + "type": "application/pdf", + "rel": "describedby" + }, + { + "href": "http://docs.openstack.org/api/openstack-compute/2/wadl/os-compute-2.wadl", + "type": "application/vnd.sun.wadl+xml", + "rel": "describedby" + }, + { + "href": "http://docs.openstack.org/api/openstack-compute/2/wadl/os-compute-2.wadl", + "type": "application/vnd.sun.wadl+xml", + "rel": "describedby" + } + ] + } + } consumes: - application/json """ @@ -359,6 +468,65 @@ class TestGetEndpoints(unittest.TestCase): ] } } + /v2/updateNoExample: + post: + operationId: updateVersionDetailsNoExample + summary: Update API version details + responses: + '200': + description: |- + 200 response + /v2/update: + post: + operationId: updateVersionDetailsv2 + summary: Update API version details + responses: + '200': + description: |- + 200 response + content: + application/json: + examples: + foo: + value: + { + "version": { + "status": "CURRENT", + "updated": "2011-01-21T11:33:21Z", + "media-types": [ + { + "base": "application/xml", + "type": "application/vnd.openstack.compute+xml;version=2" + }, + { + "base": "application/json", + "type": "application/vnd.openstack.compute+json;version=2" + } + ], + "id": "v2.0", + "links": [ + { + "href": "http://127.0.0.1:8774/v2/", + "rel": "self" + }, + { + "href": "http://docs.openstack.org/api/openstack-compute/2/os-compute-devguide-2.pdf", + "type": "application/pdf", + "rel": "describedby" + }, + { + "href": "http://docs.openstack.org/api/openstack-compute/2/wadl/os-compute-2.wadl", + "type": "application/vnd.sun.wadl+xml", + "rel": "describedby" + }, + { + "href": "http://docs.openstack.org/api/openstack-compute/2/wadl/os-compute-2.wadl", + "type": "application/vnd.sun.wadl+xml", + "rel": "describedby" + } + ] + } + } """ def test_get_endpoints_openapi30(self) -> None: @@ -366,19 +534,25 @@ def test_get_endpoints_openapi30(self) -> None: sw_file_raw = yaml.safe_load(self.openapi30) url_endpoints = get_endpoints(sw_file_raw) - self.assertEqual(len(url_endpoints), 2) - d4k = {"data": "", "tags": "", "description": ""} + self.assertEqual(len(url_endpoints), 4) + d4k = {"data": "", "tags": "", "description": "", "method": ""} self.assertEqual(url_endpoints["/"].keys(), d4k.keys()) + self.assertIn("data", url_endpoints["/v2/update"]) + self.assertNotIn("data", url_endpoints["/v2/updateNoExample"]) + def test_get_endpoints_openapi20(self) -> None: """extracting 'get' type endpoints from swagger 2.0 file""" sw_file_raw = yaml.safe_load(self.openapi20) url_endpoints = get_endpoints(sw_file_raw) - self.assertEqual(len(url_endpoints), 2) - d4k = {"data": "", "tags": "", "description": ""} + self.assertEqual(len(url_endpoints), 4) + d4k = {"data": "", "tags": "", "description": "", "method": ""} self.assertEqual(url_endpoints["/"].keys(), d4k.keys()) + self.assertIn("data", url_endpoints["/v2/update"]) + self.assertNotIn("data", url_endpoints["/v2/updateNoExample"]) + class TestExplodeDict(unittest.TestCase): def test_d1(self):