11import os
22import re
3+ import sys
34import json
45import shutil
6+ import logging
57import requests
68
79from typing import Optional , Dict
1820 'extract_schemas' , 'CLOUDFLARE_DOCS_URL' , 'CLOUDFLARE_SCHEMAS_URL' ,
1921]
2022
# Module-level logger. It is configured at import time to emit INFO-and-above
# records to stdout, so DEBUG-level messages elsewhere in this module are
# suppressed unless the level is lowered by the caller.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Stream handler bound to stdout with a "[timestamp] [LEVEL] message" layout.
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter('[%(asctime)s] [%(levelname)s] %(message)s'))
log.addHandler(handler)

# Base URL of the Cloudflare API documentation site; can be overridden through
# the NX_CLOUDFLARE_DOCS_URL environment variable.
CLOUDFLARE_DOCS_URL = os.environ.get('NX_CLOUDFLARE_DOCS_URL', 'https://api.cloudflare.com/')

# Prefix of the schema URLs; can be overridden through the
# NX_CLOUDFLARE_SCHEMAS_URL environment variable. The default is derived from
# the docs URL with exactly one "/" between the two parts and no embedded
# whitespace, so a docs URL given without a trailing slash still works.
CLOUDFLARE_SCHEMAS_URL = os.environ.get(
    'NX_CLOUDFLARE_SCHEMAS_URL',
    CLOUDFLARE_DOCS_URL.rstrip('/') + '/schemas/v4/',
)
2331
@@ -31,23 +39,30 @@ def _fetch_text(
3139 cache_path : Optional [str ] = None ,
3240 invalidate : bool = False
3341) -> str :
42+ log .info ('Fetching %s' , url )
3443 if cache_path and not invalidate :
3544 if os .path .exists (cache_path ):
45+ log .info ('Reading cache from %s' , cache_path )
3646 with open (cache_path ) as cache_file :
3747 return cache_file .read ()
3848 plain_text = requests .get (url ).text
3949 if cache_path :
50+ log .info ('Writing cache to %s' , cache_path )
4051 with open (cache_path , 'w' ) as cache_file :
4152 cache_file .write (plain_text )
4253 return plain_text
4354
4455
def _process_docs(text: str, *, base_url: str) -> Dict:
    """Extract the section titles and the app bundle URL from the docs HTML.

    Args:
        text: HTML source of the documentation page.
        base_url: URL the bundle's ``src`` attribute is resolved against.

    Returns:
        A dict with two keys: ``'sections'``, the text of every ``<h2>`` found
        under ``<article class="...api-section...">/<header>``, and
        ``'app_url'``, the absolute URL of the first ``<script>`` whose ``src``
        contains ``apidocs-static/app-``.

    Raises:
        IndexError: If the page contains no matching app bundle ``<script>``.
    """
    log.info('Processing HTML docs...')
    tree = etree.fromstring(text, parser=etree.HTMLParser(huge_tree=True, remove_comments=True))

    sections = tree.xpath('//article[contains(@class, "api-section")]/header/h2/text()')
    app_sources = tree.xpath('//script[contains(@src, "apidocs-static/app-")]/@src')
    if not app_sources:
        # Same exception type a bare ``[0]`` would raise, but with a message
        # that says what was actually missing.
        raise IndexError('No <script> with "apidocs-static/app-" in its src was found in the docs HTML')

    result = {
        'sections': sections,
        'app_url': urljoin(base_url, app_sources[0]),
    }
    log.info('Sections found: %d', len(result['sections']))
    log.info('App bundle URL: %s', result['app_url'])
    return result
5166
5267
5368class _PyVisitor (NodeVisitor ):
@@ -86,6 +101,7 @@ def visit_dict(self, obj):
86101
87102
88103def _process_app (code : str ) -> Dict :
104+ log .info ('Processing app bundle...' )
89105 result = {}
90106 visitor , obj_pos = _PyVisitor (), 0
91107 while section := RE_APP_SECTION .search (code , pos = obj_pos ):
@@ -99,6 +115,7 @@ def _process_app(code: str) -> Dict:
99115 .replace ('/' , '.' )
100116 .replace ('-' , '_' )
101117 )
118+ log .debug ('Found %s' , py_obj ['id' ])
102119 result [py_id ] = py_obj
103120 return result
104121
@@ -112,6 +129,7 @@ def extract_schemas(
112129 app_url : Optional [str ] = None ,
113130 base_url : Optional [str ] = None ,
114131) -> Dict :
132+ log .info ('Extracting schemas...' )
115133 docs_info = {}
116134 if not app_url :
117135 base_url = base_url or CLOUDFLARE_DOCS_URL
@@ -120,6 +138,7 @@ def extract_schemas(
120138
121139 api_schemas = _process_app (_fetch_text (app_url ))
122140 if verify and docs_info :
141+ log .info ('Checking integrity...' )
123142 # TODO: Do we have additional ways to verify the integrity?
124143 assert len (docs_info ['sections' ]) == len (api_schemas )
125144
@@ -132,11 +151,14 @@ def extract_schemas(
132151 if output_path :
133152 output_path = Path (output_path )
134153 if remove_existing and output_path .exists ():
154+ log .info ('Deleting directory %s (--remove-existing)' , output_path )
135155 shutil .rmtree (output_path )
136156
157+ log .info ('Serializing schemas to %s' , output_path )
137158 for _ , schema in api_schemas .items ():
138159 file_path = output_path / schema ['id' ].replace (CLOUDFLARE_SCHEMAS_URL , '' )
139160 os .makedirs (file_path .parent , exist_ok = True )
161+ log .debug ('Writing %s' , file_path )
140162 with open (file_path , 'w' ) as json_file :
141163 json .dump ({
142164 ** schema ,
@@ -148,6 +170,10 @@ def extract_schemas(
148170
149171
if __name__ == '__main__':
    # Anchor the output directory to this file's location (the "schemas"
    # directory next to this script's parent directory) so the result does not
    # depend on the current working directory.
    schemas_dir = Path(__file__).resolve().parent.parent / 'schemas'

    # Regenerate every schema file from scratch; any existing output directory
    # is removed first (remove_existing=True).
    all_schemas = extract_schemas(output_path=schemas_dir, remove_existing=True)

    # Also dump the complete mapping returned by extract_schemas into a single
    # registry file alongside the individual schema files.
    schemas_path = schemas_dir / 'schemas.json'
    log.info('Serializing registry to %s', schemas_path)
    with open(schemas_path, 'w') as schemas_file:
        json.dump(all_schemas, schemas_file, indent=4)
0 commit comments