# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14- """This script downloads data for India NSS Health Ailments from NSS Report No. 556.
15-
16- This script downloads data from the NDAP API, processes it, and saves it as a
17- CSV file.
18-
19- How to run the script:
20- python3 download_script.py
21- """
14+ """This script downloads data for India NSS Health Ailments from NSS Report No. 556."""
2215
2316import json
2417import os
3831 'gs://unresolved_mcf/india_ndap/NDAP_NSS_Health/latest/download_config.json' ,
3932 'Input directory where config files are downloaded.' )
4033
# --- ROBUST PATH RESOLUTION START ---
# Absolute directory containing this script; used both to locate the repo
# 'util' directory and as the directory the output CSV is written to.
_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
def _add_util_to_path():
    """Adds the repo 'util' directory (containing file_util.py) to sys.path.

    Checks the two known fixed repo layouts first, then walks up from the
    script directory as a fallback. Appends the first matching directory to
    sys.path; logs an error if none is found.
    """
    # Fixed candidates in precedence order, then a bounded walk up the tree.
    candidates = [
        os.path.abspath(os.path.join(_SCRIPT_PATH, '../../../util/')),
        os.path.abspath(os.path.join(_SCRIPT_PATH, '../../util/')),
    ]
    curr = _SCRIPT_PATH
    for _ in range(5):
        candidates.append(os.path.join(curr, 'util'))
        curr = os.path.dirname(curr)

    for candidate in candidates:
        # A directory only counts if it actually holds file_util.py.
        if os.path.exists(os.path.join(candidate, 'file_util.py')):
            sys.path.append(candidate)
            return
    logging.error("Could not find 'util' directory containing file_util.py")
55+
# Make the shared util helpers importable before the project imports below.
_add_util_to_path()

try:
    from download_util_script import _retry_method
    import file_util
except ImportError as e:
    # logging.fatal aborts the process; the script cannot run without these
    # helpers. Lazy %-formatting per logging convention.
    logging.fatal('Import Error: Could not find utility scripts. %s', e)
# --- ROBUST PATH RESOLUTION END ---
64+
4565_OUTPUT_COLUMNS = [
4666 'srcStateName' ,
4767 'TRU' ,
5878 'Year' ,
5979]
6080
def load_config(path: str) -> dict:
    """Reads the download config from a gs:// URI or a local file path."""
    if not path.startswith('gs://'):
        # Local (or file_util-supported) path: delegate to the shared helper.
        return file_util.file_load_py_dict(path)
    # gs://<bucket>/<object>: element 2 of the split is the bucket name and
    # everything after it is the object name.
    parts = path.split('/')
    bucket_name = parts[2]
    blob_name = '/'.join(parts[3:])
    blob = storage.Client().get_bucket(bucket_name).blob(blob_name)
    return json.loads(blob.download_as_string())
6190
def _build_row(item: dict) -> Tuple:
    """Flattens one NDAP API record into a tuple matching _OUTPUT_COLUMNS."""
    # 'Year' ends with the calendar year after the last comma; keep that part.
    year = item['Year'].split(',')[-1].strip()
    return (
        item['StateName'],
        item['TRU'],
        item['D7300_3'],
        item['D7300_4'],
        item['D7300_5'],
        item['I7300_6']['TotalPopulationWeight'],
        item['I7300_7']['avg'],
        item['I7300_8']['avg'],
        year,
        str(int(year) + 1),
        year,
        item['Year'],
    )


def download_data(config_file_path: str) -> Tuple[List[Tuple], str]:
    """Downloads all pages of NDAP data described by the config file.

    Args:
        config_file_path: GCS or local path to the download config; the config
            must contain a 'url' key with the base API URL.

    Returns:
        A tuple (rows, output_dir) where rows is a list of tuples ordered like
        _OUTPUT_COLUMNS and output_dir is '' (output is written next to the
        script rather than to the config's 'input_files' directory).
    """
    file_config = load_config(config_file_path)
    url = file_config.get('url')
    # The config's 'input_files' value is deliberately ignored so the CSV is
    # saved in the script directory.
    output_dir = ''

    if not url:
        return [], ''

    all_data = []
    page_num = 1
    while True:
        api_url = f'{url}&pageno={page_num}'
        # 3 attempts, 5s base delay, backoff factor 2.
        response = _retry_method(api_url, None, 3, 5, 2)
        if not response:
            # logging.fatal aborts the process, so execution stops here.
            logging.fatal('Failed to retrieve data from page %d', page_num)

        try:
            response_data = response.json()
        except json.JSONDecodeError:
            logging.error('Failed to parse JSON from page %d', page_num)
            break

        page_items = response_data.get('Data') if response_data else None
        if not page_items:
            # Empty 'Data' means we have paged past the last record.
            logging.info('No more data found on page %d.', page_num)
            break
        for item in page_items:
            all_data.append(_build_row(item))
        page_num += 1

    return all_data, output_dir
117139
def preprocess_and_save(data: List[Tuple], output_dir: str) -> None:
    """Converts the downloaded rows to a DataFrame and writes a CSV.

    Args:
        data: Rows as tuples ordered like _OUTPUT_COLUMNS.
        output_dir: Unused; the CSV is always written next to this script.
    """
    if not data:
        logging.info('No data was retrieved from the API.')
        return

    df = pd.DataFrame(data, columns=_OUTPUT_COLUMNS)

    # Save directly in _SCRIPT_PATH
    # (statvar_imports/india_ndap/india_nss_health_ailments).
    output_path = os.path.join(_SCRIPT_PATH, 'india_nss_health_ailments.csv')
    df.to_csv(output_path, index=False)
    logging.info('Data saved to %s', output_path)
136169
def main(_) -> None:
    """Main function to download, process, and save the data."""
    raw_data, output_dir = download_data(_FLAGS.config_file_path)
    if raw_data:
        preprocess_and_save(raw_data, output_dir)
143174
if __name__ == '__main__':
    # absl's app.run parses command-line flags, then invokes main.
    app.run(main)