Skip to content

Commit bf09751

Browse files
This update addresses the "Subprocess failed" error encountered during the automated data import by improving path resolution and configuration loading. (#1848)
* Fixed the download script for india_nss_health_ailments. * Also fixed it to locate util/download_util_script.py.
1 parent 98d3f3d commit bf09751

File tree

1 file changed

+103
-72
lines changed

1 file changed

+103
-72
lines changed

statvar_imports/india_ndap/india_nss_health_ailments/download_script.py

Lines changed: 103 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,14 @@
44
# you may not use this file except in compliance with the License.
55
# You may obtain a copy of the License at
66
#
7-
# https://www.apache.org/licenses/LICENSE-2.0
7+
# https://www.apache.org/licenses/LICENSE-2.0
88
#
99
# Unless required by applicable law or agreed to in writing, software
1010
# distributed under the License is distributed on an "AS IS" BASIS,
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
"""This script downloads data for India NSS Health Ailments from NSS Report No. 556.
15-
16-
This script downloads data from the NDAP API, processes it, and saves it as a
17-
CSV file.
18-
19-
How to run the script:
20-
python3 download_script.py
21-
"""
14+
"""This script downloads data for India NSS Health Ailments from NSS Report No. 556."""
2215

2316
import json
2417
import os
@@ -38,10 +31,37 @@
3831
'gs://unresolved_mcf/india_ndap/NDAP_NSS_Health/latest/download_config.json',
3932
'Input directory where config files are downloaded.')
4033

34+
# --- ROBUST PATH RESOLUTION START ---
_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))


def _add_util_to_path():
    """Adds the repo 'util' directory to sys.path dynamically.

    Checks the two most common repo layouts first (util/ three or two
    levels above this script), then falls back to walking up to five
    parent directories looking for a directory containing
    util/file_util.py. Logs an error if none is found.
    """
    path_3_up = os.path.abspath(os.path.join(_SCRIPT_PATH, '../../../util/'))
    path_2_up = os.path.abspath(os.path.join(_SCRIPT_PATH, '../../util/'))

    if os.path.exists(os.path.join(path_3_up, 'file_util.py')):
        sys.path.append(path_3_up)
    elif os.path.exists(os.path.join(path_2_up, 'file_util.py')):
        sys.path.append(path_2_up)
    else:
        # Fallback: walk up the directory tree, bounded to 5 levels so a
        # misplaced checkout cannot loop to the filesystem root forever.
        curr = _SCRIPT_PATH
        for _ in range(5):
            potential = os.path.join(curr, 'util')
            if os.path.exists(os.path.join(potential, 'file_util.py')):
                sys.path.append(potential)
                return
            curr = os.path.dirname(curr)
        logging.error("Could not find 'util' directory containing file_util.py")


_add_util_to_path()

try:
    from download_util_script import _retry_method
    import file_util
except ImportError as e:
    # Lazy %-style args instead of an f-string: the message is only
    # formatted when actually emitted, matching logging best practice.
    logging.fatal('Import Error: Could not find utility scripts. %s', e)
# --- ROBUST PATH RESOLUTION END ---
64+
4565
_OUTPUT_COLUMNS = [
4666
'srcStateName',
4767
'TRU',
@@ -58,64 +78,77 @@
5878
'Year',
5979
]
6080

81+
def load_config(path: str) -> dict:
    """Loads configuration from GCS or local disk.

    Args:
      path: Either a 'gs://bucket/path/to/blob' URI or a local file path.

    Returns:
      The parsed configuration dictionary.
    """
    if path.startswith('gs://'):
        # Split 'gs://bucket/some/blob' into bucket and blob components in
        # one pass; partition() is robust to blob names containing '/'.
        bucket_name, _, blob_name = path[len('gs://'):].partition('/')
        client = storage.Client()
        blob = client.get_bucket(bucket_name).blob(blob_name)
        # download_as_bytes() replaces the deprecated download_as_string();
        # json.loads accepts bytes directly.
        return json.loads(blob.download_as_bytes())
    return file_util.file_load_py_dict(path)
6190

6291
def download_data(config_file_path: str) -> Tuple[List[Tuple], str]:
    """Downloads and returns raw JSON rows from the paginated NDAP API.

    Pages through the API (via pageno=N query param) until a page comes
    back with no 'Data' entries.

    Args:
      config_file_path: GCS or local path to the download config; must
        contain a 'url' key with the base API URL.

    Returns:
      A tuple (rows, output_dir) where rows is a list of value tuples
      matching _OUTPUT_COLUMNS, and output_dir is always '' because
      output is saved next to the script rather than to the config's
      'input_files' location.
    """
    file_config = load_config(config_file_path)
    url = file_config.get('url')
    # We ignore the 'input_files' from config to save in the current directory
    output_dir = ''

    if not url:
        return [], ''

    all_data = []
    page_num = 1
    while True:
        api_url = f'{url}&pageno={page_num}'
        response = _retry_method(api_url, None, 3, 5, 2)
        if not response:
            # absl's logging.fatal aborts the process; the break below is a
            # defensive guard so a non-absl logger cannot fall through to
            # response.json() on a missing response.
            logging.fatal('Failed to retrieve data from page %d', page_num)
            break

        try:
            response_data = response.json()
        except json.JSONDecodeError:
            logging.error('Failed to parse JSON from page %d', page_num)
            break

        if response_data and 'Data' in response_data and response_data['Data']:
            for item in response_data['Data']:
                # 'Year' looks like 'NSS 71st Round, 2014'; keep the last
                # comma-separated token as the numeric start year.
                year = item['Year'].split(',')[-1].strip()
                row = (
                    item['StateName'],
                    item['TRU'],
                    item['D7300_3'],
                    item['D7300_4'],
                    item['D7300_5'],
                    item['I7300_6']['TotalPopulationWeight'],
                    item['I7300_7']['avg'],
                    item['I7300_8']['avg'],
                    year,
                    str(int(year) + 1),
                    year,
                    item['Year'],
                )
                all_data.append(row)
            page_num += 1
        else:
            logging.info('No more data found on page %d.', page_num)
            break

    return all_data, output_dir
117139

118140
def preprocess_and_save(data: List[Tuple], output_dir: str) -> None:
    """Converts row tuples to a DataFrame and writes a CSV next to this script.

    Args:
      data: Row tuples matching _OUTPUT_COLUMNS.
      output_dir: Unused; output is always written to the script directory.
    """
    if not data:
        logging.info('No data was retrieved from the API.')
        return

    frame = pd.DataFrame(data, columns=_OUTPUT_COLUMNS)

    # Save directly in _SCRIPT_PATH (statvar_imports/india_ndap/india_nss_health_ailments)
    target = os.path.join(_SCRIPT_PATH, 'india_nss_health_ailments.csv')
    frame.to_csv(target, index=False)
    logging.info('Data saved to %s', target)
119152
"""Converts data to a DataFrame and saves it as a CSV file.
120153
121154
Args:
@@ -135,11 +168,9 @@ def preprocess_and_save(data: List[Tuple], output_dir: str) -> None:
135168

136169

137170
def main(_) -> None:
    """Main function to download, process, and save the data."""
    raw_data, output_dir = download_data(_FLAGS.config_file_path)
    if raw_data:
        preprocess_and_save(raw_data, output_dir)


if __name__ == '__main__':
    app.run(main)

0 commit comments

Comments
 (0)