# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14- """This script downloads data for India NSS Health Ailments from NSS Report No. 556.
15-
16- This script downloads data from the NDAP API, processes it, and saves it as a
17- CSV file.
18-
19- How to run the script:
20- python3 download_script.py
21- """
14+ """This script downloads data for India NSS Health Ailments from NSS Report No. 556."""
2215
2316import json
2417import os
3831 'gs://unresolved_mcf/india_ndap/NDAP_NSS_Health/latest/download_config.json' ,
3932 'Input directory where config files are downloaded.' )
4033
# --- ROBUST PATH RESOLUTION START ---
# Absolute directory containing this script; used both to locate the repo
# 'util' directory and as the directory the output CSV is written to.
_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
def _add_util_to_path():
    """Adds the repo 'util' directory (containing file_util.py) to sys.path.

    Checks the two known fixed repo layouts first, then walks up from the
    script directory as a fallback. Appends the first matching directory to
    sys.path; logs an error if none is found.
    """
    # Fixed candidates in precedence order, then a bounded walk up the tree.
    candidates = [
        os.path.abspath(os.path.join(_SCRIPT_PATH, '../../../util/')),
        os.path.abspath(os.path.join(_SCRIPT_PATH, '../../util/')),
    ]
    curr = _SCRIPT_PATH
    for _ in range(5):
        candidates.append(os.path.join(curr, 'util'))
        curr = os.path.dirname(curr)

    for candidate in candidates:
        # A directory only counts if it actually holds file_util.py.
        if os.path.exists(os.path.join(candidate, 'file_util.py')):
            sys.path.append(candidate)
            return
    logging.error("Could not find 'util' directory containing file_util.py")
55+
# Make the shared util helpers importable before the project imports below.
_add_util_to_path()

try:
    from download_util_script import _retry_method
    import file_util
except ImportError as e:
    # logging.fatal aborts the process; the script cannot run without these
    # helpers. Lazy %-formatting per logging convention.
    logging.fatal('Import Error: Could not find utility scripts. %s', e)
# --- ROBUST PATH RESOLUTION END ---
64+
4565_OUTPUT_COLUMNS = [
4666 'srcStateName' ,
4767 'TRU' ,
5878 'Year' ,
5979]
6080
def load_config(path: str) -> dict:
    """Reads the download config from a gs:// URI or a local file path."""
    if not path.startswith('gs://'):
        # Local (or file_util-supported) path: delegate to the shared helper.
        return file_util.file_load_py_dict(path)
    # gs://<bucket>/<object>: element 2 of the split is the bucket name and
    # everything after it is the object name.
    parts = path.split('/')
    bucket_name = parts[2]
    blob_name = '/'.join(parts[3:])
    blob = storage.Client().get_bucket(bucket_name).blob(blob_name)
    return json.loads(blob.download_as_string())
6190
def _build_row(item: dict) -> Tuple:
    """Flattens one NDAP API record into a tuple matching _OUTPUT_COLUMNS."""
    # 'Year' ends with the calendar year after the last comma; keep that part.
    year = item['Year'].split(',')[-1].strip()
    return (
        item['StateName'],
        item['TRU'],
        item['D7300_3'],
        item['D7300_4'],
        item['D7300_5'],
        item['I7300_6']['TotalPopulationWeight'],
        item['I7300_7']['avg'],
        item['I7300_8']['avg'],
        year,
        str(int(year) + 1),
        year,
        item['Year'],
    )


def download_data(config_file_path: str) -> Tuple[List[Tuple], str]:
    """Downloads all pages of NDAP data described by the config file.

    Args:
        config_file_path: GCS or local path to the download config; the config
            must contain a 'url' key with the base API URL.

    Returns:
        A tuple (rows, output_dir) where rows is a list of tuples ordered like
        _OUTPUT_COLUMNS and output_dir is '' (output is written next to the
        script rather than to the config's 'input_files' directory).
    """
    file_config = load_config(config_file_path)
    url = file_config.get('url')
    # The config's 'input_files' value is deliberately ignored so the CSV is
    # saved in the script directory.
    output_dir = ''

    if not url:
        return [], ''

    all_data = []
    page_num = 1
    while True:
        api_url = f'{url}&pageno={page_num}'
        # 3 attempts, 5s base delay, backoff factor 2.
        response = _retry_method(api_url, None, 3, 5, 2)
        if not response:
            # logging.fatal aborts the process, so execution stops here.
            logging.fatal('Failed to retrieve data from page %d', page_num)

        try:
            response_data = response.json()
        except json.JSONDecodeError:
            logging.error('Failed to parse JSON from page %d', page_num)
            break

        page_items = response_data.get('Data') if response_data else None
        if not page_items:
            # Empty 'Data' means we have paged past the last record.
            logging.info('No more data found on page %d.', page_num)
            break
        for item in page_items:
            all_data.append(_build_row(item))
        page_num += 1

    return all_data, output_dir
117139
def preprocess_and_save(data: List[Tuple], output_dir: str) -> None:
    """Converts the downloaded rows to a DataFrame and writes a CSV.

    Args:
        data: Rows as tuples ordered like _OUTPUT_COLUMNS.
        output_dir: Unused; the CSV is always written next to this script.
    """
    if not data:
        logging.info('No data was retrieved from the API.')
        return

    df = pd.DataFrame(data, columns=_OUTPUT_COLUMNS)

    # Save directly in _SCRIPT_PATH
    # (statvar_imports/india_ndap/india_nss_health_ailments).
    output_path = os.path.join(_SCRIPT_PATH, 'india_nss_health_ailments.csv')
    df.to_csv(output_path, index=False)
    logging.info('Data saved to %s', output_path)
136169
def main(_) -> None:
    """Main function to download, process, and save the data."""
    raw_data, output_dir = download_data(_FLAGS.config_file_path)
    if raw_data:
        preprocess_and_save(raw_data, output_dir)
143174
if __name__ == '__main__':
    # absl's app.run parses command-line flags, then invokes main.
    app.run(main)