
Commit b600cc9

new classes (to be consolidated later)
To make it easier to map datasets to collections and their parent hierarchy, we now generate a JSON file that helps with the lookup. This will be run BEFORE the migration script.
1 parent 21629c7 commit b600cc9
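
For reference, the generated collections_datasets.json maps each v1 collection id to the list of dataset ids it contains, and the lookup helper in this commit inverts that into a dataset-to-collection map. A minimal sketch of the expected shape (the ids are hypothetical placeholders):

    # collections_datasets.json, roughly:
    # {"collectionA": ["dataset1", "dataset2"], "collectionB": ["dataset3"]}
    data = {"collectionA": ["dataset1", "dataset2"], "collectionB": ["dataset3"]}
    dataset_to_collection = {
        ds: coll for coll, datasets in data.items() for ds in datasets
    }
    assert dataset_to_collection["dataset2"] == "collectionA"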

File tree

4 files changed: +275 additions, -23 deletions
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+import os
+from datetime import datetime
+import json
+import requests
+from dotenv import dotenv_values
+
+try:
+    import tomllib  # Python 3.11+
+except ImportError:
+    import tomli as tomllib
+
+
+DEFAULT_PASSWORD = "Password123&"
+
+# Get the current timestamp
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+COLLECTIONS_FILE = "collections_datasets.json"
+
+def get_dataset_collections_map():
+    print("Getting collections and datasets from Clowder v1...")
+
+    with open(COLLECTIONS_FILE, "r") as jf:
+        data = json.load(jf)
+    print(f"Loaded {len(data)} collections from {COLLECTIONS_FILE}")
+    dataset_to_collection = {}
+
+    # Invert collection -> [dataset ids] into dataset id -> collection id
+    for collection, datasets in data.items():
+        for dataset in datasets:
+            dataset_to_collection[dataset] = collection
+    return dataset_to_collection
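
A hedged usage sketch for the helper above, assuming collections_datasets.json is already present in the working directory (the dataset id below is a hypothetical placeholder):

    mapping = get_dataset_collections_map()
    # Which collection does this dataset belong to?
    dataset_id = "hypothetical-dataset-id"
    print(mapping.get(dataset_id, "not found in any collection"))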
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+import os
+from datetime import datetime
+
+import requests
+from dotenv import dotenv_values
+
+try:
+    import tomllib  # Python 3.11+
+except ImportError:
+    import tomli as tomllib
+
+
+from scripts.migration.migrate_metadata_definitions import (
+    check_metadata_definition_exists,
+    get_clowder_v1_metadata_definitions,
+    post_metadata_definition,
+)
+
+# Configuration and Constants
+DEFAULT_PASSWORD = "Password123&"
+
+# Get the current timestamp
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+OUTPUT_FILE = "collections_ids.txt"
+
+# Load environment variables
+path_to_env = os.path.join(os.getcwd(), "scripts", "migration", ".env")
+config = dotenv_values(dotenv_path=path_to_env)
+
+CLOWDER_V1 = config["CLOWDER_V1"]
+ADMIN_KEY_V1 = config["ADMIN_KEY_V1"]
+CLOWDER_V2 = config["CLOWDER_V2"]
+ADMIN_KEY_V2 = config["ADMIN_KEY_V2"]
+
+if not CLOWDER_V1 or not ADMIN_KEY_V1 or not CLOWDER_V2 or not ADMIN_KEY_V2:
+    print("MISSING SOME ENVIRONMENT VARIABLES")
+else:
+    print("WE HAVE THEM ALL")
+
+base_headers_v1 = {"X-API-key": ADMIN_KEY_V1}
+base_headers_v2 = {"X-API-key": ADMIN_KEY_V2}
+
+clowder_headers_v1 = {
+    **base_headers_v1,
+    "Content-type": "application/json",
+    "accept": "application/json",
+}
+
+clowder_headers_v2 = {
+    **base_headers_v2,
+    "Content-type": "application/json",
+    "accept": "application/json",
+}
+
+admin_user = {
+    "email": "[email protected]",
+    "password": "admin",
+    "first_name": "admin",
+    "last_name": "admin",
+}
+
+def get_clowder_v1_top_level_collections(headers):
+    endpoint = f"{CLOWDER_V1}/api/collections/topLevelCollections?superAdmin=true"
+    response = requests.get(endpoint, headers=headers)
+    user_collections = response.json()
+    return user_collections
+
+def get_collection_v1_descendants(headers, collection_id):
+    descendant_ids = []
+
+    collection_endpoint = f"{CLOWDER_V1}/api/collections/{collection_id}"
+    response = requests.get(collection_endpoint, headers=headers, verify=False)
+    collection_json = response.json()
+    print(collection_json["child_collection_ids"])
+    if int(collection_json["childCollectionsCount"]) > 0:
+        # child_collection_ids is a "List(id1, id2)"-style string:
+        # drop the "List(" prefix and ")" suffix, then split on ", "
+        child_collections_ids = collection_json["child_collection_ids"]
+        descendant_ids = child_collections_ids[5:-1].split(', ')
+        for i in range(0, len(descendant_ids)):
+            id = descendant_ids[i]
+            descendent_endpoint = f"{CLOWDER_V1}/api/collections/{id}"
+            descendent_response = requests.get(descendent_endpoint, headers=headers, verify=False)
+            descendent_json = descendent_response.json()
+            if int(descendent_json["childCollectionsCount"]) > 0:
+                sub_descendants = get_collection_v1_descendants(headers, id)
+                descendant_ids.extend(sub_descendants)
+    return descendant_ids
+
+def get_dataset_ids_in_v1_collection(headers, collection_id):
+    dataset_ids = []
+    collection_endpoint = f"{CLOWDER_V1}/api/collections/{collection_id}/datasets"
+    response = requests.get(collection_endpoint, headers=headers, verify=False)
+    datasets_json = response.json()
+    for dataset in datasets_json:
+        dataset_ids.append(dataset["id"])
+    return dataset_ids
+
+if __name__ == "__main__":
+    top_level_collections = get_clowder_v1_top_level_collections(clowder_headers_v1)
+    all_v1_collections = []
+    for collection in top_level_collections:
+        print(f"Getting descendants for collection {collection['name']} ({collection['id']})")
+        all_v1_collections.append(collection["id"])
+        if int(collection["childCollectionsCount"]) > 0:
+            descendant_ids = get_collection_v1_descendants(clowder_headers_v1, collection["id"])
+            all_v1_collections.extend(descendant_ids)
+            print(f"Added descendants for collection {collection['name']} ({collection['id']})")
+
+    print(f"TOTAL V1 COLLECTIONS TO MIGRATE: {len(all_v1_collections)}")
+
+    with open(OUTPUT_FILE, "w") as outfile:
+        for v1_collection in all_v1_collections:
+            outfile.write(v1_collection + "\n")
+    print(f"Done. Collection ids written to {OUTPUT_FILE}")
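
One detail worth flagging in get_collection_v1_descendants above: child_collection_ids appears to come back from the v1 API as a "List(...)"-style string rather than a JSON array, which is why the code slices off the first five characters ("List(") and the trailing ")" before splitting. A small illustration under that assumption (the ids are hypothetical):

    raw = "List(68a34b28e4b0cc7386c091a4, 68a34b28e4b0cc7386c091a5)"
    ids = raw[5:-1].split(', ')
    print(ids)  # ['68a34b28e4b0cc7386c091a4', '68a34b28e4b0cc7386c091a5']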
Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+import os
+from datetime import datetime
+import json
+import requests
+from dotenv import dotenv_values
+
+try:
+    import tomllib  # Python 3.11+
+except ImportError:
+    import tomli as tomllib
+
+
+DEFAULT_PASSWORD = "Password123&"
+
+# Get the current timestamp
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+COLLECTIONS_FILE = "collections_ids.txt"
+
+# Load environment variables
+path_to_env = os.path.join(os.getcwd(), "scripts", "migration", ".env")
+config = dotenv_values(dotenv_path=path_to_env)
+
+CLOWDER_V1 = config["CLOWDER_V1"]
+ADMIN_KEY_V1 = config["ADMIN_KEY_V1"]
+CLOWDER_V2 = config["CLOWDER_V2"]
+ADMIN_KEY_V2 = config["ADMIN_KEY_V2"]
+
+if not CLOWDER_V1 or not ADMIN_KEY_V1 or not CLOWDER_V2 or not ADMIN_KEY_V2:
+    print("MISSING SOME ENVIRONMENT VARIABLES")
+else:
+    print("WE HAVE THEM ALL")
+
+base_headers_v1 = {"X-API-key": ADMIN_KEY_V1}
+base_headers_v2 = {"X-API-key": ADMIN_KEY_V2}
+
+clowder_headers_v1 = {
+    **base_headers_v1,
+    "Content-type": "application/json",
+    "accept": "application/json",
+}
+
+clowder_headers_v2 = {
+    **base_headers_v2,
+    "Content-type": "application/json",
+    "accept": "application/json",
+}
+
+admin_user = {
+    "email": "[email protected]",
+    "password": "admin",
+    "first_name": "admin",
+    "last_name": "admin",
+}
+
+def get_collections_datasets(headers, collection_id):
+    collection_dataset_endpoint = (
+        f"{CLOWDER_V1}/api/collections/{collection_id}/datasets?superAdmin=true"
+    )
+    collection_dataset_response = requests.get(
+        collection_dataset_endpoint, headers=headers
+    )
+    collection_dataset_json = collection_dataset_response.json()
+    return collection_dataset_json
+
+
+if __name__ == "__main__":
+    print("Getting collections and datasets from Clowder v1...")
+
+    collection_ids = []
+    if os.path.exists(COLLECTIONS_FILE):
+        print(f"{COLLECTIONS_FILE} exists")
+    else:
+        print(f"{COLLECTIONS_FILE} does not exist")
+
+    with open(COLLECTIONS_FILE, "r") as infile:
+        for line in infile.readlines():
+            collection_ids.append(line.rstrip('\n'))
+    print(f"Found {len(collection_ids)} collections in {COLLECTIONS_FILE}")
+    collection_dataset_dict = dict()
+    for id in collection_ids:
+        print(f"Getting datasets for collection id {id}...")
+        datasets = get_collections_datasets(clowder_headers_v1, id)
+        if len(datasets) > 0:
+            dataset_ids = []
+            for ds in datasets:
+                dataset_ids.append(ds["id"])
+            collection_dataset_dict[id] = dataset_ids
+
+    json_file = "collections_datasets.json"
+    with open(json_file, "w") as jf:
+        json.dump(collection_dataset_dict, jf)
+    print(f"Dumped collection-to-dataset map to {json_file}")
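
Taken together, the intended run order appears to be: dump collection ids to collections_ids.txt (previous file), build collections_datasets.json from them (this file), and only then run migrate.py. A quick sanity check of the generated JSON, assuming the file names used above:

    import json

    with open("collections_datasets.json", "r") as jf:
        mapping = json.load(jf)
    print(f"{len(mapping)} collections have at least one dataset")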

scripts/migration/migrate.py

Lines changed: 31 additions & 23 deletions
@@ -156,10 +156,13 @@ def get_collection_v1_descendants(headers, collection_id):
         child_collections_ids = collection_json["child_collection_ids"]
         descendant_ids = child_collections_ids[5:-1].split(', ')
         for id in descendant_ids:
-            sub_descendants = get_collection_v1_descendants(headers, id)
-            descendant_ids.extend(sub_descendants)
+            descendent_endpoint = f"{CLOWDER_V1}/api/collections/{id}"
+            descendent_response = requests.get(descendent_endpoint, headers=headers, verify=False)
+            descendent_json = descendent_response.json()
+            if int(descendent_json["childCollectionsCount"]) > 0:
+                sub_descendants = get_collection_v1_descendants(headers, id)
+                descendant_ids.extend(sub_descendants)
     return descendant_ids
-    print('we got collection')
 
 def get_clowder_v1_user_collections(headers, user_v1):
     endpoint = f"{CLOWDER_V1}/api/collections"
@@ -169,23 +172,10 @@ def get_clowder_v1_user_collections(headers, user_v1):
 # TODO this is too slow, we need to optimize it
 def get_clowder_v1_dataset_collections(headers, user_v1, dataset_id):
     matching_collections = []
-    endpoint1 = f"{CLOWDER_V1}/api/collections/rootCollections?superAdmin=true"
-    # use this one below
-    endpint2 = f"{CLOWDER_V1}/api/collections/topLevelCollections?superAdmin=true"
-    response = requests.get(endpoint1, headers=headers)
-    response2 = requests.get(endpint2, headers=headers)
+    endpoint = f"{CLOWDER_V1}/api/collections/topLevelCollections?superAdmin=true"
+    response = requests.get(endpoint, headers=headers)
     user_collections = response.json()
-    user_collections_ids = []
-    user_collections_ids_2 = []
-    user_collections_2 = response2.json()
-    for collection in user_collections_2:
-        id = collection['id']
-        descendants = get_collection_v1_descendants(headers, id)
-        # test_descendants = get_collection_v1_descendants(headers, "68a34b28e4b0cc7386c091a4")
-        # TODO check here if the dataset is in a descendant
-        print('got descendants')
-    for collection in user_collections:
-        user_collections_ids.append(collection['id'])
+
     for collection in user_collections:
         collection_id = collection["id"]
         collection_dataset_endpoint = (
@@ -198,9 +188,24 @@ def get_clowder_v1_dataset_collections(headers, user_v1, dataset_id):
             datasets = dataset_response.json()
             for ds in datasets:
                 if ds["id"] == dataset_id:
-                    matching_collections.append(collection)
+                    if collection not in matching_collections:
+                        matching_collections.append(collection)
         except Exception as e:
             print("Exception", e)
+        if int(collection["childCollectionsCount"]) > 0:
+            collection_descendants = get_collection_v1_descendants(headers, collection_id)
+            for descendant in collection_descendants:
+                collection_dataset_endpoint = (
+                    f"{CLOWDER_V1}/api/collections/{descendant}/datasets?superAdmin=true"
+                )
+                collection_dataset_response = requests.get(
+                    collection_dataset_endpoint, headers=headers
+                )
+                collection_dataset_json = collection_dataset_response.json()
+                for ds in collection_dataset_json:
+                    if ds['id'] == dataset_id:
+                        if descendant not in matching_collections:
+                            matching_collections.append(descendant)
     return matching_collections
 

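One caveat in the hunk above: the top-level loop appends full collection objects to matching_collections, while the new descendant branch appends bare id strings, so callers receive a mixed list. If downstream code only needs ids, a hypothetical normalization could look like:

    def to_collection_id(entry):
        # Accepts either a v1 collection dict (assumed to carry an "id" key,
        # as elsewhere in these scripts) or a bare id string.
        return entry["id"] if isinstance(entry, dict) else entry

    matching_ids = [to_collection_id(c) for c in matching_collections]
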
@@ -826,9 +831,12 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
             if file_v2_id is not None:
                 add_file_metadata(file, file_v2_id, clowder_headers_v1, user_headers_v2)
             # posting the collection hierarchy as metadata
-            collection_space_metadata_dict = build_collection_space_metadata_for_v1_dataset(
-                dataset=dataset, user_v1=user_v1, headers=clowder_headers_v1
-            )
+            try:
+                collection_space_metadata_dict = build_collection_space_metadata_for_v1_dataset(
+                    dataset=dataset, user_v1=user_v1, headers=clowder_headers_v1
+                )
+            except Exception as e:
+                print(e)
             migration_extractor_collection_metadata = {
                 "listener": {
                     "name": "migration",

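A note on the new try/except: if build_collection_space_metadata_for_v1_dataset raises, collection_space_metadata_dict is never bound, and any later read of it would presumably fail with a NameError. A defensive variant (a sketch only, keeping the commit's names):

    collection_space_metadata_dict = {}  # safe default if the lookup fails
    try:
        collection_space_metadata_dict = build_collection_space_metadata_for_v1_dataset(
            dataset=dataset, user_v1=user_v1, headers=clowder_headers_v1
        )
    except Exception as e:
        print(e)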